コード例 #1
0
 def test_true_divide(self):
     """``true_divide`` on the RDD must equal ``A / B`` on local arrays."""
     numerator, numerator_rdd = self.make_dense_rdd((8, 3))
     denominator, _ = self.make_dense_rdd((1, 3))
     expected = numerator / denominator
     actual = numerator_rdd.true_divide(denominator).toarray()
     assert_array_equal(actual, expected)
コード例 #2
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
 def test_unblocking_rdd(self):
     """``unblock`` must return a plain RDD of the original elements."""
     source = self.sc.parallelize(np.arange(400), 4)
     blocked = ArrayRDD(source, 5)
     flat = blocked.unblock()

     assert_is_instance(flat, RDD)
     # The first twelve elements come back in their original order.
     expected_head = np.arange(12).tolist()
     assert_array_equal(flat.take(12), expected_head)
コード例 #3
0
 def test_unblocking_rdd(self):
     """Unblocking an ``ArrayRDD`` yields an ``RDD`` of single elements."""
     values = np.arange(400)
     blocked = ArrayRDD(self.sc.parallelize(values, 4), 5)

     unblocked = blocked.unblock()
     assert_is_instance(unblocked, RDD)

     head = unblocked.take(12)
     assert_array_equal(head, np.arange(12).tolist())
コード例 #4
0
 def test_remainder(self):
     """``remainder`` on the RDD must agree with ``np.remainder``."""
     dividend, dividend_rdd = self.make_dense_rdd((8, 3))
     divisor, _ = self.make_dense_rdd((1, 3))
     expected = np.remainder(dividend, divisor)
     actual = dividend_rdd.remainder(divisor).toarray()
     assert_array_equal(actual, expected)
コード例 #5
0
ファイル: test_label.py プロジェクト: ziwei-SUE/sparkit-learn
    def test_same_classes(self):
        """Both label encoders must learn the same ``classes_``."""
        labels, labels_rdd = self.make_dense_randint_rdd(
            low=0, high=10, shape=(1000,))

        sk_encoder = LabelEncoder().fit(labels)
        spark_encoder = SparkLabelEncoder().fit(labels_rdd)

        assert_array_equal(sk_encoder.classes_, spark_encoder.classes_)
コード例 #6
0
    def test_same_fit_transform(self):
        """``fit_transform`` must encode identically locally and on Spark."""
        labels, labels_rdd = self.make_dense_randint_rdd(
            low=0, high=10, shape=(1000,))

        sk_encoder = LabelEncoder()
        spark_encoder = SparkLabelEncoder()

        expected = sk_encoder.fit_transform(labels)
        actual = spark_encoder.fit_transform(labels_rdd).toarray()
        assert_array_equal(expected, actual)
コード例 #7
0
    def test_same_classes(self):
        """The Spark encoder's ``classes_`` must match scikit-learn's."""
        y_local, y_rdd = self.make_dense_randint_rdd(
            low=0, high=10, shape=(1000,)
        )

        reference = LabelEncoder().fit(y_local)
        distributed = SparkLabelEncoder().fit(y_rdd)

        assert_array_equal(reference.classes_, distributed.classes_)
コード例 #8
0
    def test_limit_features(self):
        """Feature-limiting options must prune identically on both sides.

        For each parameter set the local and Spark count vectorizers are
        expected to learn the same vocabulary and to produce the same
        matrix from ``fit_transform`` and from a subsequent ``transform``.
        """
        docs, docs_rdd = self.make_text_rdd()

        param_grid = [
            {'min_df': .5},
            {'min_df': 2, 'max_df': .9},
            {'min_df': 1, 'max_df': .6},
            {'min_df': 2, 'max_features': 3},
        ]

        for kwargs in param_grid:
            sk_vec = CountVectorizer(**kwargs)
            spark_vec = SparkCountVectorizer(**kwargs)

            expected = sk_vec.fit_transform(docs).toarray()
            fitted = spark_vec.fit_transform(docs_rdd).toarray()

            assert_equal(sk_vec.vocabulary_, spark_vec.vocabulary_)
            assert_array_equal(expected, fitted)

            # Applying the already-fitted model must give the same matrix.
            transformed = spark_vec.transform(docs_rdd).toarray()
            assert_array_equal(expected, transformed)
コード例 #9
0
 def test_fmod(self):
     """``fmod`` on the RDD must agree with ``np.fmod``."""
     dividend, dividend_rdd = self.make_dense_rdd((8, 3))
     divisor, _ = self.make_dense_rdd((1, 3))
     expected = np.fmod(dividend, divisor)
     actual = dividend_rdd.fmod(divisor).toarray()
     assert_array_equal(actual, expected)
コード例 #10
0
ファイル: test_label.py プロジェクト: ziwei-SUE/sparkit-learn
    def test_same_fit_transform(self):
        """Spark ``fit_transform`` output must equal scikit-learn's."""
        y_local, y_rdd = self.make_dense_randint_rdd(
            low=0, high=10, shape=(1000,)
        )

        reference = LabelEncoder()
        distributed = SparkLabelEncoder()

        encoded_local = reference.fit_transform(y_local)
        encoded_dist = distributed.fit_transform(y_rdd).toarray()
        assert_array_equal(encoded_local, encoded_dist)
コード例 #11
0
    def test_transform(self):
        """``transform`` must apply the function to every collected block."""
        _, X_rdd = self.make_dense_rdd((100, 4))

        def square(block_):
            return block_**2

        # Expectation: the same function applied to each block locally.
        expected = [square(block_) for block_ in X_rdd.collect()]
        actual = X_rdd.transform(square).collect()

        assert_array_equal(expected, actual)
コード例 #12
0
ファイル: test_text.py プロジェクト: HendryLi/sparkit-learn
    def test_same_output(self):
        """Hashing vectorizers must agree once the blocks are stacked."""
        docs, docs_rdd = self.make_text_rdd()
        sk_vec = HashingVectorizer()
        spark_vec = SparkHashingVectorizer()

        expected = sk_vec.transform(docs)
        # The distributed result is collected as a list of sparse blocks;
        # stack them vertically into a single matrix before comparing.
        blocks = spark_vec.transform(docs_rdd).collect()
        stacked = sp.vstack(blocks)
        assert_array_equal(expected.toarray(), stacked.toarray())
コード例 #13
0
    def test_transform(self):
        """Distributed ``transform`` must match a block-wise local map."""
        _, blocks_rdd = self.make_dense_rdd((100, 4))

        def squared(arr):
            return arr ** 2

        locally_mapped = [squared(arr) for arr in blocks_rdd.collect()]
        spark_mapped = blocks_rdd.transform(squared).collect()

        assert_array_equal(locally_mapped, spark_mapped)
コード例 #14
0
    def test_same_output(self):
        """Hashing vectorizers must produce the same dense matrix."""
        docs, docs_rdd = self.make_text_rdd()
        sk_vec = HashingVectorizer()
        spark_vec = SparkHashingVectorizer()

        expected = sk_vec.transform(docs).toarray()
        actual = spark_vec.transform(docs_rdd).toarray()
        assert_array_equal(expected, actual)
コード例 #15
0
    def test_same_output(self):
        """Count vectorizers must share vocabulary and count matrix."""
        docs, docs_rdd = self.make_text_rdd()
        sk_vec = CountVectorizer()
        spark_vec = SparkCountVectorizer()

        expected = sk_vec.fit_transform(docs).toarray()
        actual = spark_vec.fit_transform(docs_rdd).toarray()

        assert_equal(sk_vec.vocabulary_, spark_vec.vocabulary_)
        assert_array_equal(expected, actual)
コード例 #16
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
    def test_transform(self):
        """``ArrayRDD.transform`` must apply a function to every block.

        The expectation is built by applying the same function to each
        collected block locally and comparing with the distributed result.
        """
        data = np.arange(400).reshape((100, 4))
        rdd = self.sc.parallelize(data, 4)
        X = ArrayRDD(rdd, 5)

        fn = lambda x: x ** 2
        # Build a real list: on Python 3 ``map`` returns a lazy iterator,
        # which ``assert_array_equal`` cannot compare element-wise.
        X1 = [fn(chunk) for chunk in X.collect()]
        X2 = X.transform(fn).collect()

        assert_array_equal(X1, X2)
コード例 #17
0
    def test_same_output(self):
        """Dict vectorizers must share vocabulary and feature matrix."""
        records, records_rdd = self.make_dict_dataset()
        sk_vec = DictVectorizer()
        spark_vec = SparkDictVectorizer()

        expected = sk_vec.fit_transform(records)
        # Stack the collected per-block results into one sparse matrix.
        blocks = spark_vec.fit_transform(records_rdd).collect()
        stacked = sp.vstack(blocks)

        assert_equal(sk_vec.vocabulary_, spark_vec.vocabulary_)
        assert_array_equal(expected.toarray(), stacked.toarray())
コード例 #18
0
    def test_same_inverse_transform(self):
        """``inverse_transform`` must decode identically on both encoders."""
        labels, labels_rdd = self.make_dense_randint_rdd(
            (1000,), low_high=(0, 10))

        sk_encoder = LabelEncoder().fit(labels)
        spark_encoder = SparkLabelEncoder().fit(labels_rdd)

        expected = sk_encoder.inverse_transform(labels)
        actual = spark_encoder.inverse_transform(labels_rdd).toarray()
        assert_array_equal(expected, actual)
コード例 #19
0
    def test_same_output(self):
        """Local and Spark ``CountVectorizer`` must give equal results."""
        corpus, corpus_rdd = self.make_text_rdd()
        reference = CountVectorizer()
        distributed = SparkCountVectorizer()

        counts_local = reference.fit_transform(corpus).toarray()
        counts_dist = distributed.fit_transform(corpus_rdd).toarray()

        # Compare the learned vocabulary first, then the count matrix.
        assert_equal(reference.vocabulary_, distributed.vocabulary_)
        assert_array_equal(counts_local, counts_dist)
コード例 #20
0
    def test_same_output_sparse(self):
        """With ``sparse=True`` the Spark output must be sparse and equal."""
        records, records_rdd = self.make_dict_dataset()
        sk_vec = DictVectorizer(sparse=True)
        spark_vec = SparkDictVectorizer(sparse=True)

        expected = sk_vec.fit_transform(records)
        actual = spark_vec.fit_transform(records_rdd)

        # The distributed result must hold scipy sparse matrices.
        is_sparse = check_rdd_dtype(actual, (sp.spmatrix,))
        assert_true(is_sparse)
        assert_equal(sk_vec.vocabulary_, spark_vec.vocabulary_)
        assert_array_equal(expected.toarray(), actual.toarray())
コード例 #21
0
    def test_same_output_sparse(self):
        """Sparse dict vectorization must match scikit-learn's output."""
        samples, samples_rdd = self.make_dict_dataset()
        reference = DictVectorizer(sparse=True)
        distributed = SparkDictVectorizer(sparse=True)

        matrix_local = reference.fit_transform(samples)
        matrix_dist = distributed.fit_transform(samples_rdd)

        sparse_types = (sp.spmatrix,)
        assert_true(check_rdd_dtype(matrix_dist, sparse_types))
        assert_equal(reference.vocabulary_, distributed.vocabulary_)
        assert_array_equal(matrix_local.toarray(), matrix_dist.toarray())
コード例 #22
0
    def test_convert_toarray(self):
        """``toarray`` must give back the parallelized data as one array."""
        # Case 1: a numpy range, with an explicit second ArrayRDD argument.
        values = np.arange(400)
        blocked = ArrayRDD(self.sc.parallelize(values, 4), 5)
        assert_array_equal(blocked.toarray(), values)

        # Case 2: a plain Python list, with ArrayRDD defaults.
        values = [2, 3, 5, 1, 6, 7, 9, 9]
        blocked = ArrayRDD(self.sc.parallelize(values, 2))
        assert_array_equal(blocked.toarray(), np.array(values))
コード例 #23
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
    def test_convert_toarray(self):
        """Collecting an ``ArrayRDD`` via ``toarray`` preserves the data."""
        numbers = np.arange(400)
        source = self.sc.parallelize(numbers, 4)
        collected = ArrayRDD(source, 5).toarray()
        assert_array_equal(collected, numbers)

        numbers = [2, 3, 5, 1, 6, 7, 9, 9]
        source = self.sc.parallelize(numbers, 2)
        collected = ArrayRDD(source).toarray()
        assert_array_equal(collected, np.array(numbers))
コード例 #24
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
    def test_sum(self):
        """``ArrayRDD.sum`` must match ``ndarray.sum`` overall and per axis."""
        # Two-dimensional data: total, then axes 0 and 1.
        data = np.arange(400).reshape((100, 4))
        rdd = self.sc.parallelize(data)
        assert_equal(ArrayRDD(rdd).sum(), data.sum())
        for axis in (0, 1):
            assert_array_equal(ArrayRDD(rdd).sum(axis=axis),
                               data.sum(axis=axis))

        # Three-dimensional data: total, then axes 0, 1 and 2.
        data = np.arange(600).reshape((100, 3, 2))
        rdd = self.sc.parallelize(data)
        assert_equal(ArrayRDD(rdd).sum(), data.sum())
        for axis in (0, 1, 2):
            assert_array_equal(ArrayRDD(rdd).sum(axis=axis),
                               data.sum(axis=axis))
コード例 #25
0
ファイル: test_text.py プロジェクト: HendryLi/sparkit-learn
    def test_dummy_analyzer(self):
        """Hashing vectorizers must agree on pre-tokenized input.

        Documents are split into tokens up front and both vectorizers
        get an identity ``analyzer``, so only the hashing step is compared
        for ``transform`` and ``fit_transform``.
        """
        X, X_rdd = self.make_text_rdd()

        def splitter(x):
            return x.split()

        # Materialise the token lists. On Python 3 ``map`` returns a
        # one-shot iterator, but ``X`` is consumed twice below and the RDD
        # elements must be concrete lists rather than ``map`` objects.
        X = list(map(splitter, X))
        X_rdd = X_rdd.map(lambda x: list(map(splitter, x)))

        local = HashingVectorizer(analyzer=lambda x: x)
        dist = SparkHashingVectorizer(analyzer=lambda x: x)

        result_local = local.transform(X)
        result_dist = sp.vstack(dist.transform(X_rdd).collect())
        assert_array_equal(result_local.toarray(), result_dist.toarray())

        result_local = local.fit_transform(X)
        result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())
        assert_array_equal(result_local.toarray(), result_dist.toarray())
コード例 #26
0
    def test_same_fit_transforms(self):
        """First SVD component must match the local one up to sign.

        Shapes must be identical; the first column is compared with an
        absolute tolerance and is accepted with either sign.
        """
        X, X_rdd = self.make_dense_rdd((1e3, 12))

        n_components, seed, svd_tol = 4, 42, 1e-7
        sk_svd = TruncatedSVD(n_components, n_iter=5, tol=svd_tol,
                              random_state=seed)
        spark_svd = SparkTruncatedSVD(n_components, n_iter=50, tol=svd_tol,
                                      random_state=seed)

        Z_local = sk_svd.fit_transform(X)
        Z_dist = spark_svd.fit_transform(X_rdd).toarray()

        assert_array_equal(Z_local.shape, Z_dist.shape)

        atol = 1e-1
        first_local, first_dist = Z_local[:, 0], Z_dist[:, 0]
        same_sign = np.allclose(first_dist, first_local, atol=atol)
        flipped_sign = np.allclose(-first_dist, first_local, atol=atol)
        assert same_sign or flipped_sign
コード例 #27
0
    def test_get_multiple_items(self):
        """Indexing a ``DictRDD`` by (blocks, columns) selects both ways."""
        x, y = np.arange(80).reshape((40, 2)), np.arange(40)
        zipped = self.sc.parallelize(x, 2).zip(self.sc.parallelize(y, 2))
        z = DictRDD(zipped, bsize=5)

        # Expected contents of the first two blocks of each column.
        x0 = np.arange(0, 10).reshape((5, 2))
        x1 = np.arange(10, 20).reshape((5, 2))
        y0 = np.arange(0, 5)
        y1 = np.arange(5, 10)

        # A scalar column index yields bare arrays ...
        assert_array_equal(z[:2, 1].collect(), [y0, y1])
        assert_array_equal(z[[0, 1], 0].collect(), [x0, x1])
        # ... while a list or slice of columns yields tuples.
        assert_multiple_tuples_equal(z[[0, 1], [1]].collect(),
                                     [(y0,), (y1,)])
        assert_multiple_tuples_equal(z[[0, 1], -1:].collect(),
                                     [(y0,), (y1,)])
        # Both blocks and columns come back in the requested order.
        assert_multiple_tuples_equal(z[[1, 0], [1, 0]].collect(),
                                     [(y1, x1), (y0, x0)])
コード例 #28
0
    def test_dummy_analyzer(self):
        """Hashing vectorizers must agree on pre-tokenized documents."""
        docs, docs_rdd = self.make_text_rdd()

        def splitter(x):
            return x.split()

        # Tokenize up front; the identity analyzers pass tokens through.
        tokens = [splitter(doc) for doc in docs]
        tokens_rdd = docs_rdd.map(
            lambda x: [splitter(doc) for doc in x])

        sk_vec = HashingVectorizer(analyzer=lambda x: x)
        spark_vec = SparkHashingVectorizer(analyzer=lambda x: x)

        expected = sk_vec.transform(tokens).toarray()
        actual = spark_vec.transform(tokens_rdd).toarray()
        assert_array_equal(expected, actual)

        expected = sk_vec.fit_transform(tokens).toarray()
        actual = spark_vec.fit_transform(tokens_rdd).toarray()
        assert_array_equal(expected, actual)
コード例 #29
0
ファイル: test_text.py プロジェクト: HendryLi/sparkit-learn
    def test_limit_features(self):
        """Vocabulary pruning options must behave the same on Spark.

        Each parameter set is checked for equal vocabulary and equal
        matrices after ``fit_transform`` and after a later ``transform``.
        """
        docs, docs_rdd = self.make_text_rdd()

        param_grid = [
            {'min_df': .5},
            {'min_df': 2, 'max_df': .9},
            {'min_df': 1, 'max_df': .6},
            {'min_df': 2, 'max_features': 3},
        ]

        for kwargs in param_grid:
            sk_vec = CountVectorizer(**kwargs)
            spark_vec = SparkCountVectorizer(**kwargs)

            expected = sk_vec.fit_transform(docs)
            # Collected blocks are stacked into a single sparse matrix.
            fitted_blocks = spark_vec.fit_transform(docs_rdd).collect()
            fitted = sp.vstack(fitted_blocks)

            assert_equal(sk_vec.vocabulary_, spark_vec.vocabulary_)
            assert_array_equal(expected.toarray(), fitted.toarray())

            transformed_blocks = spark_vec.transform(docs_rdd).collect()
            transformed = sp.vstack(transformed_blocks)
            assert_array_equal(expected.toarray(), transformed.toarray())
コード例 #30
0
    def test_same_fit_transforms(self):
        """Spark SVD output must be ndarray blocks matching up to sign.

        Checks the block dtype of the distributed result, the overall
        shape, and the first component within an absolute tolerance,
        accepting either sign.
        """
        X, X_rdd = self.make_dense_rdd((1e3, 12))

        n_components, seed, svd_tol = 4, 42, 1e-7
        sk_svd = TruncatedSVD(n_components, n_iter=5, tol=svd_tol,
                              random_state=seed)
        spark_svd = SparkTruncatedSVD(n_components, n_iter=50, tol=svd_tol,
                                      random_state=seed)

        Z_local = sk_svd.fit_transform(X)
        Z_dist = spark_svd.fit_transform(X_rdd)
        Z_collected = Z_dist.toarray()
        assert_true(check_rdd_dtype(Z_dist, (np.ndarray,)))

        assert_array_equal(Z_local.shape, Z_collected.shape)

        atol = 1e-1
        first_local, first_dist = Z_local[:, 0], Z_collected[:, 0]
        same_sign = np.allclose(first_dist, first_local, atol=atol)
        flipped_sign = np.allclose(-first_dist, first_local, atol=atol)
        assert same_sign or flipped_sign
コード例 #31
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
    def test_get_multiple_items(self):
        """``DictRDD`` indexing with lists/slices picks blocks and columns."""
        features = np.arange(80).reshape((40, 2))
        targets = np.arange(40)
        features_rdd = self.sc.parallelize(features, 2)
        targets_rdd = self.sc.parallelize(targets, 2)
        z = DictRDD(features_rdd.zip(targets_rdd), bsize=5)

        # (x, y) contents expected for blocks 0 and 1.
        first = (np.arange(0, 10).reshape((5, 2)), np.arange(0, 5))
        second = (np.arange(10, 20).reshape((5, 2)), np.arange(5, 10))

        assert_array_equal(z[:2, 1].collect(), [first[1], second[1]])
        assert_array_equal(z[[0, 1], 0].collect(), [first[0], second[0]])

        only_y = [(first[1],), (second[1],)]
        assert_multiple_tuples_equal(z[[0, 1], [1]].collect(), only_y)
        assert_multiple_tuples_equal(z[[0, 1], -1:].collect(), only_y)

        # Reversed block order and reversed column order.
        swapped = [second[::-1], first[::-1]]
        assert_multiple_tuples_equal(z[[1, 0], [1, 0]].collect(), swapped)
コード例 #32
0
 def test_subtract(self):
     """Check ``subtract``, ``-`` and ``-=`` against ``A - B``."""
     minuend, minuend_rdd = self.make_dense_rdd((8, 3))
     subtrahend, _ = self.make_dense_rdd((1, 3))
     expected = minuend - subtrahend

     # Method form, then operator form.
     via_method = minuend_rdd.subtract(subtrahend).toarray()
     assert_array_equal(via_method, expected)
     via_operator = (minuend_rdd - subtrahend).toarray()
     assert_array_equal(via_operator, expected)

     # Augmented-assignment form.
     minuend_rdd -= subtrahend
     assert_array_equal(minuend_rdd.toarray(), expected)
コード例 #33
0
 def test_multiply(self):
     """Check ``multiply``, ``*`` and ``*=`` against ``A * B``."""
     left, left_rdd = self.make_dense_rdd((8, 3))
     right, _ = self.make_dense_rdd((1, 3))
     expected = left * right

     # Method form, then operator form.
     via_method = left_rdd.multiply(right).toarray()
     assert_array_equal(via_method, expected)
     via_operator = (left_rdd * right).toarray()
     assert_array_equal(via_operator, expected)

     # Augmented-assignment form.
     left_rdd *= right
     assert_array_equal(left_rdd.toarray(), expected)
コード例 #34
0
 def test_power(self):
     """Check ``power``, ``**`` and ``**=`` against ``A ** B``."""
     base, base_rdd = self.make_dense_rdd((8, 3))
     exponent, _ = self.make_dense_rdd((1, 3))
     expected = base ** exponent

     # Method form, then operator form.
     via_method = base_rdd.power(exponent).toarray()
     assert_array_equal(via_method, expected)
     via_operator = (base_rdd ** exponent).toarray()
     assert_array_equal(via_operator, expected)

     # Augmented-assignment form.
     base_rdd **= exponent
     assert_array_equal(base_rdd.toarray(), expected)
コード例 #35
0
 def test_floor_divide(self):
     """Check ``floor_divide``, ``//`` and ``//=`` against ``A // B``."""
     numerator, numerator_rdd = self.make_dense_rdd((8, 3))
     denominator, _ = self.make_dense_rdd((1, 3))
     expected = numerator // denominator

     # Method form, then operator form.
     via_method = numerator_rdd.floor_divide(denominator).toarray()
     assert_array_equal(via_method, expected)
     via_operator = (numerator_rdd // denominator).toarray()
     assert_array_equal(via_operator, expected)

     # Augmented-assignment form.
     numerator_rdd //= denominator
     assert_array_equal(numerator_rdd.toarray(), expected)
コード例 #36
0
 def test_add(self):
     """Check ``add``, ``+`` and ``+=`` against ``A + B``."""
     left, left_rdd = self.make_dense_rdd((8, 3))
     right, _ = self.make_dense_rdd((1, 3))
     expected = left + right

     # Method form, then operator form.
     via_method = left_rdd.add(right).toarray()
     assert_array_equal(via_method, expected)
     via_operator = (left_rdd + right).toarray()
     assert_array_equal(via_operator, expected)

     # Augmented-assignment form.
     left_rdd += right
     assert_array_equal(left_rdd.toarray(), expected)
コード例 #37
0
 def test_mod(self):
     """Check ``mod``, ``%`` and ``%=`` against ``A % B``."""
     dividend, dividend_rdd = self.make_dense_rdd((8, 3))
     divisor, _ = self.make_dense_rdd((1, 3))
     expected = dividend % divisor

     # Method form, then operator form.
     via_method = dividend_rdd.mod(divisor).toarray()
     assert_array_equal(via_method, expected)
     via_operator = (dividend_rdd % divisor).toarray()
     assert_array_equal(via_operator, expected)

     # Augmented-assignment form.
     dividend_rdd %= divisor
     assert_array_equal(dividend_rdd.toarray(), expected)
コード例 #38
0
 def test_floor_divide(self):
     """Floor division must match numpy in all three spellings."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     want = lhs // rhs

     assert_array_equal(lhs_rdd.floor_divide(rhs).toarray(), want)

     quotient_rdd = lhs_rdd // rhs
     assert_array_equal(quotient_rdd.toarray(), want)

     # In-place spelling last, since it rebinds ``lhs_rdd``.
     lhs_rdd //= rhs
     assert_array_equal(lhs_rdd.toarray(), want)
コード例 #39
0
 def test_add(self):
     """Addition must match numpy in all three spellings."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     want = lhs + rhs

     assert_array_equal(lhs_rdd.add(rhs).toarray(), want)

     sum_rdd = lhs_rdd + rhs
     assert_array_equal(sum_rdd.toarray(), want)

     # In-place spelling last, since it rebinds ``lhs_rdd``.
     lhs_rdd += rhs
     assert_array_equal(lhs_rdd.toarray(), want)
コード例 #40
0
 def test_multiply(self):
     """Multiplication must match numpy in all three spellings."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     want = lhs * rhs

     assert_array_equal(lhs_rdd.multiply(rhs).toarray(), want)

     product_rdd = lhs_rdd * rhs
     assert_array_equal(product_rdd.toarray(), want)

     # In-place spelling last, since it rebinds ``lhs_rdd``.
     lhs_rdd *= rhs
     assert_array_equal(lhs_rdd.toarray(), want)
コード例 #41
0
 def test_subtract(self):
     """Subtraction must match numpy in all three spellings."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     want = lhs - rhs

     assert_array_equal(lhs_rdd.subtract(rhs).toarray(), want)

     difference_rdd = lhs_rdd - rhs
     assert_array_equal(difference_rdd.toarray(), want)

     # In-place spelling last, since it rebinds ``lhs_rdd``.
     lhs_rdd -= rhs
     assert_array_equal(lhs_rdd.toarray(), want)
コード例 #42
0
 def test_power(self):
     """Exponentiation must match numpy in all three spellings."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     want = lhs ** rhs

     assert_array_equal(lhs_rdd.power(rhs).toarray(), want)

     raised_rdd = lhs_rdd ** rhs
     assert_array_equal(raised_rdd.toarray(), want)

     # In-place spelling last, since it rebinds ``lhs_rdd``.
     lhs_rdd **= rhs
     assert_array_equal(lhs_rdd.toarray(), want)
コード例 #43
0
 def test_mod(self):
     """Modulo must match numpy in all three spellings."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     want = lhs % rhs

     assert_array_equal(lhs_rdd.mod(rhs).toarray(), want)

     modulo_rdd = lhs_rdd % rhs
     assert_array_equal(modulo_rdd.toarray(), want)

     # In-place spelling last, since it rebinds ``lhs_rdd``.
     lhs_rdd %= rhs
     assert_array_equal(lhs_rdd.toarray(), want)
コード例 #44
0
    def test_dtype(self):
        """``block`` with an explicit ``dtype`` must group per partition.

        Checks block contents, the number of blocks (one per partition)
        and that no samples are lost, for ``list`` and ``tuple`` blocks.
        """
        n_samples = 100

        # 100 strings over 10 partitions, blocked as lists.
        n_partitions = 10
        words = self.sc.parallelize(["lorem"] * n_samples, n_partitions)
        blocked = block(words, dtype=list)
        assert_array_equal(["lorem"] * 10, blocked.first())
        collected = blocked.collect()
        assert_equal(len(collected), n_partitions)
        assert_array_equal(["lorem"] * 10, collected[-1])
        assert_equal(sum(len(b) for b in collected), n_samples)

        # 100 ints over 17 partitions (uneven split), blocked as tuples.
        n_partitions = 17
        ones = self.sc.parallelize([1] * n_samples, n_partitions)
        blocked = block(ones, dtype=tuple)
        first_size = n_samples // n_partitions
        assert_array_equal((1,) * first_size, blocked.first())
        collected = blocked.collect()
        assert_equal(len(collected), n_partitions)
        assert_equal(sum(len(b) for b in collected), n_samples)
コード例 #45
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
    def test_dtype(self):
        """``block`` with an explicit ``dtype`` must group per partition.

        Checks block contents, the number of blocks (one per partition)
        and that no samples are lost, for ``list`` and ``tuple`` blocks.
        """
        n_partitions = 10
        n_samples = 100
        data = self.sc.parallelize(["lorem" for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data, dtype=list)
        assert_array_equal(["lorem"] * 10, blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_array_equal(["lorem"] * 10, blocks[-1])
        assert_equal(sum(len(b) for b in blocks), n_samples)

        n_partitions = 17
        data = self.sc.parallelize([1 for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data, dtype=tuple)
        # Floor division keeps the repeat count an int: under true
        # division (Python 3) ``/`` gives a float and ``[1] * float``
        # raises TypeError.
        assert_array_equal(tuple([1] * (n_samples // n_partitions)),
                           blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_equal(sum(len(b) for b in blocks), n_samples)
コード例 #46
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
    def test_array(self):
        """``block`` must stack per-partition arrays into 2-D blocks.

        Checks block contents, the number of blocks (one per partition)
        and that no samples are lost, for an even and an uneven split.
        """
        n_partitions = 10
        n_samples = 100
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data)
        assert_array_equal(np.ones((10, 1)), blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_array_equal(np.ones((10, 1)), blocks[-1])
        assert_equal(sum(len(b) for b in blocks), n_samples)

        n_partitions = 17
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data)
        # Floor division keeps the row count an int: under true division
        # (Python 3) ``/`` gives a float, which ``np.ones`` rejects as a
        # shape dimension.
        assert_array_equal(np.ones((n_samples // n_partitions, 1)),
                           blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_equal(sum(len(b) for b in blocks), n_samples)
コード例 #47
0
    def test_array(self):
        """Blocking one-element arrays must give one 2-D block per partition."""
        n_samples = 100

        # Even split: 10 partitions of 10 samples each.
        n_partitions = 10
        samples = [np.array([1]) for _ in range(n_samples)]
        blocked = block(self.sc.parallelize(samples, n_partitions))
        full_block = np.ones((10, 1))
        assert_array_equal(full_block, blocked.first())
        collected = blocked.collect()
        assert_equal(len(collected), n_partitions)
        assert_array_equal(full_block, collected[-1])
        assert_equal(sum(len(b) for b in collected), n_samples)

        # Uneven split: 17 partitions do not divide 100 samples evenly.
        n_partitions = 17
        samples = [np.array([1]) for _ in range(n_samples)]
        blocked = block(self.sc.parallelize(samples, n_partitions))
        first_rows = n_samples // n_partitions
        assert_array_equal(np.ones((first_rows, 1)), blocked.first())
        collected = blocked.collect()
        assert_equal(len(collected), n_partitions)
        assert_equal(sum(len(b) for b in collected), n_samples)
コード例 #48
0
    def test_get_multiple_item(self):
        """Indexing with a list of block indices returns them in order."""
        _, X_rdd = self.make_dense_range_rdd((100, 4), block_size=5)

        def nth_block(i):
            # Expected block ``i``: 20 consecutive values shaped 5x4.
            return np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))

        b0, b1, b2 = nth_block(0), nth_block(1), nth_block(2)
        b7, b18, b19 = nth_block(7), nth_block(18), nth_block(19)

        assert_array_equal(X_rdd[[0, 1]].collect(), [b0, b1])
        assert_array_equal(X_rdd[[0, 2]].collect(), [b0, b2])
        # Negative indices are expected to count from the last block.
        assert_array_equal(X_rdd[[0, -1]].collect(), [b0, b19])
        assert_array_equal(X_rdd[[0, -2]].collect(), [b0, b18])
        assert_array_equal(X_rdd[[1, -2]].collect(), [b1, b18])
        # The requested order is expected to be preserved.
        assert_array_equal(X_rdd[[7, 0]].collect(), [b7, b0])
        assert_array_equal(X_rdd[[1, 2, 7, 19]].collect(),
                           [b1, b2, b7, b19])
コード例 #49
0
    def test_get_single_item(self):
        """``z[block, column]`` must select one block of one column."""
        x, y = np.arange(80).reshape((40, 2)), np.arange(40)
        zipped = self.sc.parallelize(x, 2).zip(self.sc.parallelize(y, 2))
        z = DictRDD(zipped, bsize=5)

        def x_block(i):
            # Expected block ``i`` of the first column (5 rows x 2).
            return np.arange(10 * i, 10 * (i + 1)).reshape((5, 2))

        def y_block(i):
            # Expected block ``i`` of the second column (5 values).
            return np.arange(5 * i, 5 * (i + 1))

        assert_array_equal(z[0, 0].first(), x_block(0))
        assert_array_equal(z[0, 1].first(), y_block(0))

        assert_array_equal(z[3, 0].first(), x_block(3))
        assert_array_equal(z[3, 1].first(), y_block(3))
        # Disabled check kept for reference (reason not visible here):
        # assert_array_equal(z[3, -1].first(), np.arange(15, 20))

        assert_array_equal(z[7, 0].first(), x_block(7))
        assert_array_equal(z[-1, 0].first(), x_block(7))
        assert_array_equal(z[7, 1].first(), y_block(7))
コード例 #50
0
 def test_flatten(self):
     """``flatten`` on the RDD must equal ``ndarray.flatten`` locally."""
     dense, dense_rdd = self.make_dense_rdd((100, 3, 2))
     expected = dense.flatten()
     flattened_rdd = dense_rdd.flatten()
     assert_array_equal(flattened_rdd.toarray(), expected)
コード例 #51
0
 def test_remainder(self):
     """Distributed ``remainder`` must reproduce ``np.remainder``."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     want = np.remainder(lhs, rhs)
     got = lhs_rdd.remainder(rhs).toarray()
     assert_array_equal(got, want)
コード例 #52
0
 def test_fmod(self):
     """Distributed ``fmod`` must reproduce ``np.fmod``."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     want = np.fmod(lhs, rhs)
     got = lhs_rdd.fmod(rhs).toarray()
     assert_array_equal(got, want)
コード例 #53
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
    def test_array_slice_syntax(self):
        """Slicing an ``ArrayRDD`` must select blocks like a sequence."""
        source = self.sc.parallelize(np.arange(400).reshape((100, 4)), 4)
        X = ArrayRDD(source, 5)

        def nth_block(i):
            # Expected block ``i``: 20 consecutive values shaped 5x4.
            return np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))

        b0, b1 = nth_block(0), nth_block(1)
        b7, b8, b9 = nth_block(7), nth_block(8), nth_block(9)
        b18, b19 = nth_block(18), nth_block(19)

        # Open-ended and negative bounds.
        assert_array_equal(X[:1].collect(), [b0])
        assert_array_equal(X[:2].collect(), [b0, b1])
        assert_array_equal(X[18:].collect(), [b18, b19])
        assert_array_equal(X[-1:].collect(), [b19])
        assert_array_equal(X[-2:].collect(), [b18, b19])
        # Bounded ranges, with and without a step.
        assert_array_equal(X[7:10].collect(), [b7, b8, b9])
        assert_array_equal(X[7:10:2].collect(), [b7, b9])
        assert_array_equal(X[::9].collect(), [b0, b9, b18])
        # Negative step, and a range that selects nothing.
        assert_array_equal(X[::-10].collect(), [b19, b9])
        assert_array_equal(X[-1:1].collect(), [])
コード例 #54
0
 def test_true_divide(self):
     """Distributed ``true_divide`` must reproduce ``A / B``."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     want = lhs / rhs
     got = lhs_rdd.true_divide(rhs).toarray()
     assert_array_equal(got, want)
コード例 #55
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
 def test_mean(self):
     """``ArrayRDD.mean`` must match ``ndarray.mean``, overall and by axis."""
     values = np.arange(600).reshape((100, 3, 2))
     source = self.sc.parallelize(values)
     assert_equal(ArrayRDD(source).mean(), values.mean())
     for axis in (0, 1):
         assert_array_equal(ArrayRDD(source).mean(axis=axis),
                            values.mean(axis=axis))
コード例 #56
0
ファイル: test_rdd.py プロジェクト: schevalier/sparkit-learn
    def test_get_single_item(self):
        """A (block, column) index on a ``DictRDD`` yields a single block."""
        features = np.arange(80).reshape((40, 2))
        targets = np.arange(40)
        features_rdd = self.sc.parallelize(features, 2)
        targets_rdd = self.sc.parallelize(targets, 2)
        z = DictRDD(features_rdd.zip(targets_rdd), bsize=5)

        # Block 0 of each column.
        first_x = np.arange(0, 10).reshape((5, 2))
        assert_array_equal(z[0, 0].first(), first_x)
        assert_array_equal(z[0, 1].first(), np.arange(5))

        # Block 3 of each column.
        fourth_x = np.arange(30, 40).reshape((5, 2))
        assert_array_equal(z[3, 0].first(), fourth_x)
        assert_array_equal(z[3, 1].first(), np.arange(15, 20))
        # Disabled check kept for reference (reason not visible here):
        # assert_array_equal(z[3, -1].first(), np.arange(15, 20))

        # Last block, addressed both as 7 and as -1.
        last_x = np.arange(70, 80).reshape((5, 2))
        assert_array_equal(z[7, 0].first(), last_x)
        assert_array_equal(z[-1, 0].first(), last_x)
        assert_array_equal(z[7, 1].first(), np.arange(35, 40))
コード例 #57
0
    def test_get_single_item(self):
        """An integer index on an ``ArrayRDD`` must select one block."""
        source = self.sc.parallelize(np.arange(400).reshape((100, 4)), 4)
        X = ArrayRDD(source, 5)

        def nth_block(i):
            # Expected block ``i``: 20 consecutive values shaped 5x4.
            return np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))

        # ``first()`` on the whole RDD and on ``X[0]`` give block 0.
        assert_array_equal(X.first(), nth_block(0))
        assert_array_equal(X[0].first(), nth_block(0))

        assert_array_equal(X[1].first(), nth_block(1))

        # The last block is reachable as 19 and as -1.
        assert_array_equal(X[19].first(), nth_block(19))
        assert_array_equal(X[-1].first(), nth_block(19))

        # Likewise 17 and -3 address the same block.
        assert_array_equal(X[17].first(), nth_block(17))
        assert_array_equal(X[-3].first(), nth_block(17))
コード例 #58
0
 def test_dot(self):
     """``dot`` with a local array must match ``ndarray.dot`` both ways."""
     tall, tall_rdd = self.make_dense_rdd((20, 10))
     wide, wide_rdd = self.make_dense_rdd((10, 20))

     # (20x10) . (10x20)
     assert_array_equal(tall_rdd.dot(wide).toarray(), tall.dot(wide))
     # (10x20) . (20x10)
     assert_array_equal(wide_rdd.dot(tall).toarray(), wide.dot(tall))
コード例 #59
0
    def test_array_slice_syntax(self):
        """Slice syntax on the blocked RDD must behave like list slicing."""
        _, X_rdd = self.make_dense_range_rdd((100, 4), block_size=5)

        def nth_block(i):
            # Expected block ``i``: 20 consecutive values shaped 5x4.
            return np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))

        b0, b1 = nth_block(0), nth_block(1)
        b7, b8, b9 = nth_block(7), nth_block(8), nth_block(9)
        b18, b19 = nth_block(18), nth_block(19)

        # Open-ended and negative bounds.
        assert_array_equal(X_rdd[:1].collect(), [b0])
        assert_array_equal(X_rdd[:2].collect(), [b0, b1])
        assert_array_equal(X_rdd[18:].collect(), [b18, b19])
        assert_array_equal(X_rdd[-1:].collect(), [b19])
        assert_array_equal(X_rdd[-2:].collect(), [b18, b19])
        # Bounded ranges, with and without a step.
        assert_array_equal(X_rdd[7:10].collect(), [b7, b8, b9])
        assert_array_equal(X_rdd[7:10:2].collect(), [b7, b9])
        assert_array_equal(X_rdd[::9].collect(), [b0, b9, b18])
        # Negative step, and a range that selects nothing.
        assert_array_equal(X_rdd[::-10].collect(), [b19, b9])
        assert_array_equal(X_rdd[-1:1].collect(), [])