Ejemplo n.º 1
0
 def test_true_divide(self):
     """The distributed ``true_divide`` must agree with NumPy's ``/``."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     expected = lhs / rhs
     actual = lhs_rdd.true_divide(rhs).toarray()
     assert_array_equal(actual, expected)
Ejemplo n.º 2
0
 def test_unblocking_rdd(self):
     """``unblock()`` must return a plain RDD whose leading items match the input."""
     values = np.arange(400)
     blocked = ArrayRDD(self.sc.parallelize(values, 4), 5)
     flat = blocked.unblock()
     assert_is_instance(flat, RDD)
     head = flat.take(12)
     assert_array_equal(head, list(range(12)))
Ejemplo n.º 3
0
 def test_unblocking_rdd(self):
     """Unblocking an ArrayRDD gives back an ``RDD`` starting with 0..11."""
     source = self.sc.parallelize(np.arange(400), 4)
     unblocked = ArrayRDD(source, 5).unblock()

     assert_is_instance(unblocked, RDD)

     first_twelve = unblocked.take(12)
     expected = np.arange(12).tolist()
     assert_array_equal(first_twelve, expected)
Ejemplo n.º 4
0
 def test_remainder(self):
     """The distributed ``remainder`` must agree with ``np.remainder``."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     expected = np.remainder(lhs, rhs)
     actual = lhs_rdd.remainder(rhs).toarray()
     assert_array_equal(actual, expected)
Ejemplo n.º 5
0
    def test_same_classes(self):
        """Local and Spark label encoders must learn identical ``classes_``."""
        labels, labels_rdd = self.make_dense_randint_rdd(
            low=0, high=10, shape=(1000,))

        reference = LabelEncoder().fit(labels)
        distributed = SparkLabelEncoder().fit(labels_rdd)

        assert_array_equal(reference.classes_, distributed.classes_)
Ejemplo n.º 6
0
    def test_same_fit_transform(self):
        """``fit_transform`` must give the same labels locally and on Spark."""
        labels, labels_rdd = self.make_dense_randint_rdd(
            low=0, high=10, shape=(1000,))

        reference = LabelEncoder()
        distributed = SparkLabelEncoder()

        expected = reference.fit_transform(labels)
        actual = distributed.fit_transform(labels_rdd).toarray()
        assert_array_equal(expected, actual)
Ejemplo n.º 7
0
    def test_same_classes(self):
        """The fitted ``classes_`` of both encoders must be equal."""
        y, y_rdd = self.make_dense_randint_rdd(low=0, high=10, shape=(1000,))

        local_classes = LabelEncoder().fit(y).classes_
        dist_classes = SparkLabelEncoder().fit(y_rdd).classes_

        assert_array_equal(local_classes, dist_classes)
Ejemplo n.º 8
0
    def test_limit_features(self):
        """Feature-limiting options must behave identically locally and on Spark.

        Each parameter set is fitted with both vectorizers; vocabulary and
        output matrices are compared, then ``transform`` is re-checked.
        """
        docs, docs_rdd = self.make_text_rdd()

        settings = [
            {'min_df': .5},
            {'min_df': 2, 'max_df': .9},
            {'min_df': 1, 'max_df': .6},
            {'min_df': 2, 'max_features': 3},
        ]

        for kwargs in settings:
            reference = CountVectorizer(**kwargs)
            distributed = SparkCountVectorizer(**kwargs)

            expected = reference.fit_transform(docs).toarray()
            fitted = distributed.fit_transform(docs_rdd).toarray()

            assert_equal(reference.vocabulary_, distributed.vocabulary_)
            assert_array_equal(expected, fitted)

            # A plain transform after fitting must reproduce the same matrix.
            transformed = distributed.transform(docs_rdd).toarray()
            assert_array_equal(expected, transformed)
Ejemplo n.º 9
0
 def test_fmod(self):
     """The distributed ``fmod`` must agree with ``np.fmod``."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     expected = np.fmod(lhs, rhs)
     actual = lhs_rdd.fmod(rhs).toarray()
     assert_array_equal(actual, expected)
Ejemplo n.º 10
0
    def test_same_fit_transform(self):
        """Encoded labels from ``fit_transform`` must match across both encoders."""
        y, y_rdd = self.make_dense_randint_rdd(low=0, high=10, shape=(1000,))

        encoded_local = LabelEncoder().fit_transform(y)
        encoded_dist = SparkLabelEncoder().fit_transform(y_rdd).toarray()

        assert_array_equal(encoded_local, encoded_dist)
Ejemplo n.º 11
0
    def test_transform(self):
        """``transform`` must apply the function to each collected block."""
        _, blocks_rdd = self.make_dense_rdd((100, 4))

        square = lambda x: x**2
        expected = [square(block) for block in blocks_rdd.collect()]
        actual = blocks_rdd.transform(square).collect()

        assert_array_equal(expected, actual)
Ejemplo n.º 12
0
    def test_same_output(self):
        """Hashing vectorizers must produce the same matrix locally and on Spark."""
        docs, docs_rdd = self.make_text_rdd()
        reference = HashingVectorizer()
        distributed = SparkHashingVectorizer()

        expected = reference.transform(docs)
        # Stack the collected per-block sparse results into one matrix.
        collected = distributed.transform(docs_rdd).collect()
        stacked = sp.vstack(collected)

        assert_array_equal(expected.toarray(), stacked.toarray())
Ejemplo n.º 13
0
    def test_transform(self):
        """Squaring via ``transform`` must equal squaring each collected block."""
        _, dense_rdd = self.make_dense_rdd((100, 4))

        def square(block):
            return block ** 2

        locally_squared = list(map(square, dense_rdd.collect()))
        remotely_squared = dense_rdd.transform(square).collect()

        assert_array_equal(locally_squared, remotely_squared)
Ejemplo n.º 14
0
    def test_same_output(self):
        """Dense outputs of the local and Spark hashing vectorizers must match."""
        docs, docs_rdd = self.make_text_rdd()

        expected = HashingVectorizer().transform(docs).toarray()
        actual = SparkHashingVectorizer().transform(docs_rdd).toarray()

        assert_array_equal(expected, actual)
Ejemplo n.º 15
0
    def test_same_output(self):
        """Count vectorizers must agree on both vocabulary and counts."""
        docs, docs_rdd = self.make_text_rdd()
        reference = CountVectorizer()
        distributed = SparkCountVectorizer()

        expected = reference.fit_transform(docs).toarray()
        actual = distributed.fit_transform(docs_rdd).toarray()

        assert_equal(reference.vocabulary_, distributed.vocabulary_)
        assert_array_equal(expected, actual)
Ejemplo n.º 16
0
    def test_transform(self):
        """Check that ``ArrayRDD.transform`` applies a function to every block.

        The expected value is built locally by applying the same function to
        each block returned by ``collect()``.
        """
        data = np.arange(400).reshape((100, 4))
        rdd = self.sc.parallelize(data, 4)
        X = ArrayRDD(rdd, 5)

        fn = lambda x: x ** 2
        # Materialize the result: on Python 3 ``map`` returns a lazy iterator,
        # which cannot be compared element-wise against the collected list.
        X1 = list(map(fn, X.collect()))
        X2 = X.transform(fn).collect()

        assert_array_equal(X1, X2)
Ejemplo n.º 17
0
    def test_same_output(self):
        """Dict vectorizers must agree on vocabulary and on the stacked output."""
        records, records_rdd = self.make_dict_dataset()
        reference = DictVectorizer()
        distributed = SparkDictVectorizer()

        expected = reference.fit_transform(records)
        # Stack the collected per-block results into one sparse matrix.
        blocks = distributed.fit_transform(records_rdd).collect()
        stacked = sp.vstack(blocks)

        assert_equal(reference.vocabulary_, distributed.vocabulary_)
        assert_array_equal(expected.toarray(), stacked.toarray())
Ejemplo n.º 18
0
    def test_same_inverse_transform(self):
        """``inverse_transform`` must give the same result locally and on Spark."""
        labels, labels_rdd = self.make_dense_randint_rdd(
            (1000,), low_high=(0, 10))

        reference = LabelEncoder().fit(labels)
        distributed = SparkLabelEncoder().fit(labels_rdd)

        expected = reference.inverse_transform(labels)
        actual = distributed.inverse_transform(labels_rdd).toarray()
        assert_array_equal(expected, actual)
Ejemplo n.º 19
0
    def test_same_output(self):
        """Local and Spark count vectorizers must yield equal vocabularies and counts."""
        corpus, corpus_rdd = self.make_text_rdd()
        local_vec = CountVectorizer()
        spark_vec = SparkCountVectorizer()

        local_counts = local_vec.fit_transform(corpus).toarray()
        spark_counts = spark_vec.fit_transform(corpus_rdd).toarray()

        assert_equal(local_vec.vocabulary_, spark_vec.vocabulary_)
        assert_array_equal(local_counts, spark_counts)
    def test_same_output_sparse(self):
        """With ``sparse=True`` the Spark output must hold sparse blocks equal to the local result."""
        records, records_rdd = self.make_dict_dataset()
        reference = DictVectorizer(sparse=True)
        distributed = SparkDictVectorizer(sparse=True)

        expected = reference.fit_transform(records)
        actual_rdd = distributed.fit_transform(records_rdd)

        is_sparse = check_rdd_dtype(actual_rdd, (sp.spmatrix,))
        assert_true(is_sparse)
        assert_equal(reference.vocabulary_, distributed.vocabulary_)
        assert_array_equal(expected.toarray(), actual_rdd.toarray())
Ejemplo n.º 21
0
    def test_same_output_sparse(self):
        """Sparse dict vectorization must match between the local and Spark estimators."""
        samples, samples_rdd = self.make_dict_dataset()
        local_vec = DictVectorizer(sparse=True)
        spark_vec = SparkDictVectorizer(sparse=True)

        local_out = local_vec.fit_transform(samples)
        spark_out = spark_vec.fit_transform(samples_rdd)

        # The distributed result is expected to carry scipy sparse matrices.
        assert_true(check_rdd_dtype(spark_out, (sp.spmatrix,)))
        assert_equal(local_vec.vocabulary_, spark_vec.vocabulary_)

        dense_local = local_out.toarray()
        dense_spark = spark_out.toarray()
        assert_array_equal(dense_local, dense_spark)
Ejemplo n.º 22
0
    def test_convert_toarray(self):
        """``toarray()`` must reassemble the input, with and without a block size."""
        # Explicit block size of 5 over four partitions.
        values = np.arange(400)
        blocked = ArrayRDD(self.sc.parallelize(values, 4), 5)
        assert_array_equal(blocked.toarray(), values)

        # No block size given, built from a plain Python list.
        values = [2, 3, 5, 1, 6, 7, 9, 9]
        blocked = ArrayRDD(self.sc.parallelize(values, 2))
        assert_array_equal(blocked.toarray(), np.array(values))
Ejemplo n.º 23
0
    def test_convert_toarray(self):
        """Round-trip data through ``ArrayRDD`` and ``toarray()``."""
        numbers = np.arange(400)
        numbers_rdd = self.sc.parallelize(numbers, 4)
        collected = ArrayRDD(numbers_rdd, 5).toarray()
        assert_array_equal(collected, numbers)

        items = [2, 3, 5, 1, 6, 7, 9, 9]
        items_rdd = self.sc.parallelize(items, 2)
        collected = ArrayRDD(items_rdd).toarray()
        assert_array_equal(collected, np.array(items))
Ejemplo n.º 24
0
    def test_sum(self):
        """``ArrayRDD.sum`` must match ``ndarray.sum`` overall and along each axis."""
        # Two-dimensional input.
        matrix = np.arange(400).reshape((100, 4))
        matrix_rdd = self.sc.parallelize(matrix)
        assert_equal(ArrayRDD(matrix_rdd).sum(), matrix.sum())
        for axis in (0, 1):
            assert_array_equal(ArrayRDD(matrix_rdd).sum(axis=axis),
                               matrix.sum(axis=axis))

        # Three-dimensional input.
        cube = np.arange(600).reshape((100, 3, 2))
        cube_rdd = self.sc.parallelize(cube)
        assert_equal(ArrayRDD(cube_rdd).sum(), cube.sum())
        for axis in (0, 1, 2):
            assert_array_equal(ArrayRDD(cube_rdd).sum(axis=axis),
                               cube.sum(axis=axis))
Ejemplo n.º 25
0
    def test_dummy_analyzer(self):
        """Check hashing vectorizers on pre-tokenized input with an identity analyzer.

        Documents are split into tokens up front, then both the local and the
        Spark vectorizer are run with ``analyzer=lambda x: x``. The outputs of
        ``transform`` and ``fit_transform`` must match.
        """
        X, X_rdd = self.make_text_rdd()

        def splitter(x):
            return x.split()

        # Materialize the maps: on Python 3 ``map`` is a one-shot iterator, so
        # ``X`` would be exhausted after the first ``transform`` call and the
        # RDD elements would be iterators instead of lists.
        X = list(map(splitter, X))
        X_rdd = X_rdd.map(lambda x: list(map(splitter, x)))

        local = HashingVectorizer(analyzer=lambda x: x)
        dist = SparkHashingVectorizer(analyzer=lambda x: x)

        result_local = local.transform(X)
        result_dist = sp.vstack(dist.transform(X_rdd).collect())
        assert_array_equal(result_local.toarray(), result_dist.toarray())

        result_local = local.fit_transform(X)
        result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())
        assert_array_equal(result_local.toarray(), result_dist.toarray())
Ejemplo n.º 26
0
    def test_same_fit_transforms(self):
        """Compare the first component of local and Spark truncated SVD.

        Both estimators are fitted with the same ``n_components``, ``tol`` and
        ``random_state``. The output shapes must match and the first column
        must agree within a loose tolerance, up to sign.
        """
        # Use an integer row count; a float dimension such as ``1e3`` is not a
        # valid array shape for NumPy.
        X, X_rdd = self.make_dense_rdd((1000, 12))

        n_components = 4
        random_state = 42
        tol = 1e-7
        local = TruncatedSVD(n_components, n_iter=5, tol=tol,
                             random_state=random_state)
        dist = SparkTruncatedSVD(n_components, n_iter=50, tol=tol,
                                 random_state=random_state)

        Z_local = local.fit_transform(X)
        Z_dist = dist.fit_transform(X_rdd).toarray()

        # Looser tolerance for comparing the two results with each other.
        tol = 1e-1
        assert_array_equal(Z_local.shape, Z_dist.shape)
        # The first component may come out with either sign, so accept both.
        assert (np.allclose(+Z_dist[:, 0], Z_local[:, 0], atol=tol) or
                np.allclose(-Z_dist[:, 0], Z_local[:, 0], atol=tol))
Ejemplo n.º 27
0
    def test_get_multiple_items(self):
        """Indexing a ``DictRDD`` by several blocks and columns must return the right pieces."""
        features = np.arange(80).reshape((40, 2))
        targets = np.arange(40)
        zipped = self.sc.parallelize(features, 2).zip(
            self.sc.parallelize(targets, 2))
        z = DictRDD(zipped, bsize=5)

        # Expected contents of the first two blocks.
        x_blk0 = np.arange(0, 10).reshape((5, 2))
        y_blk0 = np.arange(0, 5)
        x_blk1 = np.arange(10, 20).reshape((5, 2))
        y_blk1 = np.arange(5, 10)

        assert_array_equal(z[:2, 1].collect(), [y_blk0, y_blk1])
        assert_array_equal(z[[0, 1], 0].collect(), [x_blk0, x_blk1])

        # A list or slice column selector yields tuples instead of bare arrays.
        only_targets = [(y_blk0,), (y_blk1,)]
        assert_multiple_tuples_equal(z[[0, 1], [1]].collect(), only_targets)
        assert_multiple_tuples_equal(z[[0, 1], -1:].collect(), only_targets)

        # Both block order and column order are reversed here.
        assert_multiple_tuples_equal(z[[1, 0], [1, 0]].collect(),
                                     [(y_blk1, x_blk1), (y_blk0, x_blk0)])
Ejemplo n.º 28
0
    def test_dummy_analyzer(self):
        """Hashing vectorizers with an identity analyzer must agree on pre-tokenized input."""
        docs, docs_rdd = self.make_text_rdd()

        def splitter(x):
            return x.split()

        tokens = [splitter(doc) for doc in docs]
        tokens_rdd = docs_rdd.map(lambda x: [splitter(doc) for doc in x])

        reference = HashingVectorizer(analyzer=lambda x: x)
        distributed = SparkHashingVectorizer(analyzer=lambda x: x)

        # transform() alone.
        expected = reference.transform(tokens).toarray()
        actual = distributed.transform(tokens_rdd).toarray()
        assert_array_equal(expected, actual)

        # fit_transform() must give matching results as well.
        expected = reference.fit_transform(tokens).toarray()
        actual = distributed.fit_transform(tokens_rdd).toarray()
        assert_array_equal(expected, actual)
Ejemplo n.º 29
0
    def test_limit_features(self):
        """``min_df``/``max_df``/``max_features`` must limit features identically on Spark."""
        docs, docs_rdd = self.make_text_rdd()

        settings = (
            {'min_df': .5},
            {'min_df': 2, 'max_df': .9},
            {'min_df': 1, 'max_df': .6},
            {'min_df': 2, 'max_features': 3},
        )

        for kwargs in settings:
            reference = CountVectorizer(**kwargs)
            distributed = SparkCountVectorizer(**kwargs)

            expected = reference.fit_transform(docs)
            fitted_blocks = distributed.fit_transform(docs_rdd).collect()
            fitted = sp.vstack(fitted_blocks)

            assert_equal(reference.vocabulary_, distributed.vocabulary_)
            assert_array_equal(expected.toarray(), fitted.toarray())

            # transform() on the fitted estimator must reproduce the result.
            transformed_blocks = distributed.transform(docs_rdd).collect()
            transformed = sp.vstack(transformed_blocks)
            assert_array_equal(expected.toarray(), transformed.toarray())
Ejemplo n.º 30
0
    def test_same_fit_transforms(self):
        """Compare the first component of local and Spark truncated SVD.

        Both estimators are fitted with the same ``n_components``, ``tol`` and
        ``random_state``. The Spark result must hold ``np.ndarray`` blocks,
        have the same shape as the local result, and agree on the first
        column within a loose tolerance, up to sign.
        """
        # Use an integer row count; a float dimension such as ``1e3`` is not a
        # valid array shape for NumPy.
        X, X_rdd = self.make_dense_rdd((1000, 12))

        n_components = 4
        random_state = 42
        tol = 1e-7
        local = TruncatedSVD(n_components, n_iter=5, tol=tol,
                             random_state=random_state)
        dist = SparkTruncatedSVD(n_components, n_iter=50, tol=tol,
                                 random_state=random_state)

        Z_local = local.fit_transform(X)
        Z_dist = dist.fit_transform(X_rdd)
        Z_collected = Z_dist.toarray()
        assert_true(check_rdd_dtype(Z_dist, (np.ndarray,)))

        # Looser tolerance for comparing the two results with each other.
        tol = 1e-1
        assert_array_equal(Z_local.shape, Z_collected.shape)
        # The first component may come out with either sign, so accept both.
        assert_true(
            np.allclose(+Z_collected[:, 0], Z_local[:, 0], atol=tol) or
            np.allclose(-Z_collected[:, 0], Z_local[:, 0], atol=tol))
Ejemplo n.º 31
0
    def test_get_multiple_items(self):
        """Multi-block, multi-column ``DictRDD`` indexing must return the expected blocks."""
        x = np.arange(80).reshape((40, 2))
        y = np.arange(40)
        pairs_rdd = self.sc.parallelize(x, 2).zip(self.sc.parallelize(y, 2))
        z = DictRDD(pairs_rdd, bsize=5)

        first = (np.arange(0, 10).reshape((5, 2)), np.arange(0, 5))
        second = (np.arange(10, 20).reshape((5, 2)), np.arange(5, 10))

        # Scalar column index: bare arrays per block.
        got = z[:2, 1].collect()
        assert_array_equal(got, [first[1], second[1]])
        got = z[[0, 1], 0].collect()
        assert_array_equal(got, [first[0], second[0]])

        # List / slice column index: one-element tuples per block.
        y_tuples = [(first[1],), (second[1],)]
        got = z[[0, 1], [1]].collect()
        assert_multiple_tuples_equal(got, y_tuples)
        got = z[[0, 1], -1:].collect()
        assert_multiple_tuples_equal(got, y_tuples)

        # Reversed blocks and reversed columns.
        got = z[[1, 0], [1, 0]].collect()
        assert_multiple_tuples_equal(got, [second[::-1], first[::-1]])
Ejemplo n.º 32
0
 def test_subtract(self):
     """``subtract``, binary ``-`` and in-place ``-=`` must all match NumPy."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     expected = lhs - rhs

     via_method = lhs_rdd.subtract(rhs).toarray()
     assert_array_equal(via_method, expected)

     via_operator = (lhs_rdd - rhs).toarray()
     assert_array_equal(via_operator, expected)

     lhs_rdd -= rhs
     assert_array_equal(lhs_rdd.toarray(), expected)
Ejemplo n.º 33
0
 def test_multiply(self):
     """``multiply``, binary ``*`` and in-place ``*=`` must all match NumPy."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     expected = lhs * rhs

     via_method = lhs_rdd.multiply(rhs).toarray()
     assert_array_equal(via_method, expected)

     via_operator = (lhs_rdd * rhs).toarray()
     assert_array_equal(via_operator, expected)

     lhs_rdd *= rhs
     assert_array_equal(lhs_rdd.toarray(), expected)
Ejemplo n.º 34
0
 def test_power(self):
     """``power``, binary ``**`` and in-place ``**=`` must all match NumPy."""
     base, base_rdd = self.make_dense_rdd((8, 3))
     exponent, _ = self.make_dense_rdd((1, 3))
     expected = base ** exponent

     via_method = base_rdd.power(exponent).toarray()
     assert_array_equal(via_method, expected)

     via_operator = (base_rdd ** exponent).toarray()
     assert_array_equal(via_operator, expected)

     base_rdd **= exponent
     assert_array_equal(base_rdd.toarray(), expected)
Ejemplo n.º 35
0
 def test_floor_divide(self):
     """``floor_divide``, binary ``//`` and in-place ``//=`` must all match NumPy."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     expected = lhs // rhs

     via_method = lhs_rdd.floor_divide(rhs).toarray()
     assert_array_equal(via_method, expected)

     via_operator = (lhs_rdd // rhs).toarray()
     assert_array_equal(via_operator, expected)

     lhs_rdd //= rhs
     assert_array_equal(lhs_rdd.toarray(), expected)
Ejemplo n.º 36
0
 def test_add(self):
     """``add``, binary ``+`` and in-place ``+=`` must all match NumPy."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     expected = lhs + rhs

     via_method = lhs_rdd.add(rhs).toarray()
     assert_array_equal(via_method, expected)

     via_operator = (lhs_rdd + rhs).toarray()
     assert_array_equal(via_operator, expected)

     lhs_rdd += rhs
     assert_array_equal(lhs_rdd.toarray(), expected)
Ejemplo n.º 37
0
 def test_mod(self):
     """``mod``, binary ``%`` and in-place ``%=`` must all match NumPy."""
     lhs, lhs_rdd = self.make_dense_rdd((8, 3))
     rhs, _ = self.make_dense_rdd((1, 3))
     expected = lhs % rhs

     via_method = lhs_rdd.mod(rhs).toarray()
     assert_array_equal(via_method, expected)

     via_operator = (lhs_rdd % rhs).toarray()
     assert_array_equal(via_operator, expected)

     lhs_rdd %= rhs
     assert_array_equal(lhs_rdd.toarray(), expected)
Ejemplo n.º 38
0
 def test_floor_divide(self):
     """Floor division via method, operator and in-place operator must equal ``A // B``."""
     dense, dense_rdd = self.make_dense_rdd((8, 3))
     divisor, _ = self.make_dense_rdd((1, 3))
     want = dense // divisor

     # Named method.
     assert_array_equal(dense_rdd.floor_divide(divisor).toarray(), want)
     # Binary operator.
     quotient_rdd = dense_rdd // divisor
     assert_array_equal(quotient_rdd.toarray(), want)
     # In-place operator.
     dense_rdd //= divisor
     assert_array_equal(dense_rdd.toarray(), want)
Ejemplo n.º 39
0
 def test_add(self):
     """Addition via method, operator and in-place operator must equal ``A + B``."""
     dense, dense_rdd = self.make_dense_rdd((8, 3))
     addend, _ = self.make_dense_rdd((1, 3))
     want = dense + addend

     # Named method.
     assert_array_equal(dense_rdd.add(addend).toarray(), want)
     # Binary operator.
     sum_rdd = dense_rdd + addend
     assert_array_equal(sum_rdd.toarray(), want)
     # In-place operator.
     dense_rdd += addend
     assert_array_equal(dense_rdd.toarray(), want)
Ejemplo n.º 40
0
 def test_multiply(self):
     """Multiplication via method, operator and in-place operator must equal ``A * B``."""
     dense, dense_rdd = self.make_dense_rdd((8, 3))
     factor, _ = self.make_dense_rdd((1, 3))
     want = dense * factor

     # Named method.
     assert_array_equal(dense_rdd.multiply(factor).toarray(), want)
     # Binary operator.
     product_rdd = dense_rdd * factor
     assert_array_equal(product_rdd.toarray(), want)
     # In-place operator.
     dense_rdd *= factor
     assert_array_equal(dense_rdd.toarray(), want)
Ejemplo n.º 41
0
 def test_subtract(self):
     """Subtraction via method, operator and in-place operator must equal ``A - B``."""
     dense, dense_rdd = self.make_dense_rdd((8, 3))
     subtrahend, _ = self.make_dense_rdd((1, 3))
     want = dense - subtrahend

     # Named method.
     assert_array_equal(dense_rdd.subtract(subtrahend).toarray(), want)
     # Binary operator.
     difference_rdd = dense_rdd - subtrahend
     assert_array_equal(difference_rdd.toarray(), want)
     # In-place operator.
     dense_rdd -= subtrahend
     assert_array_equal(dense_rdd.toarray(), want)
Ejemplo n.º 42
0
 def test_power(self):
     """Exponentiation via method, operator and in-place operator must equal ``A ** B``."""
     dense, dense_rdd = self.make_dense_rdd((8, 3))
     exponent, _ = self.make_dense_rdd((1, 3))
     want = dense ** exponent

     # Named method.
     assert_array_equal(dense_rdd.power(exponent).toarray(), want)
     # Binary operator.
     powered_rdd = dense_rdd ** exponent
     assert_array_equal(powered_rdd.toarray(), want)
     # In-place operator.
     dense_rdd **= exponent
     assert_array_equal(dense_rdd.toarray(), want)
Ejemplo n.º 43
0
 def test_mod(self):
     """Modulo via method, operator and in-place operator must equal ``A % B``."""
     dense, dense_rdd = self.make_dense_rdd((8, 3))
     modulus, _ = self.make_dense_rdd((1, 3))
     want = dense % modulus

     # Named method.
     assert_array_equal(dense_rdd.mod(modulus).toarray(), want)
     # Binary operator.
     remainder_rdd = dense_rdd % modulus
     assert_array_equal(remainder_rdd.toarray(), want)
     # In-place operator.
     dense_rdd %= modulus
     assert_array_equal(dense_rdd.toarray(), want)
Ejemplo n.º 44
0
    def test_dtype(self):
        """``block`` with ``dtype=list`` / ``dtype=tuple`` must keep every sample, one block per partition."""
        n_samples = 100

        # 10 partitions divide the 100 samples evenly.
        n_partitions = 10
        words_rdd = self.sc.parallelize(["lorem"] * n_samples, n_partitions)
        blocked = block(words_rdd, dtype=list)
        assert_array_equal(["lorem"] * 10, blocked.first())
        collected = blocked.collect()
        assert_equal(len(collected), n_partitions)
        assert_array_equal(["lorem"] * 10, collected[-1])
        assert_equal(sum(len(b) for b in collected), n_samples)

        # 17 partitions do not divide the 100 samples evenly.
        n_partitions = 17
        ones_rdd = self.sc.parallelize([1] * n_samples, n_partitions)
        blocked = block(ones_rdd, dtype=tuple)
        first_size = n_samples // n_partitions
        assert_array_equal(tuple([1] * first_size), blocked.first())
        collected = blocked.collect()
        assert_equal(len(collected), n_partitions)
        assert_equal(sum(len(b) for b in collected), n_samples)
Ejemplo n.º 45
0
    def test_dtype(self):
        """Check ``block`` with an explicit container ``dtype``.

        First case: 100 strings over 10 partitions blocked as lists. The first
        and last blocks must each hold 10 items, with one block per partition
        and no sample lost.

        Second case: 100 integers over 17 partitions blocked as tuples. The
        first block must hold ``n_samples // n_partitions`` items, again with
        one block per partition and no sample lost.
        """
        n_partitions = 10
        n_samples = 100
        data = self.sc.parallelize(["lorem" for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data, dtype=list)
        assert_array_equal(["lorem"] * 10, blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_array_equal(["lorem"] * 10, blocks[-1])
        assert_equal(sum(len(b) for b in blocks), n_samples)

        n_partitions = 17
        data = self.sc.parallelize([1 for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data, dtype=tuple)
        # Floor division keeps the repeat count an int; with ``/`` Python 3
        # produces a float and ``[1] * float`` raises TypeError.
        assert_array_equal(tuple([1] * (n_samples // n_partitions)),
                           blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_equal(sum(len(b) for b in blocks), n_samples)
Ejemplo n.º 46
0
    def test_array(self):
        """Check ``block`` on an RDD of one-element NumPy arrays.

        First case: 100 arrays over 10 partitions. The first and last blocks
        must equal ``np.ones((10, 1))``, with one block per partition and no
        sample lost.

        Second case: 100 arrays over 17 partitions. The first block must have
        ``n_samples // n_partitions`` rows, again with one block per partition
        and no sample lost.
        """
        n_partitions = 10
        n_samples = 100
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data)
        assert_array_equal(np.ones((10, 1)), blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_array_equal(np.ones((10, 1)), blocks[-1])
        assert_equal(sum(len(b) for b in blocks), n_samples)

        n_partitions = 17
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data)
        # Floor division keeps the dimension an int; with ``/`` Python 3
        # produces a float, which ``np.ones`` rejects as a shape.
        assert_array_equal(np.ones((n_samples // n_partitions, 1)),
                           blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_equal(sum(len(b) for b in blocks), n_samples)
Ejemplo n.º 47
0
    def test_array(self):
        """``block`` on one-element arrays must stack them per partition without losing samples."""
        n_samples = 100

        def ones_rdd(partitions):
            # 100 single-element arrays spread over the given partition count.
            samples = [np.array([1]) for _ in range(n_samples)]
            return self.sc.parallelize(samples, partitions)

        # 10 partitions divide the 100 samples evenly.
        n_partitions = 10
        blocked = block(ones_rdd(n_partitions))
        assert_array_equal(np.ones((10, 1)), blocked.first())
        collected = blocked.collect()
        assert_equal(len(collected), n_partitions)
        assert_array_equal(np.ones((10, 1)), collected[-1])
        assert_equal(sum(len(b) for b in collected), n_samples)

        # 17 partitions do not divide the 100 samples evenly.
        n_partitions = 17
        blocked = block(ones_rdd(n_partitions))
        first_rows = n_samples // n_partitions
        assert_array_equal(np.ones((first_rows, 1)), blocked.first())
        collected = blocked.collect()
        assert_equal(len(collected), n_partitions)
        assert_equal(sum(len(b) for b in collected), n_samples)
Ejemplo n.º 48
0
    def test_get_multiple_item(self):
        """Indexing with a list of block positions must return those blocks in order."""
        _, X_rdd = self.make_dense_range_rdd((100, 4), block_size=5)

        def nth_block(i):
            # Block i holds 20 consecutive values laid out as 5 rows x 4 columns.
            return np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))

        # (requested indices, positions of the blocks expected back)
        cases = [
            ([0, 1], [0, 1]),
            ([0, 2], [0, 2]),
            ([0, -1], [0, 19]),
            ([0, -2], [0, 18]),
            ([1, -2], [1, 18]),
            ([7, 0], [7, 0]),
            ([1, 2, 7, 19], [1, 2, 7, 19]),
        ]
        for requested, positions in cases:
            assert_array_equal(X_rdd[requested].collect(),
                               [nth_block(i) for i in positions])
Ejemplo n.º 49
0
    def test_get_single_item(self):
        """``z[block, column]`` on a ``DictRDD`` must return the matching block of that column."""
        features = np.arange(80).reshape((40, 2))
        targets = np.arange(40)
        zipped = self.sc.parallelize(features, 2).zip(
            self.sc.parallelize(targets, 2))
        z = DictRDD(zipped, bsize=5)

        def x_block(i):
            return np.arange(10 * i, 10 * (i + 1)).reshape((5, 2))

        def y_block(i):
            return np.arange(5 * i, 5 * (i + 1))

        assert_array_equal(z[0, 0].first(), x_block(0))
        assert_array_equal(z[0, 1].first(), y_block(0))

        assert_array_equal(z[3, 0].first(), x_block(3))
        assert_array_equal(z[3, 1].first(), y_block(3))
        # NOTE: a negative column index (z[3, -1]) is not exercised here.

        assert_array_equal(z[7, 0].first(), x_block(7))
        assert_array_equal(z[-1, 0].first(), x_block(7))
        assert_array_equal(z[7, 1].first(), y_block(7))
Ejemplo n.º 50
0
 def test_flatten(self):
     """The distributed ``flatten`` must agree with ``ndarray.flatten``."""
     dense, dense_rdd = self.make_dense_rdd((100, 3, 2))
     expected = dense.flatten()
     flattened_rdd = dense_rdd.flatten()
     assert_array_equal(flattened_rdd.toarray(), expected)
Ejemplo n.º 51
0
 def test_remainder(self):
     """``remainder`` on the RDD must equal ``np.remainder`` on the arrays."""
     dividend, dividend_rdd = self.make_dense_rdd((8, 3))
     divisor, _ = self.make_dense_rdd((1, 3))
     want = np.remainder(dividend, divisor)
     got = dividend_rdd.remainder(divisor).toarray()
     assert_array_equal(got, want)
Ejemplo n.º 52
0
 def test_fmod(self):
     """``fmod`` on the RDD must equal ``np.fmod`` on the arrays."""
     dividend, dividend_rdd = self.make_dense_rdd((8, 3))
     divisor, _ = self.make_dense_rdd((1, 3))
     want = np.fmod(dividend, divisor)
     got = dividend_rdd.fmod(divisor).toarray()
     assert_array_equal(got, want)
Ejemplo n.º 53
0
    def test_array_slice_syntax(self):
        """Slicing an ``ArrayRDD`` must select whole blocks with list-like slice semantics."""
        values = np.arange(400).reshape((100, 4))
        X = ArrayRDD(self.sc.parallelize(values, 4), 5)

        def blocks(*positions):
            # Each block holds 20 consecutive values as 5 rows x 4 columns.
            return [np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))
                    for i in positions]

        assert_array_equal(X[:1].collect(), blocks(0))
        assert_array_equal(X[:2].collect(), blocks(0, 1))
        assert_array_equal(X[18:].collect(), blocks(18, 19))
        assert_array_equal(X[-1:].collect(), blocks(19))
        assert_array_equal(X[-2:].collect(), blocks(18, 19))
        assert_array_equal(X[7:10].collect(), blocks(7, 8, 9))
        assert_array_equal(X[7:10:2].collect(), blocks(7, 9))
        assert_array_equal(X[::9].collect(), blocks(0, 9, 18))
        assert_array_equal(X[::-10].collect(), blocks(19, 9))
        # An empty selection collects to an empty list.
        assert_array_equal(X[-1:1].collect(), blocks())
Ejemplo n.º 54
0
 def test_true_divide(self):
     """``true_divide`` on the RDD must equal ``A / B`` on the arrays."""
     dividend, dividend_rdd = self.make_dense_rdd((8, 3))
     divisor, _ = self.make_dense_rdd((1, 3))
     want = dividend / divisor
     got = dividend_rdd.true_divide(divisor).toarray()
     assert_array_equal(got, want)
Ejemplo n.º 55
0
 def test_mean(self):
     """``ArrayRDD.mean`` must match ``ndarray.mean`` overall and along axes 0 and 1."""
     cube = np.arange(600).reshape((100, 3, 2))
     cube_rdd = self.sc.parallelize(cube)
     assert_equal(ArrayRDD(cube_rdd).mean(), cube.mean())
     for axis in (0, 1):
         assert_array_equal(ArrayRDD(cube_rdd).mean(axis=axis),
                            cube.mean(axis=axis))
Ejemplo n.º 56
0
    def test_get_single_item(self):
        """Single ``(block, column)`` lookups on a ``DictRDD`` must return the expected block."""
        x = np.arange(80).reshape((40, 2))
        y = np.arange(40)
        pairs_rdd = self.sc.parallelize(x, 2).zip(self.sc.parallelize(y, 2))
        z = DictRDD(pairs_rdd, bsize=5)

        last_x_block = np.arange(70, 80).reshape((5, 2))

        # (index into z, expected first element of the selection)
        # NOTE: a negative column index (z[3, -1]) is not exercised here.
        cases = [
            ((0, 0), np.arange(0, 10).reshape((5, 2))),
            ((0, 1), np.arange(5)),
            ((3, 0), np.arange(30, 40).reshape((5, 2))),
            ((3, 1), np.arange(15, 20)),
            ((7, 0), last_x_block),
            ((-1, 0), last_x_block),
            ((7, 1), np.arange(35, 40)),
        ]
        for index, expected in cases:
            assert_array_equal(z[index].first(), expected)
Ejemplo n.º 57
0
    def test_get_single_item(self):
        """Integer indexing on an ``ArrayRDD`` must select one block, counting from either end."""
        values = np.arange(400).reshape((100, 4))
        X = ArrayRDD(self.sc.parallelize(values, 4), 5)

        def nth_block(i):
            # Each block holds 20 consecutive values as 5 rows x 4 columns.
            return np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))

        block0 = nth_block(0)
        assert_array_equal(X.first(), block0)
        assert_array_equal(X[0].first(), block0)

        assert_array_equal(X[1].first(), nth_block(1))

        # The last block is reachable as 19 and as -1.
        block19 = nth_block(19)
        assert_array_equal(X[19].first(), block19)
        assert_array_equal(X[-1].first(), block19)

        # Likewise 17 and -3 address the same block.
        block17 = nth_block(17)
        assert_array_equal(X[17].first(), block17)
        assert_array_equal(X[-3].first(), block17)
Ejemplo n.º 58
0
 def test_dot(self):
     """The distributed ``dot`` must agree with ``ndarray.dot`` in both operand orders."""
     tall, tall_rdd = self.make_dense_rdd((20, 10))
     wide, wide_rdd = self.make_dense_rdd((10, 20))

     tall_times_wide = tall_rdd.dot(wide).toarray()
     assert_array_equal(tall_times_wide, tall.dot(wide))

     wide_times_tall = wide_rdd.dot(tall).toarray()
     assert_array_equal(wide_times_tall, wide.dot(tall))
Ejemplo n.º 59
0
    def test_array_slice_syntax(self):
        """Slice indexing on the blocked RDD must return the expected blocks in order."""
        _, X_rdd = self.make_dense_range_rdd((100, 4), block_size=5)

        def nth_block(i):
            # Each block holds 20 consecutive values as 5 rows x 4 columns.
            return np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))

        # (slice applied to the RDD, positions of the blocks expected back)
        cases = [
            (slice(None, 1), [0]),
            (slice(None, 2), [0, 1]),
            (slice(18, None), [18, 19]),
            (slice(-1, None), [19]),
            (slice(-2, None), [18, 19]),
            (slice(7, 10), [7, 8, 9]),
            (slice(7, 10, 2), [7, 9]),
            (slice(None, None, 9), [0, 9, 18]),
            (slice(None, None, -10), [19, 9]),
            (slice(-1, 1), []),
        ]
        for selector, positions in cases:
            assert_array_equal(X_rdd[selector].collect(),
                               [nth_block(i) for i in positions])