Esempio n. 1
0
 def test_unblocking_rdd(self):
     """Check that unblocking an ArrayRDD yields a plain RDD of elements.

     The first twelve elements taken from the unblocked RDD are compared
     with the integers 0..11.
     """
     values = np.arange(400)
     blocked = ArrayRDD(self.sc.parallelize(values, 4), 5)
     flat = blocked.unblock()
     assert_is_instance(flat, RDD)
     head = flat.take(12)
     assert_array_equal(head, np.arange(12).tolist())
Esempio n. 2
0
 def test_unblocking_rdd(self):
     """Verify ``ArrayRDD.unblock`` returns an RDD starting with 0..11."""
     source = self.sc.parallelize(np.arange(400), 4)
     unblocked = ArrayRDD(source, 5).unblock()
     assert_is_instance(unblocked, RDD)
     # Only the leading elements are inspected.
     expected_head = np.arange(12).tolist()
     assert_array_equal(unblocked.take(12), expected_head)
Esempio n. 3
0
    def test_unblock(self):
        """Check that the collected result of ``unblock`` is 0..999.

        Runs once with the default block type and once with ``dtype=tuple``.
        """
        blocked = BlockRDD(self.generate(1000, 5))
        unblocked = blocked.unblock()
        assert_is_instance(blocked, BlockRDD)
        # On Python 3 ``range`` is a lazy sequence object that never compares
        # equal to a list, so the expected values are materialised first.
        assert_equal(unblocked.collect(), list(range(1000)))

        blocked = BlockRDD(self.generate(1000, 5), dtype=tuple)
        unblocked = blocked.unblock()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked.collect(), list(range(1000)))
Esempio n. 4
0
    def test_unblock(self):
        """Verify ``unblock`` followed by ``collect`` gives the list 0..999.

        Exercised for the default block type and for ``dtype=tuple``.
        """
        expected = list(range(1000))
        for options in ({}, {'dtype': tuple}):
            blocks = BlockRDD(self.generate(1000, 5), **options)
            flat = blocks.unblock()
            assert_is_instance(blocks, BlockRDD)
            assert_equal(flat.collect(), expected)
Esempio n. 5
0
    def test_creation(self):
        """Check BlockRDD construction with the default and a custom bsize.

        The first block and the collected blocks are compared against lists
        of consecutive integers; with ``bsize=4`` only the block lengths of
        the collected result are checked.
        """
        rdd = self.generate()

        blocked = BlockRDD(rdd)
        assert_is_instance(blocked, BlockRDD)
        # A bare ``range`` only equals a list on Python 2; build the list
        # explicitly so the comparison also holds on Python 3.
        assert_equal(blocked.first(), list(range(10)))
        assert_equal(blocked.collect(), np.arange(100).reshape(10, 10).tolist())

        blocked = BlockRDD(rdd, bsize=4)
        assert_is_instance(blocked, BlockRDD)
        assert_equal(blocked.first(), list(range(4)))
        assert_equal([len(x) for x in blocked.collect()], [4, 4, 2] * 10)
Esempio n. 6
0
    def test_convert_tolist(self):
        """Check that ``ArrayRDD.tolist`` returns the original elements.

        Covers a blocked numpy range and an unblocked-size plain list.
        """
        values = np.arange(400)
        blocked = ArrayRDD(self.sc.parallelize(values, 4), 5)
        as_list = blocked.tolist()
        assert_is_instance(as_list, list)
        assert_equal(as_list, values.tolist())

        values = [2, 3, 5, 1, 6, 7, 9, 9]
        blocked = ArrayRDD(self.sc.parallelize(values, 2))
        as_list = blocked.tolist()
        assert_is_instance(as_list, list)
        assert_equal(as_list, values)
Esempio n. 7
0
    def test_convert_tolist(self):
        """Verify ``tolist`` on an ArrayRDD round-trips the input values."""
        # Case 1: numpy range, 4 partitions, second ArrayRDD argument 5.
        array_input = np.arange(400)
        first_rdd = ArrayRDD(self.sc.parallelize(array_input, 4), 5)
        result = first_rdd.tolist()
        assert_is_instance(result, list)
        assert_equal(result, array_input.tolist())

        # Case 2: short Python list, 2 partitions, default ArrayRDD arguments.
        list_input = [2, 3, 5, 1, 6, 7, 9, 9]
        second_rdd = ArrayRDD(self.sc.parallelize(list_input, 2))
        result = second_rdd.tolist()
        assert_is_instance(result, list)
        assert_equal(result, list_input)
Esempio n. 8
0
    def test_creation_from_blocked_rdds(self):
        """Check building a DictRDD from already-blocked RDDs.

        Two ArrayRDDs and one list-typed BlockRDD are combined; the first
        element is compared with the expected tuple for the default call,
        with explicit column names, and with an explicit dtype tuple.
        """
        x = np.arange(80).reshape((40, 2))
        y = np.arange(40)
        # Use real lists rather than bare ``range`` objects: on Python 3 a
        # ``range`` is not a list, while the third column is declared as list.
        z = list(range(40))
        x_rdd = ArrayRDD(self.sc.parallelize(x, 4))
        y_rdd = ArrayRDD(self.sc.parallelize(y, 4))
        z_rdd = BlockRDD(self.sc.parallelize(z, 4), dtype=list)

        expected = (
            np.arange(20).reshape(10, 2),
            np.arange(10),
            list(range(10)),
        )
        rdd = DictRDD([x_rdd, y_rdd, z_rdd])
        assert_tuple_equal(rdd.first(), expected)
        rdd = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
        assert_tuple_equal(rdd.first(), expected)
        rdd = DictRDD([x_rdd, y_rdd, z_rdd], dtype=(None, None, list))
        first = rdd.first()
        assert_tuple_equal(first, expected)
        assert_is_instance(first[2], list)
Esempio n. 9
0
    def test_creation(self):
        """Check BlockRDD construction with the default and a custom bsize.

        The default call is compared block-by-block against tuples of
        consecutive integers; with ``bsize=4`` the first block and the
        lengths of all collected blocks are checked.
        """
        source = self.generate()

        default_blocks = BlockRDD(source)
        assert_is_instance(default_blocks, BlockRDD)
        assert_equal(default_blocks.first(), tuple(range(10)))
        rows = np.arange(100).reshape(10, 10)
        assert_equal(default_blocks.collect(), [tuple(row) for row in rows])

        small_blocks = BlockRDD(source, bsize=4)
        assert_is_instance(small_blocks, BlockRDD)
        assert_equal(small_blocks.first(), tuple(range(4)))
        block_sizes = [len(block) for block in small_blocks.collect()]
        assert_equal(block_sizes, [4, 4, 2] * 10)
Esempio n. 10
0
    def test_creation(self):
        """Verify the blocks produced by BlockRDD for two block sizes."""
        base = self.generate()

        # Default block size: first block is (0, ..., 9) and the collected
        # blocks match the rows of a 10x10 grid of consecutive integers.
        wrapped = BlockRDD(base)
        assert_is_instance(wrapped, BlockRDD)
        first_expected = tuple(range(10))
        assert_equal(wrapped.first(), first_expected)
        grid = np.arange(100).reshape(10, 10)
        all_expected = [tuple(line) for line in grid]
        assert_equal(wrapped.collect(), all_expected)

        # bsize=4: first block is (0, 1, 2, 3); lengths repeat 4, 4, 2.
        wrapped = BlockRDD(base, bsize=4)
        assert_is_instance(wrapped, BlockRDD)
        first_expected = tuple(range(4))
        assert_equal(wrapped.first(), first_expected)
        lengths = [len(chunk) for chunk in wrapped.collect()]
        assert_equal(lengths, [4, 4, 2] * 10)
Esempio n. 11
0
    def test_creation_from_zipped_rdd(self):
        """Check building a DictRDD from a single zipped RDD.

        The first element is compared with the expected tuple for the
        default call, with column names, and with an explicit dtype tuple.
        """
        features = np.arange(80).reshape((40, 2))
        labels = range(40)
        feature_rdd = self.sc.parallelize(features, 4)
        label_rdd = self.sc.parallelize(labels, 4)
        pairs = feature_rdd.zip(label_rdd)

        expected = (np.arange(20).reshape(10, 2), tuple(range(10)))

        assert_tuple_equal(DictRDD(pairs).first(), expected)
        named = DictRDD(pairs, columns=('x', 'y'))
        assert_tuple_equal(named.first(), expected)
        typed = DictRDD(pairs, dtype=(np.ndarray, list))
        typed_first = typed.first()
        assert_tuple_equal(typed_first, expected)
        assert_is_instance(typed_first[1], list)
Esempio n. 12
0
    def test_creation_from_zipped_rdd(self):
        """Verify DictRDD creation from an RDD produced by ``zip``."""
        matrix = np.arange(80).reshape((40, 2))
        sequence = range(40)
        zipped = self.sc.parallelize(matrix, 4).zip(
            self.sc.parallelize(sequence, 4))

        first_expected = (np.arange(20).reshape(10, 2), tuple(range(10)))

        # Default construction.
        result = DictRDD(zipped)
        assert_tuple_equal(result.first(), first_expected)
        # Explicit column names.
        result = DictRDD(zipped, columns=('x', 'y'))
        assert_tuple_equal(result.first(), first_expected)
        # Explicit per-column dtypes; the second column must come back as list.
        result = DictRDD(zipped, dtype=(np.ndarray, list))
        head = result.first()
        assert_tuple_equal(head, first_expected)
        assert_is_instance(head[1], list)
Esempio n. 13
0
 def test_dtypes(self):
     """Check that the ``dtype`` argument controls the type of each block."""
     source = self.generate()
     # (dtype passed to BlockRDD, type expected for the first block)
     cases = (
         (list, list),
         (tuple, tuple),
         (set, set),
         (np.array, np.ndarray),
     )
     for requested, produced in cases:
         first_block = BlockRDD(source, dtype=requested).first()
         assert_is_instance(first_block, produced)
Esempio n. 14
0
 def test_dtypes(self):
     """Verify the first block's type for each supported ``dtype`` value."""
     base = self.generate()

     as_list = BlockRDD(base, dtype=list).first()
     assert_is_instance(as_list, list)

     as_tuple = BlockRDD(base, dtype=tuple).first()
     assert_is_instance(as_tuple, tuple)

     as_set = BlockRDD(base, dtype=set).first()
     assert_is_instance(as_set, set)

     # ``np.array`` is passed as the dtype; the block is an ``np.ndarray``.
     as_array = BlockRDD(base, dtype=np.array).first()
     assert_is_instance(as_array, np.ndarray)
Esempio n. 15
0
    def test_creation_from_blocked_rdds(self):
        """Check building a DictRDD from already-blocked RDDs.

        Two ArrayRDDs and one list-typed BlockRDD are combined; the first
        element is compared with the expected tuple for three call variants.
        """
        matrix = np.arange(80).reshape((40, 2))
        vector = np.arange(40)
        plain = list(range(40))
        columns = [
            ArrayRDD(self.sc.parallelize(matrix, 4)),
            ArrayRDD(self.sc.parallelize(vector, 4)),
            BlockRDD(self.sc.parallelize(plain, 4), dtype=list),
        ]

        expected = (
            np.arange(20).reshape(10, 2),
            np.arange(10),
            list(range(10)),
        )
        assert_tuple_equal(DictRDD(columns).first(), expected)
        named = DictRDD(columns, columns=('x', 'y', 'z'))
        assert_tuple_equal(named.first(), expected)
        typed_first = DictRDD(columns, dtype=(None, None, list)).first()
        assert_tuple_equal(typed_first, expected)
        assert_is_instance(typed_first[2], list)
Esempio n. 16
0
    def test_creation_from_rdds(self):
        """Check building a DictRDD from a list of plain RDDs.

        The first element is compared with the expected tuple for the
        default call, with column names, and with an explicit dtype tuple.
        """
        matrix = np.arange(80).reshape((40, 2))
        vector = np.arange(40)
        plain = list(range(40))
        sources = [
            self.sc.parallelize(matrix, 4),
            self.sc.parallelize(vector, 4),
            self.sc.parallelize(plain, 4),
        ]

        expected = (
            np.arange(20).reshape(10, 2),
            np.arange(10),
            list(range(10)),
        )
        assert_tuple_equal(DictRDD(sources).first(), expected)
        named = DictRDD(sources, columns=('x', 'y', 'z'))
        assert_tuple_equal(named.first(), expected)
        column_types = (np.ndarray, np.ndarray, list)
        typed_first = DictRDD(sources, dtype=column_types).first()
        assert_tuple_equal(typed_first, expected)
        assert_is_instance(typed_first[2], list)
Esempio n. 17
0
    def test_transform_dtype(self):
        """Check that ``transform`` returns the RDD class matching ``dtype``."""
        _, dense_rdd = self.make_dense_rdd((100, 4))

        # No dtype given: the result is an ArrayRDD.
        unchanged = dense_rdd.transform(lambda x: x)
        assert_is_instance(unchanged, ArrayRDD)
        # dtype=list: the result is a BlockRDD.
        listed = dense_rdd.transform(lambda x: x.tolist(), dtype=list)
        assert_is_instance(listed, BlockRDD)
        # dtype=sp.spmatrix: the result is a SparseRDD.
        sparse = dense_rdd.transform(lambda x: sp.lil_matrix(x),
                                     dtype=sp.spmatrix)
        assert_is_instance(sparse, SparseRDD)
Esempio n. 18
0
    def test_transform_dtype(self):
        """Verify the class of the RDD returned by ``transform`` per dtype.

        Identity without dtype gives ArrayRDD, ``dtype=list`` gives BlockRDD
        and ``dtype=sp.spmatrix`` gives SparseRDD.
        """
        source = self.make_dense_rdd((100, 4))[1]

        result = source.transform(lambda block: block)
        assert_is_instance(result, ArrayRDD)

        result = source.transform(lambda block: block.tolist(), dtype=list)
        assert_is_instance(result, BlockRDD)

        result = source.transform(lambda block: sp.lil_matrix(block),
                                  dtype=sp.spmatrix)
        assert_is_instance(result, SparseRDD)
Esempio n. 19
0
    def test_initialization(self):
        """Check which inputs ArrayRDD accepts at construction time.

        A plain Python list must raise TypeError; an RDD must be accepted
        with no, an integer, or a ``None`` second argument.
        """
        sample_count = 100
        partition_count = 4

        samples = [np.array([1, 2]) for _ in range(sample_count)]
        source = self.sc.parallelize(samples, partition_count)

        # A list is rejected whatever the second positional argument is.
        for extra in ((), (False,), (10,)):
            assert_raises(TypeError, ArrayRDD, samples, *extra)

        # An RDD is accepted for each second-argument variant.
        for extra in ((), (10,), (None,)):
            assert_is_instance(ArrayRDD(source, *extra), ArrayRDD)
Esempio n. 20
0
    def test_initialization(self):
        """Verify ArrayRDD rejects plain lists and accepts RDD inputs."""
        partitions, count = 4, 100

        raw = [np.array([1, 2]) for _ in range(count)]
        distributed = self.sc.parallelize(raw, partitions)

        # Non-RDD input raises TypeError.
        assert_raises(TypeError, ArrayRDD, raw)
        assert_raises(TypeError, ArrayRDD, raw, False)
        assert_raises(TypeError, ArrayRDD, raw, 10)

        # RDD input produces an ArrayRDD for each second-argument variant.
        plain = ArrayRDD(distributed)
        assert_is_instance(plain, ArrayRDD)
        sized = ArrayRDD(distributed, 10)
        assert_is_instance(sized, ArrayRDD)
        unsized = ArrayRDD(distributed, None)
        assert_is_instance(unsized, ArrayRDD)
Esempio n. 21
0
    def test_tolist(self):
        """Check that ``BlockRDD.tolist`` returns the list 0..999.

        Exercised for the default block type, ``dtype=tuple`` and
        ``dtype=np.array``.
        """
        expected = list(range(1000))
        for options in ({}, {'dtype': tuple}, {'dtype': np.array}):
            blocks = BlockRDD(self.generate(1000, 5), **options)
            flattened = blocks.tolist()
            assert_is_instance(blocks, BlockRDD)
            assert_equal(flattened, expected)
Esempio n. 22
0
    def test_tolist(self):
        """Check that ``BlockRDD.tolist`` returns the values 0..999.

        Exercised for the default block type, ``dtype=tuple`` and
        ``dtype=np.array``.
        """
        # On Python 3 ``range`` is a lazy sequence object that never compares
        # equal to a list, so the expected values are materialised once here.
        expected = list(range(1000))

        blocked = BlockRDD(self.generate(1000, 5))
        unblocked = blocked.tolist()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked, expected)

        blocked = BlockRDD(self.generate(1000, 5), dtype=tuple)
        unblocked = blocked.tolist()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked, expected)

        blocked = BlockRDD(self.generate(1000, 5), dtype=np.array)
        unblocked = blocked.tolist()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked, expected)
Esempio n. 23
0
    def test_initialization(self):
        """Check which inputs DictRDD accepts at construction time.

        A plain Python list must raise TypeError. An RDD must be accepted
        with no ``bsize``, an integer ``bsize`` and ``bsize=None``, and each
        resulting object must be both a DictRDD and a BlockRDD.
        """
        n_partitions = 4
        n_samples = 100

        data = [(1, 2) for _ in range(n_samples)]
        rdd = self.sc.parallelize(data, n_partitions)

        assert_raises(TypeError, DictRDD, data)
        assert_raises(TypeError, DictRDD, data, bsize=False)
        assert_raises(TypeError, DictRDD, data, bsize=10)

        # Check both classes on the SAME instance for every bsize variant.
        # Previously the BlockRDD check was always run against DictRDD(rdd),
        # so the bsize=10 and bsize=None objects were never verified.
        for kwargs in ({}, {'bsize': 10}, {'bsize': None}):
            blocked = DictRDD(rdd, **kwargs)
            assert_is_instance(blocked, DictRDD)
            assert_is_instance(blocked, BlockRDD)
Esempio n. 24
0
    def test_initialization(self):
        """Verify DictRDD rejects plain lists and accepts RDD inputs.

        For each accepted ``bsize`` variant the constructed object is
        checked against both DictRDD and BlockRDD.
        """
        n_partitions = 4
        n_samples = 100

        data = [(1, 2) for _ in range(n_samples)]
        rdd = self.sc.parallelize(data, n_partitions)

        # Non-RDD input raises TypeError regardless of bsize.
        assert_raises(TypeError, DictRDD, data)
        assert_raises(TypeError, DictRDD, data, bsize=False)
        assert_raises(TypeError, DictRDD, data, bsize=10)

        # The original repeated the BlockRDD assertion on DictRDD(rdd) after
        # each variant, leaving the bsize=10 / bsize=None objects unchecked;
        # assert both classes on each constructed instance instead.
        default_rdd = DictRDD(rdd)
        assert_is_instance(default_rdd, DictRDD)
        assert_is_instance(default_rdd, BlockRDD)

        sized_rdd = DictRDD(rdd, bsize=10)
        assert_is_instance(sized_rdd, DictRDD)
        assert_is_instance(sized_rdd, BlockRDD)

        unsized_rdd = DictRDD(rdd, bsize=None)
        assert_is_instance(unsized_rdd, DictRDD)
        assert_is_instance(unsized_rdd, BlockRDD)