def test_unblocking_rdd(self):
    """Unblocking an ArrayRDD yields a plain RDD starting with 0..11."""
    source = self.sc.parallelize(np.arange(400), 4)
    # Second positional argument is presumably the block size -- confirm.
    flat = ArrayRDD(source, 5).unblock()
    assert_is_instance(flat, RDD)
    # Only the first twelve elements are inspected.
    head = flat.take(12)
    assert_array_equal(head, np.arange(12).tolist())
def test_unblocking_rdd(self):
    """Check that ``ArrayRDD.unblock`` returns an ``RDD`` of the elements.

    The first twelve items taken from the result must equal 0..11.
    """
    values = np.arange(400)
    expected_head = np.arange(12).tolist()

    blocked = ArrayRDD(self.sc.parallelize(values, 4), 5)
    result = blocked.unblock()

    assert_is_instance(result, RDD)
    assert_array_equal(result.take(12), expected_head)
def test_unblock(self):
    """Unblocking a BlockRDD gives back the elements 0..999 in order.

    Exercised once with the default block type and once with
    ``dtype=tuple``; the input comes from ``self.generate(1000, 5)``.
    """
    # ``collect()`` returns a list, and on Python 3 a list never compares
    # equal to a ``range`` object, so the expectation is materialised.
    expected = list(range(1000))

    blocked = BlockRDD(self.generate(1000, 5))
    unblocked = blocked.unblock()
    assert_is_instance(blocked, BlockRDD)
    assert_equal(unblocked.collect(), expected)

    blocked = BlockRDD(self.generate(1000, 5), dtype=tuple)
    unblocked = blocked.unblock()
    assert_is_instance(blocked, BlockRDD)
    assert_equal(unblocked.collect(), expected)
def test_unblock(self):
    """``BlockRDD.unblock`` must flatten the blocks back to 0..999.

    Runs the same checks for the default constructor and for
    ``dtype=tuple``.
    """
    everything = list(range(1000))
    for options in ({}, {'dtype': tuple}):
        container = BlockRDD(self.generate(1000, 5), **options)
        flattened = container.unblock()
        assert_is_instance(container, BlockRDD)
        assert_equal(flattened.collect(), everything)
def test_creation(self):
    """BlockRDD groups the generated elements into blocks.

    With default settings the test expects ten blocks of ten elements
    (0..99 row by row); with ``bsize=4`` it expects the block lengths to
    follow the pattern 4, 4, 2 repeated ten times.
    """
    rdd = self.generate()

    blocked = BlockRDD(rdd)
    assert_is_instance(blocked, BlockRDD)
    # A bare ``range`` is only a list on Python 2; materialise it so the
    # comparison also holds on Python 3, in line with the ``tolist()``
    # expectation just below.
    assert_equal(blocked.first(), list(range(10)))
    assert_equal(blocked.collect(),
                 np.arange(100).reshape(10, 10).tolist())

    blocked = BlockRDD(rdd, bsize=4)
    assert_is_instance(blocked, BlockRDD)
    assert_equal(blocked.first(), list(range(4)))
    assert_equal([len(x) for x in blocked.collect()], [4, 4, 2] * 10)
def test_convert_tolist(self):
    """``ArrayRDD.tolist`` returns the elements as a Python list."""
    # numpy input, blocked with second argument 5
    values = np.arange(400)
    as_list = ArrayRDD(self.sc.parallelize(values, 4), 5).tolist()
    assert_is_instance(as_list, list)
    assert_equal(as_list, values.tolist())

    # plain-list input (with a repeated value), default blocking
    values = [2, 3, 5, 1, 6, 7, 9, 9]
    as_list = ArrayRDD(self.sc.parallelize(values, 2)).tolist()
    assert_is_instance(as_list, list)
    assert_equal(as_list, values)
def test_convert_tolist(self):
    """Round-trip data through ``ArrayRDD`` and back via ``tolist()``.

    Covers a numpy array split over 4 partitions (extra constructor
    argument 5) and a short Python list split over 2 partitions.
    """
    array_data = np.arange(400)
    list_data = [2, 3, 5, 1, 6, 7, 9, 9]
    # (input, number of partitions, extra ArrayRDD args, expected list)
    cases = (
        (array_data, 4, (5,), array_data.tolist()),
        (list_data, 2, (), list_data),
    )
    for data, n_partitions, extra, expected in cases:
        rdd = self.sc.parallelize(data, n_partitions)
        X_list = ArrayRDD(rdd, *extra).tolist()
        assert_is_instance(X_list, list)
        assert_equal(X_list, expected)
def test_creation_from_blocked_rdds(self):
    """A DictRDD can be assembled from already blocked column RDDs.

    Three columns are combined: two ``ArrayRDD`` columns and one
    ``BlockRDD`` built with ``dtype=list``.  The first block is checked for
    the plain constructor, with explicit ``columns`` and with explicit
    ``dtype``.
    """
    x = np.arange(80).reshape((40, 2))
    y = np.arange(40)
    # Real lists rather than bare ``range`` objects: the third column is
    # declared as ``dtype=list``, and on Python 3 a ``range`` is not a list.
    z = list(range(40))

    x_rdd = ArrayRDD(self.sc.parallelize(x, 4))
    y_rdd = ArrayRDD(self.sc.parallelize(y, 4))
    z_rdd = BlockRDD(self.sc.parallelize(z, 4), dtype=list)

    expected = (np.arange(20).reshape(10, 2), np.arange(10),
                list(range(10)))

    rdd = DictRDD([x_rdd, y_rdd, z_rdd])
    assert_tuple_equal(rdd.first(), expected)

    rdd = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
    assert_tuple_equal(rdd.first(), expected)

    rdd = DictRDD([x_rdd, y_rdd, z_rdd], dtype=(None, None, list))
    first = rdd.first()
    assert_tuple_equal(first, expected)
    assert_is_instance(first[2], list)
def test_creation(self):
    """BlockRDD construction with default settings and with ``bsize=4``.

    Blocks are expected to be tuples: ten blocks of ten elements by
    default, and block lengths 4, 4, 2 (repeated ten times) for
    ``bsize=4``.
    """
    source = self.generate()

    default_blocks = BlockRDD(source)
    assert_is_instance(default_blocks, BlockRDD)
    assert_equal(default_blocks.first(), tuple(range(10)))
    rows = np.arange(100).reshape(10, 10)
    assert_equal(default_blocks.collect(), [tuple(row) for row in rows])

    small_blocks = BlockRDD(source, bsize=4)
    assert_is_instance(small_blocks, BlockRDD)
    assert_equal(small_blocks.first(), tuple(range(4)))
    sizes = [len(block) for block in small_blocks.collect()]
    assert_equal(sizes, [4, 4, 2] * 10)
def test_creation(self):
    """Verify the blocks produced by ``BlockRDD`` for two block sizes."""
    rdd = self.generate()

    # Default blocking: first block is (0, ..., 9) and all blocks together
    # are the rows of the 10x10 grid of 0..99.
    blocked = BlockRDD(rdd)
    assert_is_instance(blocked, BlockRDD)
    assert_equal(blocked.first(), tuple(range(10)))
    grid = np.arange(100).reshape(10, 10)
    assert_equal(blocked.collect(), list(map(tuple, grid)))

    # bsize=4: first block is (0, 1, 2, 3); lengths cycle 4, 4, 2.
    blocked = BlockRDD(rdd, bsize=4)
    assert_is_instance(blocked, BlockRDD)
    assert_equal(blocked.first(), tuple(range(4)))
    assert_equal(list(map(len, blocked.collect())), [4, 4, 2] * 10)
def test_creation_from_zipped_rdd(self):
    """A DictRDD can be built from a single zipped two-column RDD.

    The first block is checked for the plain constructor, with explicit
    ``columns`` and with explicit ``dtype``.
    """
    features = np.arange(80).reshape((40, 2))
    labels = range(40)
    feature_rdd = self.sc.parallelize(features, 4)
    label_rdd = self.sc.parallelize(labels, 4)
    zipped = feature_rdd.zip(label_rdd)

    expected = (np.arange(20).reshape(10, 2), tuple(range(10)))

    assert_tuple_equal(DictRDD(zipped).first(), expected)
    assert_tuple_equal(DictRDD(zipped, columns=('x', 'y')).first(),
                       expected)

    # With an explicit dtype the second column must come back as a list.
    first_block = DictRDD(zipped, dtype=(np.ndarray, list)).first()
    assert_tuple_equal(first_block, expected)
    assert_is_instance(first_block[1], list)
def test_creation_from_zipped_rdd(self):
    """Build ``DictRDD`` objects from ``x_rdd.zip(y_rdd)`` and check block 0."""
    x = np.arange(80).reshape((40, 2))
    y = range(40)
    x_rdd = self.sc.parallelize(x, 4)
    y_rdd = self.sc.parallelize(y, 4)
    zipped_rdd = x_rdd.zip(y_rdd)
    expected = (np.arange(20).reshape(10, 2), tuple(range(10)))

    # Constructor variants that only need the first block compared.
    for options in ({}, {'columns': ('x', 'y')}):
        rdd = DictRDD(zipped_rdd, **options)
        assert_tuple_equal(rdd.first(), expected)

    # Explicit dtypes: also check the container type of the second column.
    typed = DictRDD(zipped_rdd, dtype=(np.ndarray, list))
    head = typed.first()
    assert_tuple_equal(head, expected)
    assert_is_instance(head[1], list)
def test_dtypes(self):
    """The ``dtype`` argument decides the container type of each block."""
    source = self.generate()
    # (dtype passed to BlockRDD, type expected for the first block)
    expectations = (
        (list, list),
        (tuple, tuple),
        (set, set),
        (np.array, np.ndarray),
    )
    for dtype, block_type in expectations:
        first_block = BlockRDD(source, dtype=dtype).first()
        assert_is_instance(first_block, block_type)
def test_dtypes(self):
    """Check the first block's type for list, tuple, set and numpy dtypes."""
    rdd = self.generate()

    as_lists = BlockRDD(rdd, dtype=list)
    assert_is_instance(as_lists.first(), list)

    as_tuples = BlockRDD(rdd, dtype=tuple)
    assert_is_instance(as_tuples.first(), tuple)

    as_sets = BlockRDD(rdd, dtype=set)
    assert_is_instance(as_sets.first(), set)

    # ``np.array`` is a factory function; the resulting type is ndarray.
    as_arrays = BlockRDD(rdd, dtype=np.array)
    assert_is_instance(as_arrays.first(), np.ndarray)
def test_creation_from_blocked_rdds(self):
    """Combine pre-blocked column RDDs into a DictRDD and check block 0.

    Columns: two ``ArrayRDD`` objects and one ``BlockRDD`` with
    ``dtype=list``.
    """
    parallelize = self.sc.parallelize
    blocks = (
        ArrayRDD(parallelize(np.arange(80).reshape((40, 2)), 4)),
        ArrayRDD(parallelize(np.arange(40), 4)),
        BlockRDD(parallelize(list(range(40)), 4), dtype=list),
    )
    expected = (np.arange(20).reshape(10, 2), np.arange(10),
                list(range(10)))

    # A fresh list is handed to every constructor call.
    assert_tuple_equal(DictRDD(list(blocks)).first(), expected)
    assert_tuple_equal(
        DictRDD(list(blocks), columns=('x', 'y', 'z')).first(), expected)

    head = DictRDD(list(blocks), dtype=(None, None, list)).first()
    assert_tuple_equal(head, expected)
    assert_is_instance(head[2], list)
def test_creation_from_rdds(self):
    """A DictRDD can be built from a list of plain (unblocked) RDDs.

    The first block is checked for the plain constructor, with explicit
    ``columns`` and with explicit ``dtype``.
    """
    raw_columns = (
        np.arange(80).reshape((40, 2)),
        np.arange(40),
        list(range(40)),
    )
    column_rdds = [self.sc.parallelize(col, 4) for col in raw_columns]
    expected = (np.arange(20).reshape(10, 2), np.arange(10),
                list(range(10)))

    # A fresh list is handed to every constructor call.
    assert_tuple_equal(DictRDD(list(column_rdds)).first(), expected)
    assert_tuple_equal(
        DictRDD(list(column_rdds), columns=('x', 'y', 'z')).first(),
        expected)

    typed = DictRDD(list(column_rdds),
                    dtype=(np.ndarray, np.ndarray, list))
    head = typed.first()
    assert_tuple_equal(head, expected)
    assert_is_instance(head[2], list)
def test_transform_dtype(self):
    """``transform`` returns a wrapper class that depends on ``dtype``."""
    _, dense_rdd = self.make_dense_rdd((100, 4))
    # (function applied per block, keyword arguments, expected wrapper)
    cases = (
        (lambda x: x, {}, ArrayRDD),
        (lambda x: x.tolist(), {'dtype': list}, BlockRDD),
        (lambda x: sp.lil_matrix(x), {'dtype': sp.spmatrix}, SparseRDD),
    )
    for func, options, wrapper in cases:
        assert_is_instance(dense_rdd.transform(func, **options), wrapper)
def test_transform_dtype(self):
    """Check the class returned by ``transform`` for three dtype settings."""
    X, X_rdd = self.make_dense_rdd((100, 4))

    # No dtype given: the result stays an ArrayRDD.
    unchanged = X_rdd.transform(lambda x: x)
    assert_is_instance(unchanged, ArrayRDD)

    # dtype=list: the result is a generic BlockRDD.
    listed = X_rdd.transform(lambda x: x.tolist(), dtype=list)
    assert_is_instance(listed, BlockRDD)

    # dtype=sp.spmatrix: the result is a SparseRDD.
    sparse = X_rdd.transform(lambda x: sp.lil_matrix(x), dtype=sp.spmatrix)
    assert_is_instance(sparse, SparseRDD)
def test_initialization(self):
    """ArrayRDD rejects a plain list and accepts an RDD.

    Both outcomes are checked without a second argument and with two
    different values for it.
    """
    n_partitions, n_samples = 4, 100
    data = [np.array([1, 2]) for _ in range(n_samples)]
    rdd = self.sc.parallelize(data, n_partitions)

    # A Python list is never acceptable, whatever the second argument.
    for extra in ((), (False,), (10,)):
        assert_raises(TypeError, ArrayRDD, data, *extra)

    # A real RDD is accepted with and without the second argument.
    for extra in ((), (10,), (None,)):
        assert_is_instance(ArrayRDD(rdd, *extra), ArrayRDD)
def test_initialization(self):
    """Constructing ``ArrayRDD``: TypeError for lists, success for RDDs."""
    partitions = 4
    sample_count = 100
    samples = [np.array([1, 2]) for _ in range(sample_count)]
    distributed = self.sc.parallelize(samples, partitions)

    # Non-RDD input
    assert_raises(TypeError, ArrayRDD, samples)
    assert_raises(TypeError, ArrayRDD, samples, False)
    assert_raises(TypeError, ArrayRDD, samples, 10)

    # RDD input
    plain = ArrayRDD(distributed)
    assert_is_instance(plain, ArrayRDD)
    sized = ArrayRDD(distributed, 10)
    assert_is_instance(sized, ArrayRDD)
    unsized = ArrayRDD(distributed, None)
    assert_is_instance(unsized, ArrayRDD)
def test_tolist(self):
    """``BlockRDD.tolist`` returns the flat list 0..999 for every dtype.

    Checked for the default constructor, ``dtype=tuple`` and
    ``dtype=np.array``.
    """
    everything = list(range(1000))
    for options in ({}, {'dtype': tuple}, {'dtype': np.array}):
        container = BlockRDD(self.generate(1000, 5), **options)
        flattened = container.tolist()
        assert_is_instance(container, BlockRDD)
        assert_equal(flattened, everything)
def test_tolist(self):
    """``BlockRDD.tolist`` flattens the blocks back to the list 0..999.

    Exercised with the default block type, ``dtype=tuple`` and
    ``dtype=np.array``; the input comes from ``self.generate(1000, 5)``.
    """
    # On Python 3 ``range(1000)`` is a lazy range object that never
    # compares equal to a list, so the expectation is materialised.
    expected = list(range(1000))

    blocked = BlockRDD(self.generate(1000, 5))
    unblocked = blocked.tolist()
    assert_is_instance(blocked, BlockRDD)
    assert_equal(unblocked, expected)

    blocked = BlockRDD(self.generate(1000, 5), dtype=tuple)
    unblocked = blocked.tolist()
    assert_is_instance(blocked, BlockRDD)
    assert_equal(unblocked, expected)

    blocked = BlockRDD(self.generate(1000, 5), dtype=np.array)
    unblocked = blocked.tolist()
    assert_is_instance(blocked, BlockRDD)
    assert_equal(unblocked, expected)
def test_initialization(self):
    """DictRDD rejects a plain list and accepts an RDD for every ``bsize``.

    Every successfully constructed object is checked against both
    ``DictRDD`` and ``BlockRDD``.
    """
    n_partitions = 4
    n_samples = 100
    data = [(1, 2) for _ in range(n_samples)]
    rdd = self.sc.parallelize(data, n_partitions)

    # A Python list is never acceptable, whatever ``bsize`` is.
    assert_raises(TypeError, DictRDD, data)
    assert_raises(TypeError, DictRDD, data, bsize=False)
    assert_raises(TypeError, DictRDD, data, bsize=10)

    # Both type checks run on the object built with the given ``bsize``,
    # so the bsize=10 and bsize=None variants are covered by the BlockRDD
    # check as well, not only the default constructor.
    for options in ({}, {'bsize': 10}, {'bsize': None}):
        blocked = DictRDD(rdd, **options)
        assert_is_instance(blocked, DictRDD)
        assert_is_instance(blocked, BlockRDD)
def test_initialization(self):
    """Constructing ``DictRDD``: TypeError for lists, success for RDDs.

    Each object built from the RDD (default, ``bsize=10``, ``bsize=None``)
    must be an instance of both ``DictRDD`` and ``BlockRDD``.
    """
    n_partitions = 4
    n_samples = 100
    data = [(1, 2) for _ in range(n_samples)]
    rdd = self.sc.parallelize(data, n_partitions)

    # Non-RDD input must be refused regardless of ``bsize``.
    assert_raises(TypeError, DictRDD, data)
    assert_raises(TypeError, DictRDD, data, bsize=False)
    assert_raises(TypeError, DictRDD, data, bsize=10)

    # Build each variant once and test that same object against both
    # classes, so the BlockRDD check exercises the requested ``bsize``.
    default = DictRDD(rdd)
    assert_is_instance(default, DictRDD)
    assert_is_instance(default, BlockRDD)

    sized = DictRDD(rdd, bsize=10)
    assert_is_instance(sized, DictRDD)
    assert_is_instance(sized, BlockRDD)

    unsized = DictRDD(rdd, bsize=None)
    assert_is_instance(unsized, DictRDD)
    assert_is_instance(unsized, BlockRDD)