def test_ndim(self):
    """ArrayRDD.ndim must equal the ndim of the array the RDD was built from."""
    flat = np.arange(4000)
    # The first entry is a plain int (a 1-D reshape); the others are tuples.
    candidate_shapes = [4000, (1000, 4), (200, 10, 2), (100, 10, 2, 2)]
    for target in candidate_shapes:
        local_array = flat.reshape(target)
        distributed = self.sc.parallelize(local_array)
        assert_equal(ArrayRDD(distributed).ndim, local_array.ndim)
def test_shape(self):
    """ArrayRDD.shape must report the shape the source array was reshaped to."""
    flat = np.arange(4000)
    for expected_shape in [(1000, 4), (200, 20), (100, 40), (2000, 2)]:
        local_array = flat.reshape(expected_shape)
        distributed = self.sc.parallelize(local_array)
        assert_equal(ArrayRDD(distributed).shape, expected_shape)
def test_limit_features(self):
    """SparkCountVectorizer must match CountVectorizer under df/feature limits.

    For every parameter set the vocabulary and the dense fit_transform
    output are compared, and then the output of a separate transform call.
    """
    X, X_rdd = self.make_text_rdd()
    parameter_sets = [
        dict(min_df=.5),
        dict(min_df=2, max_df=.9),
        dict(min_df=1, max_df=.6),
        dict(min_df=2, max_features=3),
    ]
    for kwargs in parameter_sets:
        reference = CountVectorizer(**kwargs)
        distributed = SparkCountVectorizer(**kwargs)

        dense_reference = reference.fit_transform(X).toarray()
        dense_distributed = distributed.fit_transform(X_rdd).toarray()
        assert_equal(reference.vocabulary_, distributed.vocabulary_)
        assert_array_equal(dense_reference, dense_distributed)

        # transform() on the fitted model must reproduce the same matrix.
        dense_distributed = distributed.transform(X_rdd).toarray()
        assert_array_equal(dense_reference, dense_distributed)
def test_auto_dtype(self):
    """DictRDD built from a list of RDDs must infer one dtype per column.

    The same checks run twice: with default columns (looked up by position)
    and with the columns named 'x', 'y' and 'z'.
    """
    x = np.arange(80).reshape((40, 2))
    y = tuple(range(40))
    z = list(range(40))
    n_partitions = 4
    x_rdd, y_rdd, z_rdd = (self.sc.parallelize(column, n_partitions)
                           for column in (x, y, z))

    expected = (np.arange(20).reshape(10, 2),
                tuple(range(10)),
                list(range(10)))
    # Note the expectation: the list-valued column is reported as tuple.
    column_types = (np.ndarray, tuple, tuple)

    cases = [
        ({}, (0, 1, 2)),
        ({'columns': ('x', 'y', 'z')}, ('x', 'y', 'z')),
    ]
    for extra_kwargs, keys in cases:
        rdd = DictRDD([x_rdd, y_rdd, z_rdd], **extra_kwargs)
        assert_tuple_equal(rdd.first(), expected)
        assert_equal(rdd.dtype, column_types)
        assert_true(check_rdd_dtype(rdd, dict(zip(keys, column_types))))
def test_shape(self):
    """Check ArrayRDD.shape against each shape the flat data is reshaped to."""
    values = np.arange(4000)
    expected_shapes = [(1000, 4), (200, 20), (100, 40), (2000, 2)]
    for wanted in expected_shapes:
        reshaped = values.reshape(wanted)
        wrapped = ArrayRDD(self.sc.parallelize(reshaped))
        assert_equal(wrapped.shape, wanted)
def test_size(self):
    """ArrayRDD.size must equal the element count of the source array.

    The count is checked two ways: by summing ``.size`` over the elements
    the ArrayRDD maps over, and through the ``size`` attribute itself.
    """
    flat = np.arange(4000)
    for target in [(1000, 4), (200, 20), (100, 40), (2000, 2)]:
        local_array = flat.reshape(target)
        distributed = self.sc.parallelize(local_array)

        mapped_sizes = ArrayRDD(distributed).map(lambda part: part.size)
        summed = mapped_sizes.sum()
        assert_equal(summed, local_array.size)

        assert_equal(ArrayRDD(distributed).size, local_array.size)
def test_unblock(self):
    """unblock() must give back the original 0..999 sequence.

    Exercised with the default block dtype and with ``dtype=tuple``.
    """
    wanted = list(range(1000))
    for block_kwargs in ({}, {'dtype': tuple}):
        blocked = BlockRDD(self.generate(1000, 5), **block_kwargs)
        flattened = blocked.unblock()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(flattened.collect(), wanted)
def test_same_output(self):
    """SparkDictVectorizer must agree with DictVectorizer on vocabulary and output."""
    X, X_rdd = self.make_dict_dataset()
    reference = DictVectorizer()
    distributed = SparkDictVectorizer()

    reference_matrix = reference.fit_transform(X)
    # Stack the collected pieces back into one sparse matrix.
    collected = distributed.fit_transform(X_rdd).collect()
    distributed_matrix = sp.vstack(collected)

    assert_equal(reference.vocabulary_, distributed.vocabulary_)
    assert_array_equal(reference_matrix.toarray(),
                       distributed_matrix.toarray())
def test_same_output(self):
    """SparkCountVectorizer must agree with CountVectorizer on default settings."""
    X, X_rdd = self.make_text_rdd()
    reference = CountVectorizer()
    distributed = SparkCountVectorizer()

    dense_reference = reference.fit_transform(X).toarray()
    dense_distributed = distributed.fit_transform(X_rdd).toarray()

    assert_equal(reference.vocabulary_, distributed.vocabulary_)
    assert_array_equal(dense_reference, dense_distributed)
def test_unblock(self):
    """unblock() must give back the original 0..999 sequence.

    Exercised with the default block dtype and with ``dtype=tuple``.
    """
    # Materialise the expectation: on Python 3 a bare range object never
    # compares equal to a list, so the comparison would always fail.
    expected = list(range(1000))

    blocked = BlockRDD(self.generate(1000, 5))
    unblocked = blocked.unblock()
    assert_is_instance(blocked, BlockRDD)
    assert_equal(unblocked.collect(), expected)

    blocked = BlockRDD(self.generate(1000, 5), dtype=tuple)
    unblocked = blocked.unblock()
    assert_is_instance(blocked, BlockRDD)
    assert_equal(unblocked.collect(), expected)
def test_blocks_number(self):
    """BlockRDD.blocks must match the expected block count for each setup."""
    # (arguments for self.generate, bsize, expected number of blocks)
    cases = [
        ((1000,), 50, 20),
        ((621,), 45, 20),
        ((100,), 4, 30),
        ((79, 2), 9, 10),
        ((89, 2), 5, 18),
    ]
    for generate_args, block_size, expected_blocks in cases:
        blocked = BlockRDD(self.generate(*generate_args), bsize=block_size)
        assert_equal(blocked.blocks, expected_blocks)
def test_length(self):
    """len() of a BlockRDD must equal the number of generated elements."""
    for n_items in (1000, 100, 79, 89, 62):
        blocked = BlockRDD(self.generate(n_items))
        assert_equal(len(blocked), n_items)
def test_same_output_sparse(self):
    """With sparse=True, SparkDictVectorizer must match DictVectorizer.

    Also checks that the distributed result is typed as scipy sparse
    matrices.
    """
    X, X_rdd = self.make_dict_dataset()
    reference = DictVectorizer(sparse=True)
    distributed = SparkDictVectorizer(sparse=True)

    reference_matrix = reference.fit_transform(X)
    distributed_result = distributed.fit_transform(X_rdd)

    is_sparse = check_rdd_dtype(distributed_result, (sp.spmatrix,))
    assert_true(is_sparse)
    assert_equal(reference.vocabulary_, distributed.vocabulary_)
    assert_array_equal(reference_matrix.toarray(),
                       distributed_result.toarray())
def test_same_output_sparse(self):
    """Compare sparse output of DictVectorizer and SparkDictVectorizer."""
    X, X_rdd = self.make_dict_dataset()
    local_model = DictVectorizer(sparse=True)
    spark_model = SparkDictVectorizer(sparse=True)

    local_out = local_model.fit_transform(X)
    spark_out = spark_model.fit_transform(X_rdd)

    # The distributed result must be made of scipy sparse matrices.
    expected_types = (sp.spmatrix,)
    assert_true(check_rdd_dtype(spark_out, expected_types))

    assert_equal(local_model.vocabulary_, spark_model.vocabulary_)
    assert_array_equal(local_out.toarray(), spark_out.toarray())
def test_sum(self):
    """ArrayRDD.sum must agree with ndarray.sum overall and along every axis.

    Covers a 2-D array (axes 0 and 1) and a 3-D array (axes 0, 1 and 2).
    """
    for n_values, shape in ((400, (100, 4)), (600, (100, 3, 2))):
        data = np.arange(n_values).reshape(shape)
        rdd = self.sc.parallelize(data)
        assert_equal(ArrayRDD(rdd).sum(), data.sum())
        for axis in range(data.ndim):
            assert_array_equal(ArrayRDD(rdd).sum(axis=axis),
                               data.sum(axis=axis))
def test_convert_tolist(self):
    """ArrayRDD.tolist() must return a plain list with the original values."""
    # Case 1: NumPy source data, 4 partitions, second ArrayRDD argument 5.
    numbers = np.arange(400)
    blocked = ArrayRDD(self.sc.parallelize(numbers, 4), 5)
    as_list = blocked.tolist()
    assert_is_instance(as_list, list)
    assert_equal(as_list, numbers.tolist())

    # Case 2: plain Python list source data, 2 partitions, default settings.
    numbers = [2, 3, 5, 1, 6, 7, 9, 9]
    blocked = ArrayRDD(self.sc.parallelize(numbers, 2))
    as_list = blocked.tolist()
    assert_is_instance(as_list, list)
    assert_equal(as_list, numbers)
def test_transform_with_dtype(self):
    """DictRDD.transform must record the dtype requested for the touched columns.

    Covers single-column transforms (with and without an explicit dtype)
    and two-column transforms given in both column orders.
    """
    left = np.arange(400).reshape((100, 4))
    right = np.arange(200).reshape((100, 2))
    left_rdd = self.sc.parallelize(left, 4)
    right_rdd = self.sc.parallelize(right, 4)
    X = DictRDD(left_rdd.zip(right_rdd), bsize=5)

    # No dtype given: both columns are still reported as ndarray.
    result = X.transform(lambda col: col ** 2, column=0)
    assert_equal(result.dtype, (np.ndarray, np.ndarray))

    # Column 0 converted to tuple.
    result = X.transform(lambda col: tuple((col ** 2).tolist()),
                         column=0, dtype=tuple)
    assert_equal(result.dtype, (tuple, np.ndarray))
    assert_true(check_rdd_dtype(result, {0: tuple, 1: np.ndarray}))

    # Column 1 declared as list.
    result = X.transform(lambda col: col ** 2, column=1, dtype=list)
    assert_equal(result.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))

    # Both columns at once, in natural order.
    result = X.transform(
        lambda first, second: (first ** 2, (second ** 0.5).tolist()),
        column=[0, 1], dtype=(np.ndarray, list))
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))

    # Both columns at once, in reversed order; the dtype tuple follows
    # the order of ``column``.
    result = X.transform(
        lambda second, first: ((second ** 0.5).tolist(), first ** 2),
        column=[1, 0], dtype=(list, np.ndarray))
    assert_equal(result.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))
def test_transform_with_dtype(self):
    """Check the dtype bookkeeping of DictRDD.transform for several calls."""
    wide = self.sc.parallelize(np.arange(400).reshape((100, 4)), 4)
    narrow = self.sc.parallelize(np.arange(200).reshape((100, 2)), 4)
    X = DictRDD(wide.zip(narrow), bsize=5)

    ndarray_then_list = (np.ndarray, list)
    ndarray_then_list_by_index = {0: np.ndarray, 1: list}

    # Squaring column 0 without a dtype leaves both columns as ndarray.
    out = X.transform(lambda a: a**2, column=0)
    assert_equal(out.dtype, (np.ndarray, np.ndarray))

    # Column 0 returned as a tuple and declared as such.
    out = X.transform(lambda a: tuple((a**2).tolist()),
                      column=0, dtype=tuple)
    assert_equal(out.dtype, (tuple, np.ndarray))
    assert_true(check_rdd_dtype(out, {0: tuple, 1: np.ndarray}))

    # Column 1 declared as list.
    out = X.transform(lambda a: a**2, column=1, dtype=list)
    assert_equal(out.dtype, ndarray_then_list)
    assert_true(check_rdd_dtype(out, ndarray_then_list_by_index))

    # Two columns in natural order (only check_rdd_dtype is asserted here).
    out = X.transform(lambda p, q: (p**2, (q**0.5).tolist()),
                      column=[0, 1], dtype=(np.ndarray, list))
    assert_true(check_rdd_dtype(out, ndarray_then_list_by_index))

    # Two columns in reversed order.
    out = X.transform(lambda q, p: ((q**0.5).tolist(), p**2),
                      column=[1, 0], dtype=(list, np.ndarray))
    assert_equal(out.dtype, ndarray_then_list)
    assert_true(check_rdd_dtype(out, ndarray_then_list_by_index))
def test_creation(self):
    """BlockRDD must wrap an RDD and group its elements into blocks.

    Checks the first block and the full contents with the default block
    size, then the first block and the block lengths with ``bsize=4``.
    """
    rdd = self.generate()

    blocked = BlockRDD(rdd)
    assert_is_instance(blocked, BlockRDD)
    # Compare against concrete lists: on Python 3 a bare range object is
    # never equal to a list, and the collect() check below expects lists.
    assert_equal(blocked.first(), list(range(10)))
    assert_equal(blocked.collect(),
                 np.arange(100).reshape(10, 10).tolist())

    blocked = BlockRDD(rdd, bsize=4)
    assert_is_instance(blocked, BlockRDD)
    assert_equal(blocked.first(), list(range(4)))
    assert_equal([len(x) for x in blocked.collect()], [4, 4, 2] * 10)
def test_limit_features(self):
    """SparkCountVectorizer must match CountVectorizer under df/feature limits.

    The distributed output is collected and stacked with ``sp.vstack``
    before being compared, for both fit_transform and transform.
    """
    X, X_rdd = self.make_text_rdd()
    parameter_sets = [
        dict(min_df=.5),
        dict(min_df=2, max_df=.9),
        dict(min_df=1, max_df=.6),
        dict(min_df=2, max_features=3),
    ]
    for kwargs in parameter_sets:
        reference = CountVectorizer(**kwargs)
        distributed = SparkCountVectorizer(**kwargs)

        reference_matrix = reference.fit_transform(X)
        fitted_pieces = distributed.fit_transform(X_rdd).collect()
        stacked = sp.vstack(fitted_pieces)
        assert_equal(reference.vocabulary_, distributed.vocabulary_)
        assert_array_equal(reference_matrix.toarray(), stacked.toarray())

        # A separate transform() call must reproduce the same matrix.
        transformed_pieces = distributed.transform(X_rdd).collect()
        stacked = sp.vstack(transformed_pieces)
        assert_array_equal(reference_matrix.toarray(), stacked.toarray())
def test_auto_dtype(self):
    """Check the dtypes DictRDD infers for an ndarray, a tuple and a list column."""
    array_source = np.arange(80).reshape((40, 2))
    tuple_source = tuple(range(40))
    list_source = list(range(40))
    array_rdd = self.sc.parallelize(array_source, 4)
    tuple_rdd = self.sc.parallelize(tuple_source, 4)
    list_rdd = self.sc.parallelize(list_source, 4)

    first_block = (
        np.arange(20).reshape(10, 2),
        tuple(range(10)),
        list(range(10)),
    )
    # The assertions expect the list-valued column to be reported as tuple.
    inferred = (np.ndarray, tuple, tuple)

    # Default columns: dtypes are looked up by position.
    unnamed = DictRDD([array_rdd, tuple_rdd, list_rdd])
    assert_tuple_equal(unnamed.first(), first_block)
    assert_equal(unnamed.dtype, inferred)
    by_position = {0: np.ndarray, 1: tuple, 2: tuple}
    assert_true(check_rdd_dtype(unnamed, by_position))

    # Named columns: dtypes are looked up by column name.
    named = DictRDD([array_rdd, tuple_rdd, list_rdd],
                    columns=('x', 'y', 'z'))
    assert_tuple_equal(named.first(), first_block)
    assert_equal(named.dtype, inferred)
    by_name = {'x': np.ndarray, 'y': tuple, 'z': tuple}
    assert_true(check_rdd_dtype(named, by_name))
def test_partition_number(self):
    """BlockRDD.partitions must equal the second argument given to generate()."""
    # (arguments for self.generate, BlockRDD keyword arguments, expected)
    cases = [
        ((1000, 5), {'bsize': 50}, 5),
        ((621, 3), {'bsize': 45}, 3),
        ((100, 10), {}, 10),
    ]
    for generate_args, block_kwargs, expected_partitions in cases:
        blocked = BlockRDD(self.generate(*generate_args), **block_kwargs)
        assert_equal(blocked.partitions, expected_partitions)
def test_creation(self):
    """BlockRDD must wrap an RDD and group its elements into tuple blocks.

    Checks the first block and the full contents with the default block
    size, then the first block and the block lengths with ``bsize=4``.
    """
    source = self.generate()

    # Default block size.
    blocked = BlockRDD(source)
    assert_is_instance(blocked, BlockRDD)
    assert_equal(blocked.first(), tuple(range(10)))
    all_rows = [tuple(row) for row in np.arange(100).reshape(10, 10)]
    assert_equal(blocked.collect(), all_rows)

    # Explicit block size of 4.
    blocked = BlockRDD(source, bsize=4)
    assert_is_instance(blocked, BlockRDD)
    assert_equal(blocked.first(), tuple(range(4)))
    block_lengths = [len(chunk) for chunk in blocked.collect()]
    assert_equal(block_lengths, [4, 4, 2] * 10)
def test_dtype(self):
    """block() with an explicit dtype must produce one block per partition.

    First with ``dtype=list`` over 10 partitions, then with ``dtype=tuple``
    over 17 partitions, which do not divide the 100 samples evenly.
    """
    n_samples = 100

    # dtype=list, 10 partitions of 10 samples each.
    n_partitions = 10
    words = self.sc.parallelize(["lorem"] * n_samples, n_partitions)
    blocked = block(words, dtype=list)
    full_block = ["lorem"] * 10
    assert_array_equal(full_block, blocked.first())
    collected = blocked.collect()
    assert_equal(len(collected), n_partitions)
    assert_array_equal(full_block, collected[-1])
    assert_equal(sum(len(chunk) for chunk in collected), n_samples)

    # dtype=tuple, 17 partitions.
    n_partitions = 17
    ones = self.sc.parallelize([1] * n_samples, n_partitions)
    blocked = block(ones, dtype=tuple)
    first_expected = (1,) * (n_samples // n_partitions)
    assert_array_equal(first_expected, blocked.first())
    collected = blocked.collect()
    assert_equal(len(collected), n_partitions)
    assert_equal(sum(len(chunk) for chunk in collected), n_samples)
def test_array(self):
    """block() over an RDD of 1-element arrays must stack them per partition.

    Run with 10 partitions, then with 17 partitions, which do not divide
    the 100 samples evenly.
    """
    n_samples = 100

    # 10 partitions of 10 samples each.
    n_partitions = 10
    samples = [np.array([1]) for _ in range(n_samples)]
    blocked = block(self.sc.parallelize(samples, n_partitions))
    full_block = np.ones((10, 1))
    assert_array_equal(full_block, blocked.first())
    collected = blocked.collect()
    assert_equal(len(collected), n_partitions)
    assert_array_equal(np.ones((10, 1)), collected[-1])
    assert_equal(sum(len(chunk) for chunk in collected), n_samples)

    # 17 partitions.
    n_partitions = 17
    samples = [np.array([1]) for _ in range(n_samples)]
    blocked = block(self.sc.parallelize(samples, n_partitions))
    first_rows = n_samples // n_partitions
    assert_array_equal(np.ones((first_rows, 1)), blocked.first())
    collected = blocked.collect()
    assert_equal(len(collected), n_partitions)
    assert_equal(sum(len(chunk) for chunk in collected), n_samples)
def test_dtype(self):
    """block() with an explicit dtype must produce one block per partition.

    First with ``dtype=list`` over 10 partitions, then with ``dtype=tuple``
    over 17 partitions, which do not divide the 100 samples evenly.
    """
    n_partitions = 10
    n_samples = 100
    data = self.sc.parallelize(["lorem" for i in range(n_samples)],
                               n_partitions)
    blocked_data = block(data, dtype=list)
    assert_array_equal(["lorem"] * 10, blocked_data.first())
    blocks = blocked_data.collect()
    assert_equal(len(blocks), n_partitions)
    assert_array_equal(["lorem"] * 10, blocks[-1])
    assert_equal(sum(len(b) for b in blocks), n_samples)

    n_partitions = 17
    data = self.sc.parallelize([1 for i in range(n_samples)],
                               n_partitions)
    blocked_data = block(data, dtype=tuple)
    # Floor division: on Python 3 `/` yields a float, and multiplying a
    # list by a float raises TypeError.
    assert_array_equal(tuple([1] * (n_samples // n_partitions)),
                       blocked_data.first())
    blocks = blocked_data.collect()
    assert_equal(len(blocks), n_partitions)
    assert_equal(sum(len(b) for b in blocks), n_samples)
def test_array(self):
    """block() over an RDD of 1-element arrays must stack them per partition.

    Run with 10 partitions, then with 17 partitions, which do not divide
    the 100 samples evenly.
    """
    n_partitions = 10
    n_samples = 100
    data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                               n_partitions)
    blocked_data = block(data)
    assert_array_equal(np.ones((10, 1)), blocked_data.first())
    blocks = blocked_data.collect()
    assert_equal(len(blocks), n_partitions)
    assert_array_equal(np.ones((10, 1)), blocks[-1])
    assert_equal(sum(len(b) for b in blocks), n_samples)

    n_partitions = 17
    data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                               n_partitions)
    blocked_data = block(data)
    # Floor division: on Python 3 `/` yields a float, which NumPy rejects
    # as an array dimension.
    assert_array_equal(np.ones((n_samples // n_partitions, 1)),
                       blocked_data.first())
    blocks = blocked_data.collect()
    assert_equal(len(blocks), n_partitions)
    assert_equal(sum(len(b) for b in blocks), n_samples)
def test_tolist(self):
    """BlockRDD.tolist() must give back the original 0..999 sequence.

    Exercised with the default block dtype, ``dtype=tuple`` and
    ``dtype=np.array``.
    """
    # Materialise the expectation: on Python 3 a bare range object never
    # compares equal to a list, so the comparison would always fail.
    expected = list(range(1000))

    blocked = BlockRDD(self.generate(1000, 5))
    unblocked = blocked.tolist()
    assert_is_instance(blocked, BlockRDD)
    assert_equal(unblocked, expected)

    blocked = BlockRDD(self.generate(1000, 5), dtype=tuple)
    unblocked = blocked.tolist()
    assert_is_instance(blocked, BlockRDD)
    assert_equal(unblocked, expected)

    blocked = BlockRDD(self.generate(1000, 5), dtype=np.array)
    unblocked = blocked.tolist()
    assert_is_instance(blocked, BlockRDD)
    assert_equal(unblocked, expected)
def test_tolist(self):
    """BlockRDD.tolist() must give back the original 0..999 sequence.

    Exercised with the default block dtype, ``dtype=tuple`` and
    ``dtype=np.array``.
    """
    for block_kwargs in ({}, {'dtype': tuple}, {'dtype': np.array}):
        blocked = BlockRDD(self.generate(1000, 5), **block_kwargs)
        flattened = blocked.tolist()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(flattened, list(range(1000)))
def test_block_rdd_tuple(self):
    """block() over an RDD of tuples must block each tuple position separately.

    Every record is (dense 2-vector, integer 0, 1x5 sparse row). The first
    block is compared position by position, then the number of blocks and
    the total lengths of the first two positions are checked.
    """
    n_partitions = 10
    n_samples = 100
    sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
    data = self.sc.parallelize(
        [(np.array([1., 2.]), 0, sparse_row) for i in range(n_samples)],
        n_partitions)
    blocked_data = block(data)

    expected_first_block = np.array([[1., 2.]] * 10)
    # Builtin int replaces the np.int alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; it selects the same default dtype.
    expected_second_block = np.zeros(10, dtype=int)
    expected_third_block = sp.vstack([sparse_row] * 10)

    first_block_tuple = blocked_data.first()
    assert_array_almost_equal(expected_first_block, first_block_tuple[0])
    assert_array_almost_equal(expected_second_block, first_block_tuple[1])
    assert_array_almost_equal(expected_third_block.toarray(),
                              first_block_tuple[2].toarray())

    tuple_blocks = blocked_data.collect()
    assert_equal(len(tuple_blocks), n_partitions)
    assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples)
    assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples)
def test_block_rdd_tuple(self):
    """Check that block() groups each position of tuple records separately.

    Records are (dense 2-vector, integer 0, 1x5 sparse row) triples spread
    over 10 partitions.
    """
    n_partitions = 10
    n_samples = 100
    sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
    records = [(np.array([1., 2.]), 0, sparse_row)
               for i in range(n_samples)]
    data = self.sc.parallelize(records, n_partitions)
    blocked_data = block(data)

    # Expected content of the first block, one entry per tuple position.
    expected_first_block = np.array([[1., 2.]] * 10)
    # np.int was only an alias of the builtin int; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so the builtin is used directly.
    expected_second_block = np.zeros(10, dtype=int)
    expected_third_block = sp.vstack([sparse_row] * 10)

    first_block_tuple = blocked_data.first()
    assert_array_almost_equal(expected_first_block, first_block_tuple[0])
    assert_array_almost_equal(expected_second_block, first_block_tuple[1])
    assert_array_almost_equal(expected_third_block.toarray(),
                              first_block_tuple[2].toarray())

    # One block per partition, and no sample lost in the first two positions.
    tuple_blocks = blocked_data.collect()
    assert_equal(len(tuple_blocks), n_partitions)
    assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples)
    assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples)
def test_blocks_number(self):
    """ArrayRDD.blocks must match the expected count for each block size.

    The source RDD holds 1000 two-element arrays over 10 partitions.
    """
    n_partitions = 10
    n_samples = 1000
    samples = [np.array([1, 2]) for _ in range(n_samples)]
    rdd = self.sc.parallelize(samples, n_partitions)

    # Constructed with noblock=True and bsize=1.
    assert_equal(1000, ArrayRDD(rdd, noblock=True, bsize=1).blocks)
    # Default block size.
    assert_equal(10, ArrayRDD(rdd).blocks)

    # (block size passed positionally, expected number of blocks)
    sized_cases = [
        (50, 20),
        (66, 20),
        (100, 10),
        (300, 10),
        (5, 200),
        (10, 100),
    ]
    for block_size, expected_blocks in sized_cases:
        assert_equal(expected_blocks, ArrayRDD(rdd, block_size).blocks)
def test_mean(self):
    """ArrayRDD.mean must agree with ndarray.mean overall and on axes 0 and 1."""
    cube = np.arange(600).reshape((100, 3, 2))
    distributed = self.sc.parallelize(cube)

    assert_equal(ArrayRDD(distributed).mean(), cube.mean())
    for axis in (0, 1):
        assert_array_equal(ArrayRDD(distributed).mean(axis=axis),
                           cube.mean(axis=axis))
def test_partitions_number(self):
    """ArrayRDD.partitions must equal the partition count of the source RDD.

    Checked for 4 and 7 partitions, each with second ArrayRDD arguments
    5, 10 and 20.
    """
    for n_partitions in (4, 7):
        table = np.arange(400).reshape((100, 4))
        source = self.sc.parallelize(table, n_partitions)
        for block_size in (5, 10, 20):
            assert_equal(ArrayRDD(source, block_size).partitions,
                         n_partitions)