def test_true_divide(self):
    """Compare ``true_divide`` on the distributed array with NumPy ``/``."""
    lhs, lhs_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used as the divisor.
    rhs, _ = self.make_dense_rdd((1, 3))
    expected = lhs / rhs
    actual = lhs_rdd.true_divide(rhs).toarray()
    assert_array_equal(actual, expected)
def test_unblocking_rdd(self):
    """``unblock`` must return a plain ``RDD`` holding the raw values."""
    values = np.arange(400)
    blocked = ArrayRDD(self.sc.parallelize(values, 4), 5)
    flat = blocked.unblock()
    assert_is_instance(flat, RDD)
    # The first twelve unblocked items should be 0..11 in order.
    leading = flat.take(12)
    assert_array_equal(leading, np.arange(12).tolist())
def test_remainder(self):
    """Compare ``remainder`` on the distributed array with ``np.remainder``."""
    lhs, lhs_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    rhs, _ = self.make_dense_rdd((1, 3))
    expected = np.remainder(lhs, rhs)
    actual = lhs_rdd.remainder(rhs).toarray()
    assert_array_equal(actual, expected)
def test_same_classes(self):
    """Local and Spark label encoders must learn the same ``classes_``."""
    labels, labels_rdd = self.make_dense_randint_rdd(
        low=0, high=10, shape=(1000,))
    local_encoder = LabelEncoder().fit(labels)
    spark_encoder = SparkLabelEncoder().fit(labels_rdd)
    assert_array_equal(local_encoder.classes_, spark_encoder.classes_)
def test_same_fit_transform(self):
    """``fit_transform`` must agree between local and Spark encoders."""
    labels, labels_rdd = self.make_dense_randint_rdd(
        low=0, high=10, shape=(1000,))
    local_encoder = LabelEncoder()
    spark_encoder = SparkLabelEncoder()
    expected = local_encoder.fit_transform(labels)
    actual = spark_encoder.fit_transform(labels_rdd).toarray()
    assert_array_equal(expected, actual)
def test_same_classes(self):
    """Fitted ``classes_`` must match between ``LabelEncoder`` variants."""
    y_local, y_rdd = self.make_dense_randint_rdd(
        low=0, high=10, shape=(1000,))
    reference = LabelEncoder().fit(y_local)
    distributed = SparkLabelEncoder().fit(y_rdd)
    assert_array_equal(reference.classes_, distributed.classes_)
def test_limit_features(self):
    """Feature-limiting options must act alike on both count vectorizers.

    For each parameter set the learned vocabulary and the transformed
    matrix are compared, both after ``fit_transform`` and after a
    subsequent ``transform`` on the fitted Spark vectorizer.
    """
    docs, docs_rdd = self.make_text_rdd()
    param_sets = (
        {'min_df': .5},
        {'min_df': 2, 'max_df': .9},
        {'min_df': 1, 'max_df': .6},
        {'min_df': 2, 'max_features': 3},
    )
    for kwargs in param_sets:
        local = CountVectorizer(**kwargs)
        dist = SparkCountVectorizer(**kwargs)

        expected = local.fit_transform(docs).toarray()
        fitted = dist.fit_transform(docs_rdd).toarray()
        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(expected, fitted)

        # Re-using the fitted vectorizer must give the same matrix.
        transformed = dist.transform(docs_rdd).toarray()
        assert_array_equal(expected, transformed)
def test_fmod(self):
    """Compare ``fmod`` on the distributed array with ``np.fmod``."""
    lhs, lhs_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    rhs, _ = self.make_dense_rdd((1, 3))
    expected = np.fmod(lhs, rhs)
    actual = lhs_rdd.fmod(rhs).toarray()
    assert_array_equal(actual, expected)
def test_same_fit_transform(self):
    """Encoded labels must match between local and Spark encoders."""
    y_local, y_rdd = self.make_dense_randint_rdd(
        low=0, high=10, shape=(1000,))
    reference = LabelEncoder()
    distributed = SparkLabelEncoder()
    encoded_local = reference.fit_transform(y_local)
    encoded_dist = distributed.fit_transform(y_rdd).toarray()
    assert_array_equal(encoded_local, encoded_dist)
def test_transform(self):
    """``transform(fn)`` must equal applying ``fn`` to each collected block."""
    _, dense_rdd = self.make_dense_rdd((100, 4))

    def square(block):
        return block**2

    expected = [square(block) for block in dense_rdd.collect()]
    actual = dense_rdd.transform(square).collect()
    assert_array_equal(expected, actual)
def test_same_output(self):
    """Hashing vectorizers must produce identical matrices.

    The Spark result is collected block by block and stacked vertically
    before the dense comparison.
    """
    docs, docs_rdd = self.make_text_rdd()
    local = HashingVectorizer()
    dist = SparkHashingVectorizer()

    expected = local.transform(docs)
    blocks = dist.transform(docs_rdd).collect()
    stacked = sp.vstack(blocks)
    assert_array_equal(expected.toarray(), stacked.toarray())
def test_transform(self):
    """Block-wise ``transform`` must match mapping over collected blocks."""
    _, X_rdd = self.make_dense_rdd((100, 4))

    def squared(chunk):
        return chunk ** 2

    mapped_locally = [squared(chunk) for chunk in X_rdd.collect()]
    mapped_remotely = X_rdd.transform(squared).collect()
    assert_array_equal(mapped_locally, mapped_remotely)
def test_same_output(self):
    """Local and Spark hashing vectorizers must give the same dense output."""
    docs, docs_rdd = self.make_text_rdd()
    reference = HashingVectorizer()
    distributed = SparkHashingVectorizer()

    expected = reference.transform(docs).toarray()
    actual = distributed.transform(docs_rdd).toarray()
    assert_array_equal(expected, actual)
def test_same_output(self):
    """Count vectorizers must agree on both vocabulary and counts."""
    docs, docs_rdd = self.make_text_rdd()
    reference = CountVectorizer()
    distributed = SparkCountVectorizer()

    expected = reference.fit_transform(docs).toarray()
    actual = distributed.fit_transform(docs_rdd).toarray()

    assert_equal(reference.vocabulary_, distributed.vocabulary_)
    assert_array_equal(expected, actual)
def test_transform(self):
    """``ArrayRDD.transform(fn)`` must equal ``fn`` mapped over the blocks."""
    data = np.arange(400).reshape((100, 4))
    rdd = self.sc.parallelize(data, 4)
    X = ArrayRDD(rdd, 5)

    fn = lambda x: x ** 2
    # Materialize the expected blocks: on Python 3 ``map`` returns a lazy
    # iterator, which ``assert_array_equal`` cannot compare element-wise.
    X1 = list(map(fn, X.collect()))
    X2 = X.transform(fn).collect()

    assert_array_equal(X1, X2)
def test_same_output(self):
    """Dict vectorizers must agree on vocabulary and on the feature matrix.

    The Spark result is collected block by block and stacked vertically
    before the dense comparison.
    """
    records, records_rdd = self.make_dict_dataset()
    reference = DictVectorizer()
    distributed = SparkDictVectorizer()

    expected = reference.fit_transform(records)
    blocks = distributed.fit_transform(records_rdd).collect()
    stacked = sp.vstack(blocks)

    assert_equal(reference.vocabulary_, distributed.vocabulary_)
    assert_array_equal(expected.toarray(), stacked.toarray())
def test_same_inverse_transform(self):
    """``inverse_transform`` must agree between local and Spark encoders."""
    # NOTE(review): sibling tests call this helper with low=/high=/shape=
    # keywords; confirm the positional-shape/low_high form is still valid.
    labels, labels_rdd = self.make_dense_randint_rdd(
        (1000,), low_high=(0, 10))
    local_encoder = LabelEncoder().fit(labels)
    spark_encoder = SparkLabelEncoder().fit(labels_rdd)

    expected = local_encoder.inverse_transform(labels)
    actual = spark_encoder.inverse_transform(labels_rdd).toarray()
    assert_array_equal(expected, actual)
def test_same_output_sparse(self):
    """Sparse dict vectorizers must agree, and Spark blocks stay sparse."""
    records, records_rdd = self.make_dict_dataset()
    reference = DictVectorizer(sparse=True)
    distributed = SparkDictVectorizer(sparse=True)

    expected = reference.fit_transform(records)
    actual = distributed.fit_transform(records_rdd)

    # The distributed blocks are expected to be scipy sparse matrices.
    assert_true(check_rdd_dtype(actual, (sp.spmatrix,)))
    assert_equal(reference.vocabulary_, distributed.vocabulary_)
    assert_array_equal(expected.toarray(), actual.toarray())
def test_same_output_sparse(self):
    """With ``sparse=True`` both dict vectorizers must give equal output."""
    samples, samples_rdd = self.make_dict_dataset()
    local_vec = DictVectorizer(sparse=True)
    spark_vec = SparkDictVectorizer(sparse=True)

    local_matrix = local_vec.fit_transform(samples)
    spark_matrix = spark_vec.fit_transform(samples_rdd)

    # Every block of the Spark result must be a scipy sparse matrix.
    blocks_are_sparse = check_rdd_dtype(spark_matrix, (sp.spmatrix,))
    assert_true(blocks_are_sparse)
    assert_equal(local_vec.vocabulary_, spark_vec.vocabulary_)
    assert_array_equal(local_matrix.toarray(), spark_matrix.toarray())
def test_convert_toarray(self):
    """``toarray`` must reassemble the original data from the blocks."""
    # NumPy input, explicit block size of 5.
    values = np.arange(400)
    blocked = ArrayRDD(self.sc.parallelize(values, 4), 5)
    assert_array_equal(blocked.toarray(), values)

    # Plain Python list input, default block size.
    values = [2, 3, 5, 1, 6, 7, 9, 9]
    blocked = ArrayRDD(self.sc.parallelize(values, 2))
    assert_array_equal(blocked.toarray(), np.array(values))
def test_sum(self):
    """``ArrayRDD.sum`` must match ``ndarray.sum`` overall and per axis."""
    # Two-dimensional input: total plus both axes.
    data = np.arange(400).reshape((100, 4))
    rdd = self.sc.parallelize(data)
    assert_equal(ArrayRDD(rdd).sum(), data.sum())
    for axis in (0, 1):
        assert_array_equal(ArrayRDD(rdd).sum(axis=axis),
                           data.sum(axis=axis))

    # Three-dimensional input: total plus all three axes.
    data = np.arange(600).reshape((100, 3, 2))
    rdd = self.sc.parallelize(data)
    assert_equal(ArrayRDD(rdd).sum(), data.sum())
    for axis in (0, 1, 2):
        assert_array_equal(ArrayRDD(rdd).sum(axis=axis),
                           data.sum(axis=axis))
def test_dummy_analyzer(self):
    """Hashing vectorizers must agree when fed pre-tokenized documents.

    Documents are split into tokens up front and an identity analyzer is
    used, so the vectorizers hash the given tokens directly. Both
    ``transform`` and ``fit_transform`` are compared; the Spark result is
    collected and stacked vertically first.
    """
    X, X_rdd = self.make_text_rdd()

    def splitter(x):
        return x.split()

    # Materialize the token lists: on Python 3 ``map`` returns a one-shot
    # iterator, which would be exhausted after the first ``transform`` call
    # and leave ``fit_transform`` with empty input.
    X = list(map(splitter, X))
    X_rdd = X_rdd.map(lambda x: list(map(splitter, x)))

    local = HashingVectorizer(analyzer=lambda x: x)
    dist = SparkHashingVectorizer(analyzer=lambda x: x)

    result_local = local.transform(X)
    result_dist = sp.vstack(dist.transform(X_rdd).collect())
    assert_array_equal(result_local.toarray(), result_dist.toarray())

    result_local = local.fit_transform(X)
    result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())
    assert_array_equal(result_local.toarray(), result_dist.toarray())
def test_same_fit_transforms(self):
    """Local and Spark truncated SVD must give comparable projections.

    Shapes must match exactly; the first component is compared with a
    loose absolute tolerance and is accepted with either sign.
    """
    dense, dense_rdd = self.make_dense_rdd((1e3, 12))

    shared = dict(tol=1e-7, random_state=42)
    local = TruncatedSVD(4, n_iter=5, **shared)
    dist = SparkTruncatedSVD(4, n_iter=50, **shared)

    projected_local = local.fit_transform(dense)
    projected_dist = dist.fit_transform(dense_rdd).toarray()

    atol = 1e-1
    assert_array_equal(projected_local.shape, projected_dist.shape)

    first_local = projected_local[:, 0]
    same_sign = np.allclose(+projected_dist[:, 0], first_local, atol=atol)
    flipped = np.allclose(-projected_dist[:, 0], first_local, atol=atol)
    assert same_sign or flipped
def test_get_multiple_items(self):
    """Index a ``DictRDD`` with slices and lists of blocks and columns."""
    features = np.arange(80).reshape((40, 2))
    targets = np.arange(40)
    zipped = self.sc.parallelize(features, 2).zip(
        self.sc.parallelize(targets, 2))
    z = DictRDD(zipped, bsize=5)

    # Expected (x, y) contents of the first two blocks of five rows each.
    block0 = (np.arange(0, 10).reshape((5, 2)), np.arange(0, 5))
    block1 = (np.arange(10, 20).reshape((5, 2)), np.arange(5, 10))

    # A scalar column index yields bare arrays ...
    assert_array_equal(z[:2, 1].collect(), [block0[1], block1[1]])
    assert_array_equal(z[[0, 1], 0].collect(), [block0[0], block1[0]])

    # ... while list or slice column indices yield tuples.
    assert_multiple_tuples_equal(z[[0, 1], [1]].collect(),
                                 [(block0[1],), (block1[1],)])
    assert_multiple_tuples_equal(z[[0, 1], -1:].collect(),
                                 [(block0[1],), (block1[1],)])
    # Both blocks and columns requested in reversed order.
    assert_multiple_tuples_equal(z[[1, 0], [1, 0]].collect(),
                                 [block1[::-1], block0[::-1]])
def test_dummy_analyzer(self):
    """Hashing vectorizers must agree when fed pre-tokenized documents.

    Documents are split into token lists up front and an identity
    analyzer is used; ``transform`` and ``fit_transform`` are compared.
    """
    docs, docs_rdd = self.make_text_rdd()

    def splitter(x):
        return x.split()

    tokens = [splitter(doc) for doc in docs]
    tokens_rdd = docs_rdd.map(
        lambda block: [splitter(doc) for doc in block])

    local = HashingVectorizer(analyzer=lambda x: x)
    dist = SparkHashingVectorizer(analyzer=lambda x: x)

    expected = local.transform(tokens).toarray()
    actual = dist.transform(tokens_rdd).toarray()
    assert_array_equal(expected, actual)

    expected = local.fit_transform(tokens).toarray()
    actual = dist.fit_transform(tokens_rdd).toarray()
    assert_array_equal(expected, actual)
def test_limit_features(self):
    """Feature-limiting options must act alike on both count vectorizers.

    Spark results are collected block by block and stacked vertically
    before being compared with the local matrix.
    """
    docs, docs_rdd = self.make_text_rdd()
    param_sets = (
        {'min_df': .5},
        {'min_df': 2, 'max_df': .9},
        {'min_df': 1, 'max_df': .6},
        {'min_df': 2, 'max_features': 3},
    )
    for kwargs in param_sets:
        local = CountVectorizer(**kwargs)
        dist = SparkCountVectorizer(**kwargs)

        expected = local.fit_transform(docs)
        fitted = sp.vstack(dist.fit_transform(docs_rdd).collect())
        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(expected.toarray(), fitted.toarray())

        # Re-using the fitted vectorizer must give the same matrix.
        transformed = sp.vstack(dist.transform(docs_rdd).collect())
        assert_array_equal(expected.toarray(), transformed.toarray())
def test_same_fit_transforms(self):
    """Local and Spark truncated SVD must give comparable projections.

    Also checks that the Spark result is made of ``np.ndarray`` blocks.
    The first component is compared with a loose absolute tolerance and
    is accepted with either sign.
    """
    dense, dense_rdd = self.make_dense_rdd((1e3, 12))

    shared = dict(tol=1e-7, random_state=42)
    local = TruncatedSVD(4, n_iter=5, **shared)
    dist = SparkTruncatedSVD(4, n_iter=50, **shared)

    projected_local = local.fit_transform(dense)
    projected_rdd = dist.fit_transform(dense_rdd)
    projected_dist = projected_rdd.toarray()
    assert_true(check_rdd_dtype(projected_rdd, (np.ndarray,)))

    atol = 1e-1
    assert_array_equal(projected_local.shape, projected_dist.shape)

    first_local = projected_local[:, 0]
    same_sign = np.allclose(+projected_dist[:, 0], first_local, atol=atol)
    flipped = np.allclose(-projected_dist[:, 0], first_local, atol=atol)
    assert same_sign or flipped
def test_get_multiple_items(self):
    """Select several blocks and columns of a ``DictRDD`` at once."""
    xs = np.arange(80).reshape((40, 2))
    ys = np.arange(40)
    pairs = self.sc.parallelize(xs, 2).zip(self.sc.parallelize(ys, 2))
    z = DictRDD(pairs, bsize=5)

    # Expected (x, y) contents of the first two blocks of five rows each.
    first = (np.arange(0, 10).reshape((5, 2)), np.arange(0, 5))
    second = (np.arange(10, 20).reshape((5, 2)), np.arange(5, 10))

    # Scalar column index: bare arrays come back.
    assert_array_equal(z[:2, 1].collect(), [first[1], second[1]])
    assert_array_equal(z[[0, 1], 0].collect(), [first[0], second[0]])

    # List or slice column index: one-element tuples come back.
    y_only = [(first[1],), (second[1],)]
    assert_multiple_tuples_equal(z[[0, 1], [1]].collect(), y_only)
    assert_multiple_tuples_equal(z[[0, 1], -1:].collect(), y_only)

    # Both blocks and columns requested in reversed order.
    assert_multiple_tuples_equal(z[[1, 0], [1, 0]].collect(),
                                 [second[::-1], first[::-1]])
def test_subtract(self):
    """Subtraction must match NumPy via method, ``-`` and ``-=``."""
    lhs, lhs_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    rhs, _ = self.make_dense_rdd((1, 3))
    expected = lhs - rhs

    assert_array_equal(lhs_rdd.subtract(rhs).toarray(), expected)
    assert_array_equal((lhs_rdd - rhs).toarray(), expected)

    # The augmented assignment goes last because it rebinds ``lhs_rdd``.
    lhs_rdd -= rhs
    assert_array_equal(lhs_rdd.toarray(), expected)
def test_multiply(self):
    """Multiplication must match NumPy via method, ``*`` and ``*=``."""
    lhs, lhs_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    rhs, _ = self.make_dense_rdd((1, 3))
    expected = lhs * rhs

    assert_array_equal(lhs_rdd.multiply(rhs).toarray(), expected)
    assert_array_equal((lhs_rdd * rhs).toarray(), expected)

    # The augmented assignment goes last because it rebinds ``lhs_rdd``.
    lhs_rdd *= rhs
    assert_array_equal(lhs_rdd.toarray(), expected)
def test_power(self):
    """Exponentiation must match NumPy via method, ``**`` and ``**=``."""
    base, base_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    exponent, _ = self.make_dense_rdd((1, 3))
    expected = base**exponent

    assert_array_equal(base_rdd.power(exponent).toarray(), expected)
    assert_array_equal((base_rdd**exponent).toarray(), expected)

    # The augmented assignment goes last because it rebinds ``base_rdd``.
    base_rdd **= exponent
    assert_array_equal(base_rdd.toarray(), expected)
def test_floor_divide(self):
    """Floor division must match NumPy via method, ``//`` and ``//=``."""
    lhs, lhs_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    rhs, _ = self.make_dense_rdd((1, 3))
    expected = lhs // rhs

    assert_array_equal(lhs_rdd.floor_divide(rhs).toarray(), expected)
    assert_array_equal((lhs_rdd // rhs).toarray(), expected)

    # The augmented assignment goes last because it rebinds ``lhs_rdd``.
    lhs_rdd //= rhs
    assert_array_equal(lhs_rdd.toarray(), expected)
def test_add(self):
    """Addition must match NumPy via method, ``+`` and ``+=``."""
    lhs, lhs_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    rhs, _ = self.make_dense_rdd((1, 3))
    expected = lhs + rhs

    assert_array_equal(lhs_rdd.add(rhs).toarray(), expected)
    assert_array_equal((lhs_rdd + rhs).toarray(), expected)

    # The augmented assignment goes last because it rebinds ``lhs_rdd``.
    lhs_rdd += rhs
    assert_array_equal(lhs_rdd.toarray(), expected)
def test_mod(self):
    """Modulo must match NumPy via method, ``%`` and ``%=``."""
    lhs, lhs_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    rhs, _ = self.make_dense_rdd((1, 3))
    expected = lhs % rhs

    assert_array_equal(lhs_rdd.mod(rhs).toarray(), expected)
    assert_array_equal((lhs_rdd % rhs).toarray(), expected)

    # The augmented assignment goes last because it rebinds ``lhs_rdd``.
    lhs_rdd %= rhs
    assert_array_equal(lhs_rdd.toarray(), expected)
def test_floor_divide(self):
    """Check ``floor_divide``, ``//`` and ``//=`` against NumPy ``//``."""
    dividend, dividend_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    divisor, _ = self.make_dense_rdd((1, 3))
    expected = dividend // divisor

    via_method = dividend_rdd.floor_divide(divisor).toarray()
    assert_array_equal(via_method, expected)

    via_operator = (dividend_rdd // divisor).toarray()
    assert_array_equal(via_operator, expected)

    # The in-place form rebinds ``dividend_rdd``, so it is checked last.
    dividend_rdd //= divisor
    assert_array_equal(dividend_rdd.toarray(), expected)
def test_add(self):
    """Check ``add``, ``+`` and ``+=`` against NumPy ``+``."""
    left, left_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    right, _ = self.make_dense_rdd((1, 3))
    expected = left + right

    via_method = left_rdd.add(right).toarray()
    assert_array_equal(via_method, expected)

    via_operator = (left_rdd + right).toarray()
    assert_array_equal(via_operator, expected)

    # The in-place form rebinds ``left_rdd``, so it is checked last.
    left_rdd += right
    assert_array_equal(left_rdd.toarray(), expected)
def test_multiply(self):
    """Check ``multiply``, ``*`` and ``*=`` against NumPy ``*``."""
    left, left_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    right, _ = self.make_dense_rdd((1, 3))
    expected = left * right

    via_method = left_rdd.multiply(right).toarray()
    assert_array_equal(via_method, expected)

    via_operator = (left_rdd * right).toarray()
    assert_array_equal(via_operator, expected)

    # The in-place form rebinds ``left_rdd``, so it is checked last.
    left_rdd *= right
    assert_array_equal(left_rdd.toarray(), expected)
def test_subtract(self):
    """Check ``subtract``, ``-`` and ``-=`` against NumPy ``-``."""
    left, left_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    right, _ = self.make_dense_rdd((1, 3))
    expected = left - right

    via_method = left_rdd.subtract(right).toarray()
    assert_array_equal(via_method, expected)

    via_operator = (left_rdd - right).toarray()
    assert_array_equal(via_operator, expected)

    # The in-place form rebinds ``left_rdd``, so it is checked last.
    left_rdd -= right
    assert_array_equal(left_rdd.toarray(), expected)
def test_power(self):
    """Check ``power``, ``**`` and ``**=`` against NumPy ``**``."""
    base, base_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    exponent, _ = self.make_dense_rdd((1, 3))
    expected = base ** exponent

    via_method = base_rdd.power(exponent).toarray()
    assert_array_equal(via_method, expected)

    via_operator = (base_rdd ** exponent).toarray()
    assert_array_equal(via_operator, expected)

    # The in-place form rebinds ``base_rdd``, so it is checked last.
    base_rdd **= exponent
    assert_array_equal(base_rdd.toarray(), expected)
def test_mod(self):
    """Check ``mod``, ``%`` and ``%=`` against NumPy ``%``."""
    left, left_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    right, _ = self.make_dense_rdd((1, 3))
    expected = left % right

    via_method = left_rdd.mod(right).toarray()
    assert_array_equal(via_method, expected)

    via_operator = (left_rdd % right).toarray()
    assert_array_equal(via_operator, expected)

    # The in-place form rebinds ``left_rdd``, so it is checked last.
    left_rdd %= right
    assert_array_equal(left_rdd.toarray(), expected)
def test_dtype(self):
    """``block`` must honour ``dtype=list`` and ``dtype=tuple``.

    Checks the content of the first block, the number of blocks (one per
    partition) and that no sample is lost, for an evenly and an unevenly
    partitioned input.
    """
    n_samples = 100

    # 100 strings over 10 partitions: every block holds 10 items.
    n_partitions = 10
    words = self.sc.parallelize(["lorem"] * n_samples, n_partitions)
    blocked = block(words, dtype=list)
    assert_array_equal(["lorem"] * 10, blocked.first())
    chunks = blocked.collect()
    assert_equal(len(chunks), n_partitions)
    assert_array_equal(["lorem"] * 10, chunks[-1])
    assert_equal(sum(len(chunk) for chunk in chunks), n_samples)

    # 100 ints over 17 partitions: the first block holds 100 // 17 items.
    n_partitions = 17
    ones = self.sc.parallelize([1] * n_samples, n_partitions)
    blocked = block(ones, dtype=tuple)
    first_block_len = n_samples // n_partitions
    assert_array_equal((1,) * first_block_len, blocked.first())
    chunks = blocked.collect()
    assert_equal(len(chunks), n_partitions)
    assert_equal(sum(len(chunk) for chunk in chunks), n_samples)
def test_dtype(self):
    """``block`` must honour ``dtype=list`` and ``dtype=tuple``.

    Checks the content of the first block, the number of blocks (one per
    partition) and that no sample is lost, for an evenly and an unevenly
    partitioned input.
    """
    n_partitions = 10
    n_samples = 100
    data = self.sc.parallelize(["lorem" for i in range(n_samples)],
                               n_partitions)
    blocked_data = block(data, dtype=list)
    assert_array_equal(["lorem"] * 10, blocked_data.first())
    blocks = blocked_data.collect()
    assert_equal(len(blocks), n_partitions)
    assert_array_equal(["lorem"] * 10, blocks[-1])
    assert_equal(sum(len(b) for b in blocks), n_samples)

    n_partitions = 17
    data = self.sc.parallelize([1 for i in range(n_samples)],
                               n_partitions)
    blocked_data = block(data, dtype=tuple)
    # Integer division: on Python 3 ``/`` yields a float, and a list
    # cannot be repeated a float number of times (TypeError).
    assert_array_equal(tuple([1] * (n_samples // n_partitions)),
                       blocked_data.first())
    blocks = blocked_data.collect()
    assert_equal(len(blocks), n_partitions)
    assert_equal(sum(len(b) for b in blocks), n_samples)
def test_array(self):
    """``block`` must stack one-element arrays into 2-D array blocks.

    Checks the content of the first block, the number of blocks (one per
    partition) and that no sample is lost, for an evenly and an unevenly
    partitioned input.
    """
    n_partitions = 10
    n_samples = 100
    data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                               n_partitions)
    blocked_data = block(data)
    assert_array_equal(np.ones((10, 1)), blocked_data.first())
    blocks = blocked_data.collect()
    assert_equal(len(blocks), n_partitions)
    assert_array_equal(np.ones((10, 1)), blocks[-1])
    assert_equal(sum(len(b) for b in blocks), n_samples)

    n_partitions = 17
    data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                               n_partitions)
    blocked_data = block(data)
    # Integer division: on Python 3 ``/`` yields a float, which NumPy
    # rejects as an array dimension.
    assert_array_equal(np.ones((n_samples // n_partitions, 1)),
                       blocked_data.first())
    blocks = blocked_data.collect()
    assert_equal(len(blocks), n_partitions)
    assert_equal(sum(len(b) for b in blocks), n_samples)
def test_array(self):
    """``block`` must stack one-element arrays into 2-D array blocks.

    Checks the content of the first block, the number of blocks (one per
    partition) and that no sample is lost, for an evenly and an unevenly
    partitioned input.
    """
    n_samples = 100

    def unit_arrays():
        # One fresh single-element array per sample.
        return [np.array([1]) for _ in range(n_samples)]

    # 100 samples over 10 partitions: every block is a (10, 1) array.
    n_partitions = 10
    blocked = block(self.sc.parallelize(unit_arrays(), n_partitions))
    assert_array_equal(np.ones((10, 1)), blocked.first())
    chunks = blocked.collect()
    assert_equal(len(chunks), n_partitions)
    assert_array_equal(np.ones((10, 1)), chunks[-1])
    assert_equal(sum(len(chunk) for chunk in chunks), n_samples)

    # 100 samples over 17 partitions: the first block has 100 // 17 rows.
    n_partitions = 17
    blocked = block(self.sc.parallelize(unit_arrays(), n_partitions))
    first_rows = n_samples // n_partitions
    assert_array_equal(np.ones((first_rows, 1)), blocked.first())
    chunks = blocked.collect()
    assert_equal(len(chunks), n_partitions)
    assert_equal(sum(len(chunk) for chunk in chunks), n_samples)
def test_get_multiple_item(self):
    """Indexing with a list of block positions returns those blocks."""
    _, X_rdd = self.make_dense_range_rdd((100, 4), block_size=5)

    def nth_block(i):
        # Expected i-th block: 20 consecutive integers as a (5, 4) array.
        return np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))

    # (requested indices, positions of the expected blocks)
    cases = (
        ([0, 1], (0, 1)),
        ([0, 2], (0, 2)),
        ([0, -1], (0, 19)),
        ([0, -2], (0, 18)),
        ([1, -2], (1, 18)),
        ([7, 0], (7, 0)),
        ([1, 2, 7, 19], (1, 2, 7, 19)),
    )
    for indices, positions in cases:
        expected = [nth_block(i) for i in positions]
        assert_array_equal(X_rdd[indices].collect(), expected)
def test_get_single_item(self):
    """``z[block, column]`` must return a single column of a single block."""
    xs = np.arange(80).reshape((40, 2))
    ys = np.arange(40)
    pairs = self.sc.parallelize(xs, 2).zip(self.sc.parallelize(ys, 2))
    z = DictRDD(pairs, bsize=5)

    # Column 0 holds (5, 2) slices of ``xs``; column 1 holds slices of ``ys``.
    assert_array_equal(z[0, 0].first(), np.arange(0, 10).reshape((5, 2)))
    assert_array_equal(z[0, 1].first(), np.arange(5))

    assert_array_equal(z[3, 0].first(), np.arange(30, 40).reshape((5, 2)))
    assert_array_equal(z[3, 1].first(), np.arange(15, 20))
    # NOTE(review): negative scalar column index is left disabled, as in
    # the original test -- confirm whether it is meant to be supported.
    # assert_array_equal(z[3, -1].first(), np.arange(15, 20))

    # The last block is reachable both as 7 and as -1.
    last_x = np.arange(70, 80).reshape((5, 2))
    assert_array_equal(z[7, 0].first(), last_x)
    assert_array_equal(z[-1, 0].first(), last_x)
    assert_array_equal(z[7, 1].first(), np.arange(35, 40))
def test_flatten(self):
    """``flatten`` on the distributed array must match ``ndarray.flatten``."""
    dense, dense_rdd = self.make_dense_rdd((100, 3, 2))
    expected = dense.flatten()
    flattened_rdd = dense_rdd.flatten()
    assert_array_equal(flattened_rdd.toarray(), expected)
def test_remainder(self):
    """Check ``remainder`` against ``np.remainder``."""
    dividend, dividend_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    divisor, _ = self.make_dense_rdd((1, 3))
    expected = np.remainder(dividend, divisor)
    result = dividend_rdd.remainder(divisor).toarray()
    assert_array_equal(result, expected)
def test_fmod(self):
    """Check ``fmod`` against ``np.fmod``."""
    dividend, dividend_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    divisor, _ = self.make_dense_rdd((1, 3))
    expected = np.fmod(dividend, divisor)
    result = dividend_rdd.fmod(divisor).toarray()
    assert_array_equal(result, expected)
def test_array_slice_syntax(self):
    """Slicing an ``ArrayRDD`` must select blocks like slicing a list."""
    data = np.arange(400).reshape((100, 4))
    X = ArrayRDD(self.sc.parallelize(data, 4), 5)

    def nth_block(i):
        # Expected i-th block: 20 consecutive integers as a (5, 4) array.
        return np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))

    def blocks(*positions):
        return [nth_block(i) for i in positions]

    assert_array_equal(X[:1].collect(), blocks(0))
    assert_array_equal(X[:2].collect(), blocks(0, 1))
    assert_array_equal(X[18:].collect(), blocks(18, 19))
    assert_array_equal(X[-1:].collect(), blocks(19))
    assert_array_equal(X[-2:].collect(), blocks(18, 19))
    assert_array_equal(X[7:10].collect(), blocks(7, 8, 9))
    assert_array_equal(X[7:10:2].collect(), blocks(7, 9))
    assert_array_equal(X[::9].collect(), blocks(0, 9, 18))
    # A negative step walks the blocks backwards from the last one.
    assert_array_equal(X[::-10].collect(), blocks(19, 9))
    # An empty range yields no blocks.
    assert_array_equal(X[-1:1].collect(), [])
def test_true_divide(self):
    """Check ``true_divide`` against NumPy ``/``."""
    dividend, dividend_rdd = self.make_dense_rdd((8, 3))
    # Only the local copy of the (1, 3) operand is used.
    divisor, _ = self.make_dense_rdd((1, 3))
    expected = dividend / divisor
    result = dividend_rdd.true_divide(divisor).toarray()
    assert_array_equal(result, expected)
def test_mean(self):
    """``ArrayRDD.mean`` must match ``ndarray.mean`` overall and per axis."""
    data = np.arange(600).reshape((100, 3, 2))
    rdd = self.sc.parallelize(data)

    assert_equal(ArrayRDD(rdd).mean(), data.mean())
    # Only the first two axes are exercised here.
    for axis in (0, 1):
        assert_array_equal(ArrayRDD(rdd).mean(axis=axis),
                           data.mean(axis=axis))
def test_get_single_item(self):
    """Integer indexing of an ``ArrayRDD`` must select a single block."""
    data = np.arange(400).reshape((100, 4))
    X = ArrayRDD(self.sc.parallelize(data, 4), 5)

    def nth_block(i):
        # Expected i-th block: 20 consecutive integers as a (5, 4) array.
        return np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))

    first = nth_block(0)
    assert_array_equal(X.first(), first)
    assert_array_equal(X[0].first(), first)

    assert_array_equal(X[1].first(), nth_block(1))

    # Negative indices count back from the last of the 20 blocks.
    last = nth_block(19)
    assert_array_equal(X[19].first(), last)
    assert_array_equal(X[-1].first(), last)

    third_from_end = nth_block(17)
    assert_array_equal(X[17].first(), third_from_end)
    assert_array_equal(X[-3].first(), third_from_end)
def test_dot(self):
    """``dot`` with a local array must match ``ndarray.dot`` both ways."""
    tall, tall_rdd = self.make_dense_rdd((20, 10))
    wide, wide_rdd = self.make_dense_rdd((10, 20))

    # (20, 10) . (10, 20)
    product = tall_rdd.dot(wide).toarray()
    assert_array_equal(product, tall.dot(wide))

    # (10, 20) . (20, 10)
    product = wide_rdd.dot(tall).toarray()
    assert_array_equal(product, wide.dot(tall))
def test_array_slice_syntax(self):
    """Slicing the blocked RDD must select blocks like slicing a list."""
    _, X_rdd = self.make_dense_range_rdd((100, 4), block_size=5)

    def nth_block(i):
        # Expected i-th block: 20 consecutive integers as a (5, 4) array.
        return np.arange(20 * i, 20 * (i + 1)).reshape((5, 4))

    def blocks(*positions):
        return [nth_block(i) for i in positions]

    assert_array_equal(X_rdd[:1].collect(), blocks(0))
    assert_array_equal(X_rdd[:2].collect(), blocks(0, 1))
    assert_array_equal(X_rdd[18:].collect(), blocks(18, 19))
    assert_array_equal(X_rdd[-1:].collect(), blocks(19))
    assert_array_equal(X_rdd[-2:].collect(), blocks(18, 19))
    assert_array_equal(X_rdd[7:10].collect(), blocks(7, 8, 9))
    assert_array_equal(X_rdd[7:10:2].collect(), blocks(7, 9))
    assert_array_equal(X_rdd[::9].collect(), blocks(0, 9, 18))
    # A negative step walks the blocks backwards from the last one.
    assert_array_equal(X_rdd[::-10].collect(), blocks(19, 9))
    # An empty range yields no blocks.
    assert_array_equal(X_rdd[-1:1].collect(), [])