def test_auto_dtype(self):
    """DictRDD infers column dtypes automatically, with and without names."""
    dense = np.arange(80).reshape((40, 2))
    as_tuple = tuple(range(40))
    as_list = list(range(40))
    rdds = [self.sc.parallelize(part, 4)
            for part in (dense, as_tuple, as_list)]
    expected = (np.arange(20).reshape(10, 2),
                tuple(range(10)),
                list(range(10)))

    # Positional columns: tuple and list inputs are both blocked as tuples.
    unnamed = DictRDD(rdds)
    assert_tuple_equal(unnamed.first(), expected)
    assert_equal(unnamed.dtype, (np.ndarray, tuple, tuple))
    assert_true(check_rdd_dtype(unnamed, {0: np.ndarray, 1: tuple, 2: tuple}))

    # Named columns behave identically, keyed by column name.
    named = DictRDD(rdds, columns=('x', 'y', 'z'))
    assert_tuple_equal(named.first(), expected)
    assert_equal(named.dtype, (np.ndarray, tuple, tuple))
    assert_true(check_rdd_dtype(named,
                                {'x': np.ndarray, 'y': tuple, 'z': tuple}))
def test_same_prediction(self):
    """Distributed LinearRegression predicts the same values as sklearn's."""
    X, y, Z = self.make_regression(1, 100000)
    reference = LinearRegression()
    spark_model = SparkLinearRegression()
    expected = reference.fit(X, y).predict(X)
    predicted = spark_model.fit(Z).predict(Z[:, 'X'])
    assert_true(check_rdd_dtype(predicted, (np.ndarray,)))
    assert_array_almost_equal(expected, predicted.toarray())
def test_same_output_sparse(self):
    """Sparse DictVectorizer matches sklearn's vocabulary and output."""
    X, X_rdd = self.make_dict_dataset()
    reference = DictVectorizer(sparse=True)
    spark_model = SparkDictVectorizer(sparse=True)
    expected = reference.fit_transform(X)
    transformed = spark_model.fit_transform(X_rdd)
    assert_true(check_rdd_dtype(transformed, (sp.spmatrix,)))
    assert_equal(reference.vocabulary_, spark_model.vocabulary_)
    assert_array_equal(expected.toarray(), transformed.toarray())
def test_same_output_sparse(self):
    """Sparse-mode DictVectorizer: distributed result equals the local one."""
    samples, sample_rdd = self.make_dict_dataset()
    local_vec = DictVectorizer(sparse=True)
    dist_vec = SparkDictVectorizer(sparse=True)
    local_out = local_vec.fit_transform(samples)
    dist_out = dist_vec.fit_transform(sample_rdd)
    assert_true(check_rdd_dtype(dist_out, (sp.spmatrix,)))
    # Both vectorizers must learn an identical feature vocabulary.
    assert_equal(local_vec.vocabulary_, dist_vec.vocabulary_)
    assert_array_equal(local_out.toarray(), dist_out.toarray())
def test_same_prediction(self):
    """Distributed MultinomialNB predicts the same labels as sklearn's."""
    X, y, Z = self.make_classification(4, 100000, nonnegative=True)
    reference = MultinomialNB()
    spark_model = SparkMultinomialNB()
    expected = reference.fit(X, y).predict(X)
    predicted = spark_model.fit(Z, classes=np.unique(y)).predict(Z[:, 'X'])
    assert_true(check_rdd_dtype(predicted, (np.ndarray,)))
    assert_array_almost_equal(expected, predicted.toarray())
def test_same_prediction(self):
    """Spark and local linear regression produce matching predictions."""
    X, y, Z = self.make_regression(1, 100000)
    local_model = LinearRegression().fit(X, y)
    dist_model = SparkLinearRegression().fit(Z)
    y_expected = local_model.predict(X)
    y_actual = dist_model.predict(Z[:, 'X'])
    assert_true(check_rdd_dtype(y_actual, (np.ndarray,)))
    assert_array_almost_equal(y_expected, y_actual.toarray())
def test_block_rdd_dict(self):
    """Blocking an RDD of dicts yields per-key arrays of at most bsize rows."""
    n_partitions = 3
    n_samples = 57
    dicts = [{'a': i, 'b': float(i) ** 2} for i in range(n_samples)]
    data = self.sc.parallelize(dicts, n_partitions)
    block_data_5 = block(data, bsize=5)
    blocks = block_data_5.collect()
    # No block may exceed the requested block size.
    assert_true(all(len(b) <= 5 for b in blocks))
    assert_array_almost_equal(blocks[0][0], np.arange(5))
    # np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is the documented replacement.
    assert_array_almost_equal(blocks[0][1], np.arange(5, dtype=float) ** 2)
def test_block_rdd_dict(self):
    """block() groups dict records into column arrays capped at bsize."""
    n_partitions = 3
    n_samples = 57
    dicts = [{'a': i, 'b': float(i) ** 2} for i in range(n_samples)]
    data = self.sc.parallelize(dicts, n_partitions)
    block_data_5 = block(data, bsize=5)
    blocks = block_data_5.collect()
    # Every produced block respects the bsize upper bound.
    assert_true(all(len(b) <= 5 for b in blocks))
    assert_array_almost_equal(blocks[0][0], np.arange(5))
    # np.float was removed in NumPy 1.24; use the builtin float instead.
    assert_array_almost_equal(blocks[0][1], np.arange(5, dtype=float) ** 2)
def test_same_transform_result(self):
    """Distributed TF-IDF transform matches scikit-learn's result."""
    X, y, Z_rdd = self.make_classification(4, 1000, -1)
    X_rdd = Z_rdd[:, 'X']
    reference = TfidfTransformer()
    spark_model = SparkTfidfTransformer()
    expected = reference.fit_transform(X)
    transformed = spark_model.fit_transform(X_rdd)
    assert_true(check_rdd_dtype(transformed, sp.spmatrix))
    assert_array_almost_equal(expected.toarray(), transformed.toarray())
def test_sparse_matrix(self):
    """Blocking sparse rows vstacks them into one sparse matrix per block."""
    n_partitions = 10
    n_samples = 100
    sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
    rows = [sparse_row for _ in range(n_samples)]
    data = self.sc.parallelize(rows, n_partitions)
    blocked_data = block(data)
    assert_true(sp.issparse(blocked_data.first()))
    # 100 rows over 10 partitions -> the first block stacks 10 rows.
    expected_block = sp.vstack([sparse_row] * 10)
    assert_array_almost_equal(expected_block.toarray(),
                              blocked_data.first().toarray())
def test_array_bsize(self):
    """block() honours the bsize upper bound for every produced block."""
    n_partitions = 10
    n_samples = 107
    data = self.sc.parallelize([np.array([1]) for _ in range(n_samples)],
                               n_partitions)
    # The same invariant must hold for several block sizes.
    for bsize in (5, 10):
        blocks = block(data, bsize=bsize).collect()
        assert_true(all(len(b) <= bsize for b in blocks))
def test_transform_with_dtype(self):
    """DictRDD.transform tracks dtype changes per transformed column."""
    left = np.arange(400).reshape((100, 4))
    right = np.arange(200).reshape((100, 2))
    left_rdd = self.sc.parallelize(left, 4)
    right_rdd = self.sc.parallelize(right, 4)
    pairs = DictRDD(left_rdd.zip(right_rdd), bsize=5)

    # Without an explicit dtype the transformed column stays ndarray.
    result = pairs.transform(lambda col: col ** 2, column=0)
    assert_equal(result.dtype, (np.ndarray, np.ndarray))

    # Explicit dtype on column 0.
    result = pairs.transform(lambda col: tuple((col ** 2).tolist()),
                             column=0, dtype=tuple)
    assert_equal(result.dtype, (tuple, np.ndarray))
    assert_true(check_rdd_dtype(result, {0: tuple, 1: np.ndarray}))

    # Explicit dtype on column 1.
    result = pairs.transform(lambda col: col ** 2, column=1, dtype=list)
    assert_equal(result.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))

    # Multi-column transform, columns in declaration order.
    result = pairs.transform(lambda a, b: (a ** 2, (b ** 0.5).tolist()),
                             column=[0, 1], dtype=(np.ndarray, list))
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))

    # Multi-column transform with the column order reversed.
    result = pairs.transform(lambda b, a: ((b ** 0.5).tolist(), a ** 2),
                             column=[1, 0], dtype=(list, np.ndarray))
    assert_equal(result.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))
def test_transform_with_dtype(self):
    """transform() updates a DictRDD's dtype tuple per affected column."""
    first_cols = np.arange(400).reshape((100, 4))
    second_cols = np.arange(200).reshape((100, 2))
    rdd_a = self.sc.parallelize(first_cols, 4)
    rdd_b = self.sc.parallelize(second_cols, 4)
    dict_rdd = DictRDD(rdd_a.zip(rdd_b), bsize=5)

    # Default dtype: the column keeps its ndarray type after the transform.
    out = dict_rdd.transform(lambda x: x ** 2, column=0)
    assert_equal(out.dtype, (np.ndarray, np.ndarray))

    # dtype=tuple on column 0 is reflected in dtype and check_rdd_dtype.
    out = dict_rdd.transform(lambda x: tuple((x ** 2).tolist()),
                             column=0, dtype=tuple)
    assert_equal(out.dtype, (tuple, np.ndarray))
    assert_true(check_rdd_dtype(out, {0: tuple, 1: np.ndarray}))

    # dtype=list on column 1.
    out = dict_rdd.transform(lambda x: x ** 2, column=1, dtype=list)
    assert_equal(out.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(out, {0: np.ndarray, 1: list}))

    # Transform both columns at once, in order.
    out = dict_rdd.transform(lambda a, b: (a ** 2, (b ** 0.5).tolist()),
                             column=[0, 1], dtype=(np.ndarray, list))
    assert_true(check_rdd_dtype(out, {0: np.ndarray, 1: list}))

    # And in reversed column order; dtype is still stored positionally.
    out = dict_rdd.transform(lambda b, a: ((b ** 0.5).tolist(), a ** 2),
                             column=[1, 0], dtype=(list, np.ndarray))
    assert_equal(out.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(out, {0: np.ndarray, 1: list}))
def test_same_prediction(self):
    """Averaged SGD classifiers agree on at least 99% of predictions."""
    X, y, Z = self.make_classification(2, 80000)
    reference = SGDClassifier(average=True)
    spark_model = SparkSGDClassifier(average=True)
    reference.fit(X, y)
    spark_model.fit(Z, classes=np.unique(y))
    expected = reference.predict(X)
    predicted = np.concatenate(spark_model.predict(Z[:, 'X']).collect())
    mismatch = expected.shape[0] - np.count_nonzero(predicted == expected)
    mismatch_percent = float(mismatch) * 100 / expected.shape[0]
    # SGD training is stochastic; tolerate up to 1% disagreement.
    assert_true(mismatch_percent <= 1)
def test_blocks_size(self):
    """ArrayRDD splits rows into blocks of the requested size."""
    n_partitions = 10
    n_samples = 1000
    rows = [np.array([1, 2]) for _ in range(n_samples)]
    rdd = self.sc.parallelize(rows, n_partitions)

    def block_sizes(arr):
        # Number of rows in each block of the ArrayRDD.
        return np.array(arr.map(lambda b: b.shape[0]).collect())

    # Default: one block per partition (100 rows each).
    assert_true(all(block_sizes(ArrayRDD(rdd)) == 100))
    assert_true(all(block_sizes(ArrayRDD(rdd, 5)) == 5))
    assert_true(all(block_sizes(ArrayRDD(rdd, 50)) == 50))
    # A bsize larger than a partition is capped at the partition size.
    assert_true(all(block_sizes(ArrayRDD(rdd, 250)) == 100))
    # 100 rows per partition with bsize=66 yields blocks of 66 and 34.
    assert_true(all(np.in1d(block_sizes(ArrayRDD(rdd, 66)), [66, 34])))
def test_same_prediction(self):
    """Distributed averaged SGD predictions mostly match the local model."""
    X, y, Z = self.make_classification(2, 80000)
    reference = SGDClassifier(average=True)
    spark_model = SparkSGDClassifier(average=True)
    reference.fit(X, y)
    spark_model.fit(Z, classes=np.unique(y))
    expected = reference.predict(X)
    predicted = spark_model.predict(Z[:, 'X'])
    mismatch = (expected.shape[0] -
                np.count_nonzero(predicted.toarray() == expected))
    mismatch_percent = float(mismatch) * 100 / expected.shape[0]
    # SGD training is stochastic; tolerate up to 1% disagreement.
    assert_true(mismatch_percent <= 1)
    assert_true(check_rdd_dtype(predicted, (np.ndarray,)))
def test_same_prediction(self):
    """Local and Spark averaged-SGD classifiers disagree on <=1% of samples."""
    X, y, Z = self.make_classification(2, 80000)
    local_clf = SGDClassifier(average=True)
    dist_clf = SparkSGDClassifier(average=True)
    local_clf.fit(X, y)
    dist_clf.fit(Z, classes=np.unique(y))
    y_expected = local_clf.predict(X)
    y_actual = dist_clf.predict(Z[:, 'X'])
    n_matches = np.count_nonzero(y_actual.toarray() == y_expected)
    mismatch = y_expected.shape[0] - n_matches
    mismatch_percent = float(mismatch) * 100 / y_expected.shape[0]
    # Stochastic training: allow a small fraction of mismatches.
    assert_true(mismatch_percent <= 1)
    assert_true(check_rdd_dtype(y_actual, (np.ndarray,)))
def test_same_fit_transforms(self):
    """Distributed TruncatedSVD recovers the leading component (up to sign)."""
    X, X_rdd = self.make_dense_rdd((1e3, 12))
    n_components = 4
    random_state = 42
    tol = 1e-7
    reference = TruncatedSVD(n_components, n_iter=5, tol=tol,
                             random_state=random_state)
    spark_model = SparkTruncatedSVD(n_components, n_iter=50, tol=tol,
                                    random_state=random_state)
    Z_local = reference.fit_transform(X)
    Z_dist = spark_model.fit_transform(X_rdd)
    Z_collected = Z_dist.toarray()
    assert_true(check_rdd_dtype(Z_dist, (np.ndarray,)))
    assert_array_equal(Z_local.shape, Z_collected.shape)
    # SVD components are sign-ambiguous: accept either orientation
    # of the first component, with a loose comparison tolerance.
    tol = 1e-1
    assert(np.allclose(+Z_collected[:, 0], Z_local[:, 0], atol=tol) |
           np.allclose(-Z_collected[:, 0], Z_local[:, 0], atol=tol))
def test_auto_dtype(self):
    """DictRDD detects each column's dtype from its source RDD."""
    arr_src = np.arange(80).reshape((40, 2))
    tup_src = tuple(range(40))
    lst_src = list(range(40))
    arr_rdd = self.sc.parallelize(arr_src, 4)
    tup_rdd = self.sc.parallelize(tup_src, 4)
    lst_rdd = self.sc.parallelize(lst_src, 4)
    first_block = (np.arange(20).reshape(10, 2),
                   tuple(range(10)),
                   list(range(10)))

    # Unnamed columns are addressed by index; lists block as tuples too.
    result = DictRDD([arr_rdd, tup_rdd, lst_rdd])
    assert_tuple_equal(result.first(), first_block)
    assert_equal(result.dtype, (np.ndarray, tuple, tuple))
    assert_true(check_rdd_dtype(result,
                                {0: np.ndarray, 1: tuple, 2: tuple}))

    # Named columns expose the same dtypes keyed by name.
    result = DictRDD([arr_rdd, tup_rdd, lst_rdd], columns=('x', 'y', 'z'))
    assert_tuple_equal(result.first(), first_block)
    assert_equal(result.dtype, (np.ndarray, tuple, tuple))
    assert_true(check_rdd_dtype(result,
                                {'x': np.ndarray, 'y': tuple, 'z': tuple}))
def test_same_transform_with_treshold(self):
    """VarianceThreshold(.03) matches sklearn on dense, sparse and DictRDD."""
    reference = VarianceThreshold(.03)
    spark_model = SparkVarianceThreshold(.03)
    X_dense, X_dense_rdd = self.make_dense_rdd()
    X_sparse, X_sparse_rdd = self.make_sparse_rdd()
    Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

    # Dense input.
    expected = reference.fit_transform(X_dense)
    result = spark_model.fit_transform(X_dense_rdd)
    assert_true(check_rdd_dtype(result, (np.ndarray,)))
    assert_array_almost_equal(expected, result.toarray())

    # Sparse input.
    expected = reference.fit_transform(X_sparse)
    result = spark_model.fit_transform(X_sparse_rdd)
    assert_true(check_rdd_dtype(result, (sp.spmatrix,)))
    assert_array_almost_equal(expected.toarray(), result.toarray())

    # DictRDD input: the 'X' column holds the sparse matrix.
    result = spark_model.fit_transform(Z_rdd)[:, 'X']
    assert_true(check_rdd_dtype(result, (sp.spmatrix,)))
    assert_array_almost_equal(expected.toarray(), result.toarray())
def test_same_transform_with_treshold(self):
    """Variance thresholding yields identical output locally and on Spark."""
    local_vt = VarianceThreshold(.03)
    dist_vt = SparkVarianceThreshold(.03)
    X_dense, X_dense_rdd = self.make_dense_rdd()
    X_sparse, X_sparse_rdd = self.make_sparse_rdd()
    Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

    # Dense array input stays dense.
    local_out = local_vt.fit_transform(X_dense)
    dist_out = dist_vt.fit_transform(X_dense_rdd)
    assert_true(check_rdd_dtype(dist_out, (np.ndarray,)))
    assert_array_almost_equal(local_out, dist_out.toarray())

    # Sparse matrix input stays sparse.
    local_out = local_vt.fit_transform(X_sparse)
    dist_out = dist_vt.fit_transform(X_sparse_rdd)
    assert_true(check_rdd_dtype(dist_out, (sp.spmatrix,)))
    assert_array_almost_equal(local_out.toarray(), dist_out.toarray())

    # DictRDD input: compare the transformed 'X' (sparse) column.
    dist_out = dist_vt.fit_transform(Z_rdd)[:, 'X']
    assert_true(check_rdd_dtype(dist_out, (sp.spmatrix,)))
    assert_array_almost_equal(local_out.toarray(), dist_out.toarray())