def test_check_rdd_dtype(self):
    """check_rdd_dtype must accept matching dtype specs and reject others.

    Covers plain (dense/sparse) block RDDs, DictRDDs with per-column
    mappings, and the TypeError paths for malformed arguments.
    """
    array = np.ndarray
    spmat = sp.spmatrix
    dense, dense_rdd = self.make_dense_rdd(block_size=5)
    sparse, sparse_rdd = self.make_sparse_rdd(block_size=5)
    dict_rdd = DictRDD((dense_rdd, sparse_rdd), columns=('X', 'y'), bsize=5)

    # Matching specs: a single type or a tuple of accepted alternatives.
    assert_true(check_rdd_dtype(dense_rdd, array))
    assert_true(check_rdd_dtype(dense_rdd, (array, spmat)))
    assert_true(check_rdd_dtype(sparse_rdd, spmat))
    # DictRDDs take a {column: spec} mapping; partial mappings are allowed.
    assert_true(check_rdd_dtype(dict_rdd, {'X': array}))
    assert_true(check_rdd_dtype(dict_rdd, {'y': spmat}))
    assert_true(check_rdd_dtype(dict_rdd, {'X': array, 'y': spmat}))
    assert_true(check_rdd_dtype(dict_rdd, {'X': (array, spmat), 'y': spmat}))

    # Mismatching specs must return False rather than raise.
    assert_false(check_rdd_dtype(dense_rdd, spmat))
    assert_false(check_rdd_dtype(sparse_rdd, (array,)))
    assert_false(check_rdd_dtype(dict_rdd, {'X': spmat}))

    # BUG FIX: assert_raises takes the callable's arguments as separate
    # positional parameters.  The old form wrapped them in a single tuple,
    # so check_rdd_dtype was called with one argument and the TypeError
    # came from the missing parameter — not from the validation logic
    # these assertions are meant to exercise.
    assert_raises(TypeError, check_rdd_dtype, dict_rdd, (tuple,))
    assert_raises(TypeError, check_rdd_dtype, np.arange(20), (array,))
def test_transform_with_dtype(self):
    """transform() must track per-column dtypes, explicit or default."""
    first = np.arange(400).reshape((100, 4))
    second = np.arange(200).reshape((100, 2))
    rdd_a = self.sc.parallelize(first, 4)
    rdd_b = self.sc.parallelize(second, 4)
    Z = DictRDD(rdd_a.zip(rdd_b), bsize=5)

    # Without an explicit dtype the transformed column stays ndarray.
    result = Z.transform(lambda x: x ** 2, column=0)
    assert_equal(result.dtype, (np.ndarray, np.ndarray))

    # An explicit dtype replaces only the transformed column's type.
    result = Z.transform(lambda x: tuple((x ** 2).tolist()), column=0,
                         dtype=tuple)
    assert_equal(result.dtype, (tuple, np.ndarray))
    assert_true(check_rdd_dtype(result, {0: tuple, 1: np.ndarray}))

    result = Z.transform(lambda x: x ** 2, column=1, dtype=list)
    assert_equal(result.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))

    # Multi-column transforms pair dtypes with the given column order.
    result = Z.transform(lambda a, b: (a ** 2, (b ** 0.5).tolist()),
                         column=[0, 1], dtype=(np.ndarray, list))
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))

    # Reversed column order: dtypes are mapped back to column positions.
    result = Z.transform(lambda b, a: ((b ** 0.5).tolist(), a ** 2),
                         column=[1, 0], dtype=(list, np.ndarray))
    assert_equal(result.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))
def test_auto_dtype(self):
    """DictRDD built without an explicit dtype should infer it per column."""
    x = np.arange(80).reshape((40, 2))
    y = tuple(range(40))
    z = list(range(40))
    x_rdd = self.sc.parallelize(x, 4)
    y_rdd = self.sc.parallelize(y, 4)
    z_rdd = self.sc.parallelize(z, 4)
    # 40 items over 4 partitions -> the first block holds 10 rows.
    expected = (np.arange(20).reshape(10, 2), tuple(range(10)),
                list(range(10)))

    # Unnamed columns: inferred dtype is (ndarray, tuple, tuple).
    unnamed = DictRDD([x_rdd, y_rdd, z_rdd])
    assert_tuple_equal(unnamed.first(), expected)
    assert_equal(unnamed.dtype, (np.ndarray, tuple, tuple))
    assert_true(check_rdd_dtype(unnamed,
                                {0: np.ndarray, 1: tuple, 2: tuple}))

    # Named columns behave identically, keyed by name instead of index.
    named = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
    assert_tuple_equal(named.first(), expected)
    assert_equal(named.dtype, (np.ndarray, tuple, tuple))
    assert_true(check_rdd_dtype(named,
                                {'x': np.ndarray, 'y': tuple, 'z': tuple}))
def test_transform_with_dtype(self):
    """Verify dtype bookkeeping of DictRDD.transform for every call form."""
    data1 = np.arange(400).reshape((100, 4))
    data2 = np.arange(200).reshape((100, 2))
    zipped = self.sc.parallelize(data1, 4).zip(self.sc.parallelize(data2, 4))
    X = DictRDD(zipped, bsize=5)

    # Default: the transformed column keeps np.ndarray as its dtype.
    transformed = X.transform(lambda x: x ** 2, column=0)
    assert_equal(transformed.dtype, (np.ndarray, np.ndarray))

    # dtype=tuple overrides the first column's type only.
    transformed = X.transform(lambda x: tuple((x ** 2).tolist()),
                              column=0, dtype=tuple)
    assert_equal(transformed.dtype, (tuple, np.ndarray))
    assert_true(check_rdd_dtype(transformed, {0: tuple, 1: np.ndarray}))

    # dtype=list on the second column.
    transformed = X.transform(lambda x: x ** 2, column=1, dtype=list)
    assert_equal(transformed.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(transformed, {0: np.ndarray, 1: list}))

    # Two columns at once, dtypes given in the same order as the columns.
    transformed = X.transform(lambda a, b: (a ** 2, (b ** 0.5).tolist()),
                              column=[0, 1], dtype=(np.ndarray, list))
    assert_true(check_rdd_dtype(transformed, {0: np.ndarray, 1: list}))

    # Columns listed in reverse: dtypes must follow the listed order but
    # end up assigned to the correct column positions.
    transformed = X.transform(lambda b, a: ((b ** 0.5).tolist(), a ** 2),
                              column=[1, 0], dtype=(list, np.ndarray))
    assert_equal(transformed.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(transformed, {0: np.ndarray, 1: list}))
def test_same_prediction(self):
    """Distributed linear regression must reproduce scikit-learn output."""
    X, y, Z = self.make_regression(1, 100000)
    reference = LinearRegression()
    distributed = SparkLinearRegression()
    expected = reference.fit(X, y).predict(X)
    predicted = distributed.fit(Z).predict(Z[:, 'X'])
    # Predictions arrive as blocks of ndarrays.
    assert_true(check_rdd_dtype(predicted, (np.ndarray,)))
    assert_array_almost_equal(expected, predicted.toarray())
def test_same_output_sparse(self):
    """Sparse SparkDictVectorizer must agree with scikit-learn's output."""
    X, X_rdd = self.make_dict_dataset()
    reference = DictVectorizer(sparse=True)
    distributed = SparkDictVectorizer(sparse=True)
    expected = reference.fit_transform(X)
    observed = distributed.fit_transform(X_rdd)
    # Sparse mode yields scipy sparse blocks on the distributed side.
    assert_true(check_rdd_dtype(observed, (sp.spmatrix,)))
    # Both implementations must learn the identical feature mapping.
    assert_equal(reference.vocabulary_, distributed.vocabulary_)
    assert_array_equal(expected.toarray(), observed.toarray())
def test_same_prediction(self):
    """SparkMultinomialNB must predict like the local estimator."""
    X, y, Z = self.make_classification(4, 100000, nonnegative=True)
    fitted_local = MultinomialNB().fit(X, y)
    fitted_dist = SparkMultinomialNB().fit(Z, classes=np.unique(y))
    expected = fitted_local.predict(X)
    observed = fitted_dist.predict(Z[:, 'X'])
    # Distributed predictions come back as ndarray blocks.
    assert_true(check_rdd_dtype(observed, (np.ndarray,)))
    assert_array_almost_equal(expected, observed.toarray())
def test_same_output_sparse(self):
    """SparkDictVectorizer(sparse=True) must match the local vectorizer."""
    X, X_rdd = self.make_dict_dataset()
    vect_local = DictVectorizer(sparse=True)
    vect_spark = SparkDictVectorizer(sparse=True)
    out_local = vect_local.fit_transform(X)
    out_spark = vect_spark.fit_transform(X_rdd)
    # Output blocks must be scipy sparse matrices.
    assert_true(check_rdd_dtype(out_spark, (sp.spmatrix,)))
    # Learned vocabularies must be identical before comparing values.
    assert_equal(vect_local.vocabulary_, vect_spark.vocabulary_)
    assert_array_equal(out_local.toarray(), out_spark.toarray())
def test_same_prediction(self):
    """SparkLinearRegression predictions must match scikit-learn's."""
    X, y, Z = self.make_regression(1, 100000)
    model_local = LinearRegression()
    model_spark = SparkLinearRegression()
    y_expected = model_local.fit(X, y).predict(X)
    y_observed = model_spark.fit(Z).predict(Z[:, 'X'])
    # The result is an RDD of ndarray blocks; collect before comparing.
    assert_true(check_rdd_dtype(y_observed, (np.ndarray,)))
    assert_array_almost_equal(y_expected, y_observed.toarray())
def test_same_transform_result(self):
    """Distributed TF-IDF transform must equal the local transform."""
    X, y, Z_rdd = self.make_classification(4, 1000, -1)
    X_rdd = Z_rdd[:, 'X']
    transformer_local = TfidfTransformer()
    transformer_dist = SparkTfidfTransformer()
    expected = transformer_local.fit_transform(X)
    observed = transformer_dist.fit_transform(X_rdd)
    # TF-IDF output stays sparse on the distributed side as well.
    assert_true(check_rdd_dtype(observed, sp.spmatrix))
    assert_array_almost_equal(expected.toarray(), observed.toarray())
def test_auto_dtype(self):
    """Per-column dtype inference for DictRDD, with and without names."""
    x = np.arange(80).reshape((40, 2))
    y = tuple(range(40))
    z = list(range(40))
    x_rdd = self.sc.parallelize(x, 4)
    y_rdd = self.sc.parallelize(y, 4)
    z_rdd = self.sc.parallelize(z, 4)
    # First block of a 4-partition parallelize over 40 items: 10 rows.
    expected = (np.arange(20).reshape(10, 2),
                tuple(range(10)),
                list(range(10)))

    for columns in (None, ('x', 'y', 'z')):
        if columns is None:
            rdd = DictRDD([x_rdd, y_rdd, z_rdd])
            spec = {0: np.ndarray, 1: tuple, 2: tuple}
        else:
            rdd = DictRDD([x_rdd, y_rdd, z_rdd], columns=columns)
            spec = {'x': np.ndarray, 'y': tuple, 'z': tuple}
        assert_tuple_equal(rdd.first(), expected)
        # Inferred dtype is the same regardless of column naming.
        assert_equal(rdd.dtype, (np.ndarray, tuple, tuple))
        assert_true(check_rdd_dtype(rdd, spec))
def test_same_predictions(self):
    """Pseudo-distributed forest should closely track a local forest."""
    X, y, Z = self.make_classification(2, 10000)
    local = RandomForestClassifier()
    dist = SparkPseudoRandomForestClassifier()
    y_local = local.fit(X, y).predict(X)
    y_dist = dist.fit(Z, classes=np.unique(y)).predict(Z[:, 'X'])
    # Converting back to a scikit-learn estimator must also work.
    y_conv = dist.to_scikit().predict(X)
    assert_true(check_rdd_dtype(y_dist, (np.ndarray,)))
    # Forests are randomised, so allow up to 2% disagreement.
    limit = len(y_local) * 2. / 100.
    assert sum(y_local != y_dist.toarray()) < limit
    assert sum(y_local != y_conv) < limit
def test_same_predictions(self):
    """SparkRandomForestClassifier predictions should nearly match local."""
    X, y, Z = self.make_classification(2, 10000)
    forest_local = RandomForestClassifier()
    forest_spark = SparkRandomForestClassifier()
    pred_local = forest_local.fit(X, y).predict(X)
    pred_spark = forest_spark.fit(Z, classes=np.unique(y)).predict(Z[:, 'X'])
    pred_converted = forest_spark.to_scikit().predict(X)
    assert_true(check_rdd_dtype(pred_spark, (np.ndarray,)))
    # Randomised trees differ slightly; tolerate a 2% mismatch rate both
    # for the distributed predictions and for the converted estimator.
    budget = len(pred_local) * 2. / 100.
    assert sum(pred_local != pred_spark.toarray()) < budget
    assert sum(pred_local != pred_converted) < budget
def test_same_transform_with_treshold(self):
    """VarianceThreshold(.03) must select identical features everywhere."""
    local = VarianceThreshold(.03)
    dist = SparkVarianceThreshold(.03)
    X_dense, X_dense_rdd = self.make_dense_rdd()
    X_sparse, X_sparse_rdd = self.make_sparse_rdd()
    Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

    # Dense input -> dense ndarray blocks.
    expected = local.fit_transform(X_dense)
    observed = dist.fit_transform(X_dense_rdd)
    assert_true(check_rdd_dtype(observed, (np.ndarray,)))
    assert_array_almost_equal(expected, observed.toarray())

    # Sparse input -> scipy sparse blocks.
    expected = local.fit_transform(X_sparse)
    observed = dist.fit_transform(X_sparse_rdd)
    assert_true(check_rdd_dtype(observed, (sp.spmatrix,)))
    assert_array_almost_equal(expected.toarray(), observed.toarray())

    # DictRDD input: the sparse 'X' column is transformed in place and
    # stays sparse; compared against the sparse local result above.
    observed = dist.fit_transform(Z_rdd)[:, 'X']
    assert_true(check_rdd_dtype(observed, (sp.spmatrix,)))
    assert_array_almost_equal(expected.toarray(), observed.toarray())
def test_same_transform_with_treshold(self):
    """Variance-threshold selection must match across input layouts."""
    selector_local = VarianceThreshold(.03)
    selector_spark = SparkVarianceThreshold(.03)
    X_dense, X_dense_rdd = self.make_dense_rdd()
    X_sparse, X_sparse_rdd = self.make_sparse_rdd()
    Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

    # Case 1: dense array input produces dense block output.
    out_local = selector_local.fit_transform(X_dense)
    out_spark = selector_spark.fit_transform(X_dense_rdd)
    assert_true(check_rdd_dtype(out_spark, (np.ndarray,)))
    assert_array_almost_equal(out_local, out_spark.toarray())

    # Case 2: sparse input produces sparse block output.
    out_local = selector_local.fit_transform(X_sparse)
    out_spark = selector_spark.fit_transform(X_sparse_rdd)
    assert_true(check_rdd_dtype(out_spark, (sp.spmatrix,)))
    assert_array_almost_equal(out_local.toarray(), out_spark.toarray())

    # Case 3: DictRDD input — the 'X' column result equals the sparse
    # local result from case 2.
    out_spark = selector_spark.fit_transform(Z_rdd)[:, 'X']
    assert_true(check_rdd_dtype(out_spark, (sp.spmatrix,)))
    assert_array_almost_equal(out_local.toarray(), out_spark.toarray())
def test_same_prediction(self):
    """Averaged SGD classifiers should agree within a 1% mismatch budget."""
    X, y, Z = self.make_classification(2, 80000)
    local = SGDClassifier(average=True)
    dist = SparkSGDClassifier(average=True)
    local.fit(X, y)
    dist.fit(Z, classes=np.unique(y))
    y_local = local.predict(X)
    y_dist = dist.predict(Z[:, 'X'])
    # SGD is stochastic: compare by mismatch rate rather than equality.
    n_samples = y_local.shape[0]
    matches = np.count_nonzero(y_dist.toarray() == y_local)
    mismatch_percent = float(n_samples - matches) * 100 / n_samples
    assert_true(mismatch_percent <= 1)
    assert_true(check_rdd_dtype(y_dist, (np.ndarray,)))
def test_same_prediction(self):
    """SparkSGDClassifier should disagree with local SGD on at most 1%."""
    X, y, Z = self.make_classification(2, 80000)
    clf_local = SGDClassifier(average=True)
    clf_spark = SparkSGDClassifier(average=True)
    clf_local.fit(X, y)
    clf_spark.fit(Z, classes=np.unique(y))
    pred_local = clf_local.predict(X)
    pred_spark = clf_spark.predict(Z[:, 'X'])
    # Count disagreements; stochastic training makes exact equality
    # unrealistic, so a small mismatch rate is tolerated.
    total = pred_local.shape[0]
    mismatch = total - np.count_nonzero(pred_spark.toarray() == pred_local)
    mismatch_percent = float(mismatch) * 100 / total
    assert_true(mismatch_percent <= 1)
    assert_true(check_rdd_dtype(pred_spark, (np.ndarray,)))
def test_same_fit_transforms(self):
    """Distributed TruncatedSVD should recover the same leading component."""
    X, X_rdd = self.make_dense_rdd((1e3, 12))
    n_components = 4
    random_state = 42
    tol = 1e-7
    local = TruncatedSVD(n_components, n_iter=5, tol=tol,
                         random_state=random_state)
    # The distributed solver needs more iterations to converge.
    dist = SparkTruncatedSVD(n_components, n_iter=50, tol=tol,
                             random_state=random_state)
    Z_local = local.fit_transform(X)
    Z_dist = dist.fit_transform(X_rdd)
    Z_collected = Z_dist.toarray()
    assert_true(check_rdd_dtype(Z_dist, (np.ndarray,)))

    # Loose comparison tolerance for the numeric check below.
    tol = 1e-1
    assert_array_equal(Z_local.shape, Z_collected.shape)
    # Singular vectors are sign-ambiguous: accept either orientation of
    # the first component.
    assert (np.allclose(+Z_collected[:, 0], Z_local[:, 0], atol=tol) or
            np.allclose(-Z_collected[:, 0], Z_local[:, 0], atol=tol))