def test_get_multiple_tuples(self):
    """Select several blocks of a DictRDD at once and compare the tuples.

    Blocks are picked with slices and with index lists, with and without
    an explicit full column slice, in natural and in shuffled order.
    """
    features, labels = np.arange(80).reshape((40, 2)), np.arange(40)
    features_rdd = self.sc.parallelize(features, 2)
    labels_rdd = self.sc.parallelize(labels, 2)
    z = DictRDD(features_rdd.zip(labels_rdd), bsize=5)

    def block(i):
        # Expected i-th block: five feature rows and the five matching labels.
        return (np.arange(10 * i, 10 * i + 10).reshape((5, 2)),
                np.arange(5 * i, 5 * i + 5))

    def check(selection, wanted):
        assert_multiple_tuples_equal(selection.collect(), wanted)

    # Leading two blocks.
    head = [block(0), block(1)]
    check(z[:2], head)
    check(z[:2, :], head)
    check(z[[0, 1]], head)
    check(z[[0, 1], :], head)
    check(z[[1, 0]], head[::-1])

    # Trailing three blocks.
    tail = [block(5), block(6), block(7)]
    check(z[-3:], tail)
    check(z[-3:, :], tail)
    check(z[[5, 6, 7]], tail)
    check(z[[5, 6, 7], :], tail)
    check(z[[7, 6, 5]], tail[::-1])
    check(z[[7, 6, 5], :], tail[::-1])
    check(z[[5, 7, 6]], [tail[0], tail[2], tail[1]])
def test_check_rdd_dtype(self):
    """Exercise check_rdd_dtype on dense, sparse and dictionary RDDs.

    Covers matching dtypes (expects a truthy result), mismatching dtypes
    (expects a falsy result) and invalid arguments (expects TypeError).
    """
    array = np.ndarray
    spmat = sp.spmatrix

    _, dense_rdd = self.make_dense_rdd(block_size=5)
    _, sparse_rdd = self.make_sparse_rdd(block_size=5)
    dict_rdd = DictRDD(
        (dense_rdd, sparse_rdd),
        columns=('X', 'y'),
        bsize=5
    )

    # Matching dtypes.
    assert_true(check_rdd_dtype(dense_rdd, array))
    assert_true(check_rdd_dtype(dense_rdd, (array, spmat)))
    assert_true(check_rdd_dtype(sparse_rdd, spmat))
    assert_true(check_rdd_dtype(dict_rdd, {'X': array}))
    assert_true(check_rdd_dtype(dict_rdd, {'y': spmat}))
    assert_true(check_rdd_dtype(dict_rdd, {'X': array, 'y': spmat}))
    assert_true(check_rdd_dtype(dict_rdd,
                                {'X': (array, spmat), 'y': spmat}))

    # Mismatching dtypes.
    assert_false(check_rdd_dtype(dense_rdd, spmat))
    assert_false(check_rdd_dtype(sparse_rdd, (array,)))
    assert_false(check_rdd_dtype(dict_rdd, {'X': spmat}))

    # Invalid arguments. The callable's arguments must be passed to
    # assert_raises individually: wrapping them in one tuple would call
    # check_rdd_dtype with a single positional argument, and the TypeError
    # would then come from the missing parameter rather than from the
    # argument validation this test is meant to cover.
    # NOTE(review): assumes check_rdd_dtype raises TypeError for a non-dict
    # dtype on a DictRDD and for a non-RDD input - confirm.
    assert_raises(TypeError, check_rdd_dtype, dict_rdd, (tuple,))
    assert_raises(TypeError, check_rdd_dtype, np.arange(20), (array,))
def test_transform_with_dtype(self):
    """Check the dtype reported by DictRDD.transform for several targets.

    Transforms one column or both columns, with and without an explicit
    ``dtype`` argument, and inspects the resulting ``dtype`` attribute
    and/or ``check_rdd_dtype``.
    """
    left = np.arange(400).reshape((100, 4))
    right = np.arange(200).reshape((100, 2))
    zipped = self.sc.parallelize(left, 4).zip(self.sc.parallelize(right, 4))
    X = DictRDD(zipped, bsize=5)

    # No dtype given: both columns stay ndarray.
    squared = X.transform(lambda blk: blk**2, column=0)
    assert_equal(squared.dtype, (np.ndarray, np.ndarray))

    # First column converted to tuple.
    first_as_tuple = X.transform(lambda blk: tuple((blk**2).tolist()),
                                 column=0, dtype=tuple)
    assert_equal(first_as_tuple.dtype, (tuple, np.ndarray))
    assert_true(check_rdd_dtype(first_as_tuple, {0: tuple, 1: np.ndarray}))

    # Second column declared as list.
    second_as_list = X.transform(lambda blk: blk**2, column=1, dtype=list)
    assert_equal(second_as_list.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(second_as_list, {0: np.ndarray, 1: list}))

    # Both columns, natural order.
    both = X.transform(lambda a, b: (a**2, (b**0.5).tolist()),
                       column=[0, 1], dtype=(np.ndarray, list))
    assert_true(check_rdd_dtype(both, {0: np.ndarray, 1: list}))

    # Both columns, reversed order in the call.
    both_reversed = X.transform(lambda b, a: ((b**0.5).tolist(), a**2),
                                column=[1, 0], dtype=(list, np.ndarray))
    assert_equal(both_reversed.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(both_reversed, {0: np.ndarray, 1: list}))
def test_get_single_tuple(self):
    """Fetch a single block of a DictRDD by positive and negative index.

    Each expected tuple is compared against every equivalent way of
    reaching the same block.
    """
    x, y = np.arange(80).reshape((40, 2)), np.arange(40)
    x_rdd = self.sc.parallelize(x, 2)
    y_rdd = self.sc.parallelize(y, 2)
    z = DictRDD(x_rdd.zip(y_rdd), bsize=5)

    # Each candidate list used to contain the same indexing expression
    # twice, which only re-ran an identical Spark job; every distinct
    # access path is now evaluated once.
    expected = np.arange(0, 10).reshape((5, 2)), np.arange(5)
    for tpl in [z.first(), z[0].first()]:
        assert_tuple_equal(tpl, expected)

    expected = np.arange(30, 40).reshape((5, 2)), np.arange(15, 20)
    for tpl in [z[3].first(), z[-5].first()]:
        assert_tuple_equal(tpl, expected)

    expected = np.arange(70, 80).reshape((5, 2)), np.arange(35, 40)
    for tpl in [z[7].first(), z[-1].first()]:
        assert_tuple_equal(tpl, expected)
def test_creation_from_zipped_rdd(self):
    """Build a DictRDD from an already zipped RDD and inspect its first block.

    Tried with default arguments, with column names, and with an explicit
    dtype for each column.
    """
    features = np.arange(80).reshape((40, 2))
    labels = range(40)
    features_rdd = self.sc.parallelize(features, 4)
    labels_rdd = self.sc.parallelize(labels, 4)
    zipped_rdd = features_rdd.zip(labels_rdd)

    expected = (np.arange(20).reshape(10, 2), tuple(range(10)))

    assert_tuple_equal(DictRDD(zipped_rdd).first(), expected)
    assert_tuple_equal(DictRDD(zipped_rdd, columns=('x', 'y')).first(),
                       expected)

    # With an explicit dtype the second column must come back as a list.
    typed = DictRDD(zipped_rdd, dtype=(np.ndarray, list))
    first = typed.first()
    assert_tuple_equal(first, expected)
    assert_is_instance(first[1], list)
def test_auto_dtype(self):
    """Check the dtype a DictRDD reports when none is passed in.

    The same three RDDs are combined once without and once with column
    names; both results are expected to report (ndarray, tuple, tuple).
    """
    x = np.arange(80).reshape((40, 2))
    y = tuple(range(40))
    z = list(range(40))
    x_rdd = self.sc.parallelize(x, 4)
    y_rdd = self.sc.parallelize(y, 4)
    z_rdd = self.sc.parallelize(z, 4)

    expected = (np.arange(20).reshape(10, 2), tuple(range(10)),
                list(range(10)))
    inferred = (np.ndarray, tuple, tuple)

    # (extra constructor arguments, keys used to address the columns)
    cases = (
        ({}, (0, 1, 2)),
        ({'columns': ('x', 'y', 'z')}, ('x', 'y', 'z')),
    )
    for kwargs, keys in cases:
        rdd = DictRDD([x_rdd, y_rdd, z_rdd], **kwargs)
        assert_tuple_equal(rdd.first(), expected)
        assert_equal(rdd.dtype, inferred)
        assert_true(check_rdd_dtype(rdd, dict(zip(keys, inferred))))
def make_regression(self, n_targets, n_samples, blocks=-1):
    """Create a regression dataset locally and as a DictRDD.

    Parameters
    ----------
    n_targets : forwarded to the module-level ``make_regression``.
    n_samples : forwarded to the module-level ``make_regression``.
    blocks : passed to ``DictRDD`` as ``bsize`` (default -1).

    Returns
    -------
    (X, y, Z) where ``X`` and ``y`` are the local arrays and ``Z`` is the
    ``DictRDD`` with columns ``('X', 'y')`` built from them.
    """
    features, targets = make_regression(
        n_samples=n_samples,
        n_targets=n_targets,
        n_features=20,
        n_informative=10,
        random_state=42,
    )
    distributed = [ArrayRDD(self.sc.parallelize(part))
                   for part in (features, targets)]
    Z = DictRDD(distributed, columns=('X', 'y'), bsize=blocks)
    return features, targets, Z
def test_transform_with_dtype(self):
    """Verify dtype bookkeeping of DictRDD.transform.

    Single-column and two-column transforms are run with different
    ``dtype`` arguments; the reported ``dtype`` tuple and
    ``check_rdd_dtype`` are inspected afterwards.
    """
    wide = np.arange(400).reshape((100, 4))
    narrow = np.arange(200).reshape((100, 2))
    wide_rdd = self.sc.parallelize(wide, 4)
    narrow_rdd = self.sc.parallelize(narrow, 4)
    X = DictRDD(wide_rdd.zip(narrow_rdd), bsize=5)

    array_then_list = {0: np.ndarray, 1: list}

    out = X.transform(lambda v: v ** 2, column=0)
    assert_equal(out.dtype, (np.ndarray, np.ndarray))

    out = X.transform(lambda v: tuple((v ** 2).tolist()),
                      column=0, dtype=tuple)
    assert_equal(out.dtype, (tuple, np.ndarray))
    assert_true(check_rdd_dtype(out, {0: tuple, 1: np.ndarray}))

    out = X.transform(lambda v: v ** 2, column=1, dtype=list)
    assert_equal(out.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(out, array_then_list))

    out = X.transform(lambda p, q: (p ** 2, (q ** 0.5).tolist()),
                      column=[0, 1], dtype=(np.ndarray, list))
    assert_true(check_rdd_dtype(out, array_then_list))

    # Columns listed in reverse; the reported dtype stays in column order.
    out = X.transform(lambda q, p: ((q ** 0.5).tolist(), p ** 2),
                      column=[1, 0], dtype=(list, np.ndarray))
    assert_equal(out.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(out, array_then_list))
def test_auto_dtype(self):
    """DictRDD built without a dtype should report one for every column.

    Runs the same checks on an unnamed and on a named three-column
    DictRDD.
    """
    matrix = np.arange(80).reshape((40, 2))
    as_tuple = tuple(range(40))
    as_list = list(range(40))
    x_rdd = self.sc.parallelize(matrix, 4)
    y_rdd = self.sc.parallelize(as_tuple, 4)
    z_rdd = self.sc.parallelize(as_list, 4)

    first_block = (
        np.arange(20).reshape(10, 2),
        tuple(range(10)),
        list(range(10)),
    )
    dtypes = (np.ndarray, tuple, tuple)

    # Positional columns.
    unnamed = DictRDD([x_rdd, y_rdd, z_rdd])
    assert_tuple_equal(unnamed.first(), first_block)
    assert_equal(unnamed.dtype, dtypes)
    assert_true(check_rdd_dtype(unnamed,
                                {0: np.ndarray, 1: tuple, 2: tuple}))

    # Named columns.
    named = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
    assert_tuple_equal(named.first(), first_block)
    assert_equal(named.dtype, dtypes)
    assert_true(check_rdd_dtype(named,
                                {'x': np.ndarray, 'y': tuple, 'z': tuple}))
def test_creation_from_blocked_rdds(self):
    """Combine already blocked RDDs (ArrayRDD / BlockRDD) into a DictRDD.

    The first block is compared for the default construction, for named
    columns, and for an explicit dtype on the third column.
    """
    matrix = np.arange(80).reshape((40, 2))
    vector = np.arange(40)
    plain = list(range(40))

    x_rdd = ArrayRDD(self.sc.parallelize(matrix, 4))
    y_rdd = ArrayRDD(self.sc.parallelize(vector, 4))
    z_rdd = BlockRDD(self.sc.parallelize(plain, 4), dtype=list)

    expected = (
        np.arange(20).reshape(10, 2),
        np.arange(10),
        list(range(10)),
    )

    assert_tuple_equal(DictRDD([x_rdd, y_rdd, z_rdd]).first(), expected)

    named = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
    assert_tuple_equal(named.first(), expected)

    typed = DictRDD([x_rdd, y_rdd, z_rdd], dtype=(None, None, list))
    first = typed.first()
    assert_tuple_equal(first, expected)
    assert_is_instance(first[2], list)
def test_get_single_item(self):
    """Address one column of one block with a (block, column) index pair."""
    features, labels = np.arange(80).reshape((40, 2)), np.arange(40)
    features_rdd = self.sc.parallelize(features, 2)
    labels_rdd = self.sc.parallelize(labels, 2)
    z = DictRDD(features_rdd.zip(labels_rdd), bsize=5)

    def check(selection, wanted):
        assert_array_equal(selection.first(), wanted)

    check(z[0, 0], np.arange(0, 10).reshape((5, 2)))
    check(z[0, 1], np.arange(5))

    check(z[3, 0], np.arange(30, 40).reshape((5, 2)))
    check(z[3, 1], np.arange(15, 20))
    # NOTE(review): the check for a negative column index, z[3, -1] against
    # np.arange(15, 20), is commented out in the original; the reason is
    # not visible here.

    check(z[7, 0], np.arange(70, 80).reshape((5, 2)))
    check(z[-1, 0], np.arange(70, 80).reshape((5, 2)))
    check(z[7, 1], np.arange(35, 40))
def test_get_multiple_items(self):
    """Select several blocks together with a column selector.

    Column selectors used: a single index, an index list, a slice and a
    reordering index list.
    """
    features, labels = np.arange(80).reshape((40, 2)), np.arange(40)
    features_rdd = self.sc.parallelize(features, 2)
    labels_rdd = self.sc.parallelize(labels, 2)
    z = DictRDD(features_rdd.zip(labels_rdd), bsize=5)

    first = (np.arange(0, 10).reshape((5, 2)), np.arange(0, 5))
    second = (np.arange(10, 20).reshape((5, 2)), np.arange(5, 10))

    # A scalar column index yields the bare column values.
    assert_array_equal(z[:2, 1].collect(), [first[1], second[1]])
    assert_array_equal(z[[0, 1], 0].collect(), [first[0], second[0]])

    # A list or slice column selector yields one-element tuples.
    labels_only = [(first[1], ), (second[1], )]
    assert_multiple_tuples_equal(z[[0, 1], [1]].collect(), labels_only)
    assert_multiple_tuples_equal(z[[0, 1], -1:].collect(), labels_only)

    # Both blocks and columns reversed.
    assert_multiple_tuples_equal(z[[1, 0], [1, 0]].collect(),
                                 [second[::-1], first[::-1]])
def make_classification(self, n_classes, n_samples, blocks=-1,
                        nonnegative=False):
    """Create a classification dataset locally and as a DictRDD.

    Parameters
    ----------
    n_classes : forwarded to the module-level ``make_classification``.
    n_samples : forwarded to the module-level ``make_classification``.
    blocks : passed to ``DictRDD`` as ``bsize`` (default -1).
    nonnegative : when true, the features are replaced by their absolute
        values before being distributed.

    Returns
    -------
    (X, y, Z) where ``X`` and ``y`` are the local arrays and ``Z`` is the
    ``DictRDD`` with columns ``('X', 'y')`` built from them.
    """
    features, classes = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=5,
        n_informative=4,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=42,
    )
    features = np.abs(features) if nonnegative else features

    distributed = [ArrayRDD(self.sc.parallelize(part, 4))
                   for part in (features, classes)]
    Z = DictRDD(distributed, columns=('X', 'y'), bsize=blocks)
    return features, classes, Z
def test_creation_from_blocked_rdds(self):
    """Combine already blocked RDDs (ArrayRDD / BlockRDD) into a DictRDD.

    The first block is compared for the default construction, for named
    columns, and for an explicit dtype on the third column.
    """
    x = np.arange(80).reshape((40, 2))
    y = np.arange(40)
    # Materialise the ranges: on Python 3 ``range`` is a lazy sequence, not
    # a list, while the third column is declared and asserted to be a list.
    z = list(range(40))

    x_rdd = ArrayRDD(self.sc.parallelize(x, 4))
    y_rdd = ArrayRDD(self.sc.parallelize(y, 4))
    z_rdd = BlockRDD(self.sc.parallelize(z, 4), dtype=list)

    expected = (np.arange(20).reshape(10, 2), np.arange(10),
                list(range(10)))

    rdd = DictRDD([x_rdd, y_rdd, z_rdd])
    assert_tuple_equal(rdd.first(), expected)

    rdd = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
    assert_tuple_equal(rdd.first(), expected)

    rdd = DictRDD([x_rdd, y_rdd, z_rdd], dtype=(None, None, list))
    first = rdd.first()
    assert_tuple_equal(first, expected)
    assert_is_instance(first[2], list)
def test_same_variances(self):
    """Compare the variances of the local and the Spark VarianceThreshold.

    Both estimators are fitted on dense data, on sparse data and - the
    Spark one only - on a DictRDD whose 'X' column is the sparse RDD.
    """
    local_selector = VarianceThreshold()
    spark_selector = SparkVarianceThreshold()

    shapes = [((10, 5), None),
              ((1e3, 20), None),
              ((1e3, 20), 100),
              ((1e4, 100), None),
              ((1e4, 100), 600)]

    def assert_same_variances():
        assert_array_almost_equal(local_selector.variances_,
                                  spark_selector.variances_)

    # NOTE(review): neither loop variable is used below - the fixtures are
    # always created with their default arguments, so every iteration runs
    # the same checks. Presumably shape / block size were meant to be
    # forwarded to the fixture helpers; confirm against their signatures.
    for _shape, _block_size in shapes:
        X_dense, X_dense_rdd = self.make_dense_rdd()
        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
        Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

        local_selector.fit(X_dense)
        spark_selector.fit(X_dense_rdd)
        assert_same_variances()

        local_selector.fit(X_sparse)
        spark_selector.fit(X_sparse_rdd)
        assert_same_variances()

        # The local estimator is still fitted on the sparse data here.
        spark_selector.fit(Z)
        assert_same_variances()
def test_initialization(self):
    """Check DictRDD construction from valid and invalid inputs.

    A plain Python list must be rejected with TypeError whatever ``bsize``
    is; an RDD must yield an object that is both a DictRDD and a BlockRDD
    for every tested ``bsize``.
    """
    n_partitions = 4
    n_samples = 100

    data = [(1, 2)] * n_samples
    rdd = self.sc.parallelize(data, n_partitions)

    # A local list is not an RDD.
    assert_raises(TypeError, DictRDD, data)
    assert_raises(TypeError, DictRDD, data, bsize=False)
    assert_raises(TypeError, DictRDD, data, bsize=10)

    # The BlockRDD checks previously re-tested ``DictRDD(rdd)`` three times
    # and never looked at the ``bsize=10`` / ``bsize=None`` instances; each
    # constructed object is now checked against both classes.
    for kwargs in ({}, {'bsize': 10}, {'bsize': None}):
        blocked = DictRDD(rdd, **kwargs)
        assert_is_instance(blocked, DictRDD)
        assert_is_instance(blocked, BlockRDD)
def test_same_transform_with_treshold(self):
    """Compare fit_transform of the local and Spark selectors at 0.03.

    Dense, sparse and DictRDD inputs are transformed; for each, the block
    type of the distributed result and its values are checked.
    """
    local_selector = VarianceThreshold(.03)
    spark_selector = SparkVarianceThreshold(.03)

    X_dense, X_dense_rdd = self.make_dense_rdd()
    X_sparse, X_sparse_rdd = self.make_sparse_rdd()
    Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

    # Dense input.
    local_out = local_selector.fit_transform(X_dense)
    spark_out = spark_selector.fit_transform(X_dense_rdd)
    assert_true(check_rdd_dtype(spark_out, (np.ndarray, )))
    assert_array_almost_equal(local_out, spark_out.toarray())

    # Sparse input.
    local_out = local_selector.fit_transform(X_sparse)
    spark_out = spark_selector.fit_transform(X_sparse_rdd)
    assert_true(check_rdd_dtype(spark_out, (sp.spmatrix, )))
    assert_array_almost_equal(local_out.toarray(), spark_out.toarray())

    # DictRDD input: its 'X' column is the sparse RDD, so the result is
    # compared with the local sparse output computed just above.
    spark_out = spark_selector.fit_transform(Z_rdd)[:, 'X']
    assert_true(check_rdd_dtype(spark_out, (sp.spmatrix, )))
    assert_array_almost_equal(local_out.toarray(), spark_out.toarray())
def transform(self, Z):
    """Apply every transformer to the input and stack the results.

    Each ``(name, transformer)`` pair of ``self.transformer_list`` is run
    through ``_transform_one`` together with ``self.transformer_weights``.
    The per-transformer outputs are zipped, flattened and horizontally
    stacked.

    Parameters
    ----------
    Z : DictRDD or other distributed input
        For a ``DictRDD`` only the ``'X'`` column is transformed; any other
        input is handed to the transformers as is.

    Returns
    -------
    DictRDD or RDD
        For a ``DictRDD`` input, a new ``DictRDD`` made of the stacked
        result and the ``'y'`` column of ``Z``, reusing the columns, dtype
        and bsize of ``Z``. Otherwise the mapped RDD of stacked results.
    """
    is_dict_rdd = isinstance(Z, DictRDD)
    X = Z[:, 'X'] if is_dict_rdd else Z

    transformed = [
        _transform_one(trans, name, X, self.transformer_weights)
        for name, trans in self.transformer_list
    ]
    zipped = reduce(lambda acc, nxt: acc.zip(nxt._rdd), transformed)
    flat = zipped.map(flatten)

    # The stacking function is chosen from the first element only: sparse
    # stacking as soon as one of its pieces is sparse, dense otherwise.
    has_sparse = any(sp.issparse(piece) for piece in flat.first())
    mapper = sp.hstack if has_sparse else np.hstack
    stacked = flat.map(lambda pieces: mapper(pieces))

    if not is_dict_rdd:
        return stacked
    return DictRDD([stacked, Z[:, 'y']],
                   columns=Z.columns,
                   dtype=Z.dtype,
                   bsize=Z.bsize)
def test_creation_from_rdds(self):
    """Build a DictRDD from a list of plain RDDs and inspect its first block.

    Tried with default arguments, with column names, and with an explicit
    dtype per column.
    """
    matrix = np.arange(80).reshape((40, 2))
    vector = np.arange(40)
    plain = list(range(40))

    x_rdd = self.sc.parallelize(matrix, 4)
    y_rdd = self.sc.parallelize(vector, 4)
    z_rdd = self.sc.parallelize(plain, 4)

    expected = (np.arange(20).reshape(10, 2), np.arange(10),
                list(range(10)))

    assert_tuple_equal(DictRDD([x_rdd, y_rdd, z_rdd]).first(), expected)

    named = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
    assert_tuple_equal(named.first(), expected)

    typed = DictRDD([x_rdd, y_rdd, z_rdd],
                    dtype=(np.ndarray, np.ndarray, list))
    first = typed.first()
    assert_tuple_equal(first, expected)
    assert_is_instance(first[2], list)
def test_transform(self):
    """Compare DictRDD.transform with the same operation done locally.

    Every case pairs a distributed transform (function plus optional
    ``column`` argument) with a local function applied to each collected
    block tuple.
    """
    data1 = np.arange(400).reshape((100, 4))
    data2 = np.arange(200).reshape((100, 2))
    rdd1 = self.sc.parallelize(data1, 4)
    rdd2 = self.sc.parallelize(data2, 4)
    X = DictRDD(rdd1.zip(rdd2), bsize=5)

    # (distributed function, transform kwargs, local per-block equivalent)
    cases = [
        (lambda a, b: (a, b**2), {},
         lambda blk: (blk[0], blk[1]**2)),
        (lambda x: x**2, {'column': 1},
         lambda blk: (blk[0], blk[1]**2)),
        (lambda x: x**2, {'column': 0},
         lambda blk: (blk[0]**2, blk[1])),
        (lambda a, b: (a**2, b**0.5), {'column': [0, 1]},
         lambda blk: (blk[0]**2, blk[1]**0.5)),
        (lambda b, a: (b**0.5, a**2), {'column': [1, 0]},
         lambda blk: (blk[0]**2, blk[1]**0.5)),
    ]
    for func, kwargs, local_equivalent in cases:
        X1 = [local_equivalent(blk) for blk in X.collect()]
        X2 = X.transform(func, **kwargs)
        assert_multiple_tuples_equal(X1, X2.collect())
# Load the reviews: column 0 holds the score, column 1 the text.
df = pd.read_csv("review.csv", header=None, encoding='latin1')

# Binarise the score: values <= 5 become 0, everything else becomes 1.
# NOTE(review): this runs before dropna(), and a missing score fails the
# ``<= 5`` comparison, so such rows are labelled 1 instead of being dropped
# - confirm this is intended.
df[0] = df[0].apply(lambda score: 0 if score <= 5 else 1)
df = df.dropna()

target, data = df[0], df[1]

# NOTE(review): shadows the builtin ``list`` and is not used in the visible
# part of the script; kept because later code may rely on it.
list = []

(data_train, data_test,
 target_train, target_test) = cross_validation.train_test_split(
    data, target, test_size=0.25, random_state=43)

# Distribute the training split and pair features with labels.
train_x = ArrayRDD(sc.parallelize(data_train))
train_y = ArrayRDD(sc.parallelize(target_train))
Z = DictRDD((train_x, train_y),
            columns=('X', 'y'),
            dtype=[np.ndarray, np.ndarray])

# Text classification pipeline: hashing -> TF-IDF -> multinomial NB.
dist_pipeline = SparkPipeline((
    ('vect', SparkHashingVectorizer(non_negative=True)),
    ('tfidf', SparkTfidfTransformer()),
    ('clf', SparkMultinomialNB(alpha=0.05)),
))

# Fit; the classifier step is told the two class labels up front.
dist_pipeline.fit(Z, clf__classes=np.array([0, 1]))

# Distribute the held-out split.
test_x = ArrayRDD(sc.parallelize(data_test))
test_y = ArrayRDD(sc.parallelize(target_test))
def test_transform(self):
    """Check DictRDD.transform against locally computed block tuples.

    Covers a transform over the whole tuple, over a single column, and
    over both columns in natural and reversed order.
    """
    wide = np.arange(400).reshape((100, 4))
    narrow = np.arange(200).reshape((100, 2))
    wide_rdd = self.sc.parallelize(wide, 4)
    narrow_rdd = self.sc.parallelize(narrow, 4)
    X = DictRDD(wide_rdd.zip(narrow_rdd), bsize=5)

    # Whole tuple: square the second column.
    local = [(blk[0], blk[1] ** 2) for blk in X.collect()]
    distributed = X.transform(lambda a, b: (a, b ** 2)).collect()
    assert_multiple_tuples_equal(local, distributed)

    # Only column 1.
    local = [(blk[0], blk[1] ** 2) for blk in X.collect()]
    distributed = X.transform(lambda v: v ** 2, column=1).collect()
    assert_multiple_tuples_equal(local, distributed)

    # Only column 0.
    local = [(blk[0] ** 2, blk[1]) for blk in X.collect()]
    distributed = X.transform(lambda v: v ** 2, column=0).collect()
    assert_multiple_tuples_equal(local, distributed)

    # Both columns, natural order.
    local = [(blk[0] ** 2, blk[1] ** 0.5) for blk in X.collect()]
    transformed = X.transform(lambda a, b: (a ** 2, b ** 0.5),
                              column=[0, 1])
    assert_multiple_tuples_equal(local, transformed.collect())

    # Both columns, reversed order in the call.
    local = [(blk[0] ** 2, blk[1] ** 0.5) for blk in X.collect()]
    transformed = X.transform(lambda b, a: (b ** 0.5, a ** 2),
                              column=[1, 0])
    assert_multiple_tuples_equal(local, transformed.collect())