def test_unblocking_rdd(self):
    """Unblocking an ArrayRDD gives back a plain RDD of the source elements."""
    values = np.arange(400)
    blocked = ArrayRDD(self.sc.parallelize(values, 4), 5)

    flat = blocked.unblock()

    assert_is_instance(flat, RDD)
    # The first twelve unblocked elements must come back in source order.
    first_twelve = np.arange(12).tolist()
    assert_array_equal(flat.take(12), first_twelve)
def test_convert_toiter(self):
    """``toiter`` returns an iterator yielding what ``collect`` returns."""
    # ``collections.Iterator`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``.  Python 2 has no such submodule, hence the fallback.
    try:
        from collections.abc import Iterator
    except ImportError:
        from collections import Iterator

    data = np.arange(40)
    rdd = self.sc.parallelize(data, 4)
    X = ArrayRDD(rdd, 5)

    X_iter = X.toiter()

    assert_is_instance(X_iter, Iterator)
    assert_array_equal(list(X_iter), X.collect())
def test_size(self):
    """``ArrayRDD.size`` equals the element count of the parallelized array."""
    flat = np.arange(4000)
    for shape in ((1000, 4), (200, 20), (100, 40), (2000, 2)):
        expected = flat.reshape(shape)
        rdd = self.sc.parallelize(expected)

        # Summing the per-block sizes must give the total element count.
        block_total = ArrayRDD(rdd).map(lambda block: block.size).sum()
        assert_equal(block_total, expected.size)

        # The ``size`` property must agree with it.
        assert_equal(ArrayRDD(rdd).size, expected.size)
def test_transform(self):
    """``transform`` applies a function to every block of the ArrayRDD."""
    data = np.arange(400).reshape((100, 4))
    rdd = self.sc.parallelize(data, 4)
    X = ArrayRDD(rdd, 5)

    def square(block):
        return block ** 2

    # Build a concrete list: on Python 3 ``map`` returns a lazy iterator,
    # which assert_array_equal cannot compare element-wise.
    expected = [square(block) for block in X.collect()]
    actual = X.transform(square).collect()

    assert_array_equal(expected, actual)
def test_convert_toarray(self):
    """``toarray`` reassembles the blocks into the original array."""
    # Case 1: numpy range, 4 partitions, explicit block size of 5.
    values = np.arange(400)
    blocked = ArrayRDD(self.sc.parallelize(values, 4), 5)
    assert_array_equal(blocked.toarray(), values)

    # Case 2: plain Python list, 2 partitions, default block size.
    items = [2, 3, 5, 1, 6, 7, 9, 9]
    default_blocked = ArrayRDD(self.sc.parallelize(items, 2))
    assert_array_equal(default_blocked.toarray(), np.array(items))
def make_regression(self, n_targets, n_samples, blocks=-1):
    """Build a regression dataset locally and as a DictRDD.

    Returns ``(X, y, Z)`` where ``X`` and ``y`` are the generated local
    values and ``Z`` is a DictRDD with columns ``('X', 'y')`` created with
    ``bsize=blocks``.
    """
    # NOTE(review): presumably this is a method, so the bare name below
    # refers to a module-level generator function rather than to this
    # definition — confirm against the file's imports.
    features, targets = make_regression(
        n_targets=n_targets,
        n_samples=n_samples,
        n_features=20,
        n_informative=10,
        random_state=42,
    )

    features_rdd = ArrayRDD(self.sc.parallelize(features))
    targets_rdd = ArrayRDD(self.sc.parallelize(targets))
    combined = DictRDD([features_rdd, targets_rdd],
                       columns=('X', 'y'), bsize=blocks)
    return features, targets, combined
def test_convert_tolist(self):
    """``tolist`` returns a ``list`` holding the original elements."""
    # Case 1: numpy range, 4 partitions, explicit block size of 5.
    values = np.arange(400)
    as_list = ArrayRDD(self.sc.parallelize(values, 4), 5).tolist()
    assert_is_instance(as_list, list)
    assert_equal(as_list, values.tolist())

    # Case 2: plain Python list, 2 partitions, default block size.
    items = [2, 3, 5, 1, 6, 7, 9, 9]
    as_list = ArrayRDD(self.sc.parallelize(items, 2)).tolist()
    assert_is_instance(as_list, list)
    assert_equal(as_list, items)
def test_get_single_item(self):
    """Indexing a single block works via ``[]`` and ``ix``, also negatively."""
    data = np.arange(400).reshape((100, 4))
    X = ArrayRDD(self.sc.parallelize(data, 4), 5)

    def block_starting_at(start):
        # Each block holds 5 rows of 4 consecutive integers (20 values).
        return np.arange(start, start + 20).reshape((5, 4))

    assert_array_equal(X.first(), block_starting_at(0))

    # (first value of the expected block, indices that must resolve to it)
    cases = (
        (0, (0,)),
        (20, (1,)),
        (380, (19, -1)),
        (340, (17, -3)),
    )
    for start, indices in cases:
        expected = block_starting_at(start)
        for index in indices:
            assert_array_equal(X[index].first(), expected)
            assert_array_equal(X.ix(index).first(), expected)
def test_initialization(self):
    """ArrayRDD rejects a plain list and accepts an RDD."""
    n_partitions = 4
    n_samples = 100
    data = [np.array([1, 2]) for _ in range(n_samples)]
    rdd = self.sc.parallelize(data, n_partitions)

    # A local list is rejected whatever the second argument is.
    for extra in ((), (False,), (10,)):
        assert_raises(TypeError, ArrayRDD, data, *extra)

    # An RDD is accepted with no, an integer, or a ``None`` second argument.
    for extra in ((), (10,), (None,)):
        assert_is_instance(ArrayRDD(rdd, *extra), ArrayRDD)
def make_dict_dataset(self, blocks=-1):
    """Return a fixed list of eight dicts and its ArrayRDD counterpart.

    The list is parallelized over 4 partitions and ``blocks`` is passed to
    ArrayRDD as its second positional argument.
    """
    records = [
        {"foo": 1, "bar": 3},
        {"bar": 4, "baz": 2},
        {"bar": 6, "baz": 1},
        {"bar": 4, "ewo": "ok"},
        {"bar": 4, "baz": 2},
        {"bar": 9, "ewo": "fail"},
        {"bar": 4, "baz": 2},
        {"bar": 1, "quux": 1, "quuux": 2},
    ]
    records_rdd = ArrayRDD(self.sc.parallelize(records, 4), blocks)
    return records, records_rdd
def test_ndim(self):
    """``ArrayRDD.ndim`` matches the dimensionality of the source array."""
    flat = np.arange(4000)
    # The first entry is a bare int, which reshapes to a 1-D array.
    candidate_shapes = (4000, (1000, 4), (200, 10, 2), (100, 10, 2, 2))
    for shape in candidate_shapes:
        expected = flat.reshape(shape)
        blocked = ArrayRDD(self.sc.parallelize(expected))
        assert_equal(blocked.ndim, expected.ndim)
def test_shape(self):
    """``ArrayRDD.shape`` reports the shape of the parallelized array."""
    flat = np.arange(4000)
    for shape in ((1000, 4), (200, 20), (100, 40), (2000, 2)):
        blocked = ArrayRDD(self.sc.parallelize(flat.reshape(shape)))
        assert_equal(blocked.shape, shape)
def test_blocks_size(self):
    """Block lengths follow the requested block size, capped per partition."""
    n_partitions = 10
    n_samples = 1000
    data = [np.array([1, 2]) for _ in range(n_samples)]
    rdd = self.sc.parallelize(data, n_partitions)

    def block_lengths(*bsize):
        # Number of rows in every block of an ArrayRDD built over ``rdd``.
        return ArrayRDD(rdd, *bsize).map(lambda block: block.shape[0]).collect()

    # Default: one block per partition (1000 samples / 10 partitions).
    assert_true(all(np.array(block_lengths()) == 100))

    # Block sizes that divide the partition length are used as-is.
    assert_true(all(np.array(block_lengths(5)) == 5))
    assert_true(all(np.array(block_lengths(50)) == 50))

    # A block size above the partition length yields whole partitions.
    assert_true(all(np.array(block_lengths(250)) == 100))

    # 66 does not divide 100: each partition gives a 66-row and a 34-row block.
    assert_true(all(np.in1d(block_lengths(66), [66, 34])))
def make_dense_randint_rdd(self, low, high=None, shape=(1e3, 10), block_size=-1):
    """Return a random integer array and its ArrayRDD counterpart.

    Parameters
    ----------
    low, high : passed straight to ``np.random.randint``.
    shape : int or sequence of numbers
        Output shape.  Entries are converted to ``int`` because the default
        contains the float ``1e3``, which current NumPy rejects as a size.
    block_size : forwarded to ArrayRDD as ``bsize``.

    Returns
    -------
    (X, X_rdd) : the local array and an ArrayRDD over 4 partitions.
    """
    # np.atleast_1d lets a scalar shape through, as ``size=<int>`` did before.
    dims = tuple(int(dim) for dim in np.atleast_1d(shape))
    X = np.random.randint(low, high, size=dims)
    X_rdd = ArrayRDD(self.sc.parallelize(X, 4), bsize=block_size)
    return X, X_rdd
def test_creation_from_blocked_rdds(self):
    """A DictRDD can be assembled from already blocked RDDs."""
    x = np.arange(80).reshape((40, 2))
    y = np.arange(40)
    z = list(range(40))

    x_rdd = ArrayRDD(self.sc.parallelize(x, 4))
    y_rdd = ArrayRDD(self.sc.parallelize(y, 4))
    z_rdd = BlockRDD(self.sc.parallelize(z, 4), dtype=list)

    # With 40 rows over 4 partitions the first block holds the first 10 rows.
    expected = (np.arange(20).reshape(10, 2), np.arange(10), list(range(10)))

    # Without any optional argument.
    plain = DictRDD([x_rdd, y_rdd, z_rdd])
    assert_tuple_equal(plain.first(), expected)

    # With explicit column names.
    named = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
    assert_tuple_equal(named.first(), expected)

    # With explicit dtypes: the third column must come back as a list.
    typed = DictRDD([x_rdd, y_rdd, z_rdd], dtype=(None, None, list))
    first = typed.first()
    assert_tuple_equal(first, expected)
    assert_is_instance(first[2], list)
def test_get_single_item(self):
    """Indexing a single block with ``[]`` works, also for negative indices."""
    data = np.arange(400).reshape((100, 4))
    X = ArrayRDD(self.sc.parallelize(data, 4), 5)

    def block_starting_at(start):
        # Each block holds 5 rows of 4 consecutive integers (20 values).
        return np.arange(start, start + 20).reshape((5, 4))

    assert_array_equal(X.first(), block_starting_at(0))

    # (first value of the expected block, indices that must resolve to it)
    cases = (
        (0, (0,)),
        (20, (1,)),
        (380, (19, -1)),
        (340, (17, -3)),
    )
    for start, indices in cases:
        expected = block_starting_at(start)
        for index in indices:
            assert_array_equal(X[index].first(), expected)
def make_classification(self, n_classes, n_samples, blocks=-1, nonnegative=False):
    """Build a classification dataset locally and as a DictRDD.

    Returns ``(X, y, Z)`` where ``Z`` is a DictRDD with columns
    ``('X', 'y')`` created with ``bsize=blocks``.  When ``nonnegative`` is
    true, ``X`` is replaced by its element-wise absolute value first.
    """
    # NOTE(review): presumably this is a method, so the bare name below
    # refers to a module-level generator function rather than to this
    # definition — confirm against the file's imports.
    features, labels = make_classification(
        n_classes=n_classes,
        n_samples=n_samples,
        n_features=5,
        n_informative=4,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=42,
    )
    if nonnegative:
        features = np.abs(features)

    features_rdd = ArrayRDD(self.sc.parallelize(features, 4))
    labels_rdd = ArrayRDD(self.sc.parallelize(labels, 4))
    combined = DictRDD([features_rdd, labels_rdd],
                       columns=('X', 'y'), bsize=blocks)
    return features, labels, combined
def test_blocks_number(self):
    """``ArrayRDD.blocks`` reports the number of blocks for each block size."""
    n_partitions = 10
    n_samples = 1000
    data = [np.array([1, 2]) for _ in range(n_samples)]
    rdd = self.sc.parallelize(data, n_partitions)

    # Without blocking every sample counts as its own block.
    assert_equal(1000, ArrayRDD(rdd, noblock=True, bsize=1).blocks)

    # Default block size: one block per partition.
    assert_equal(10, ArrayRDD(rdd).blocks)

    # (block size, expected number of blocks)
    expectations = (
        (50, 20),
        (66, 20),
        (100, 10),
        (300, 10),
        (5, 200),
        (10, 100),
    )
    for bsize, expected_blocks in expectations:
        assert_equal(expected_blocks, ArrayRDD(rdd, bsize).blocks)
def test_partitions_number(self):
    """Blocking never changes the partition count of the underlying RDD."""
    for n_partitions in (4, 7):
        data = np.arange(400).reshape((100, 4))
        rdd = self.sc.parallelize(data, n_partitions)
        for bsize in (5, 10, 20):
            assert_equal(ArrayRDD(rdd, bsize).partitions, n_partitions)
def make_dense_range_rdd(self, shape=(1e3, 10), block_size=-1):
    """Return ``arange(prod(shape))`` reshaped to ``shape`` and its ArrayRDD.

    Parameters
    ----------
    shape : int or sequence of numbers
        Target shape.  Entries are converted to ``int`` for the reshape
        because the default contains the float ``1e3``, which current NumPy
        rejects as a dimension.
    block_size : forwarded to ArrayRDD as ``bsize``.

    Returns
    -------
    (X, X_rdd) : the local array and an ArrayRDD over 4 partitions.
    """
    # np.atleast_1d lets a scalar shape through, as ``reshape(<int>)`` did.
    dims = tuple(int(dim) for dim in np.atleast_1d(shape))
    # The length is still taken from ``np.prod(shape)`` so the element dtype
    # is unchanged: a float entry in ``shape`` yields float values.
    X = np.arange(np.prod(shape)).reshape(dims)
    X_rdd = ArrayRDD(self.sc.parallelize(X, 4), bsize=block_size)
    return X, X_rdd
def make_text_rdd(self, blocks=-1):
    """Return ``ALL_FOOD_DOCS`` and an ArrayRDD built from it.

    The documents are parallelized over 4 partitions and ``blocks`` is
    passed to ArrayRDD as its second positional argument.
    """
    documents = ALL_FOOD_DOCS
    documents_rdd = ArrayRDD(self.sc.parallelize(documents, 4), blocks)
    return documents, documents_rdd
def make_dense_rdd(self, shape=(1e3, 10), block_size=-1):
    """Return a seeded standard-normal array and its ArrayRDD counterpart.

    Parameters
    ----------
    shape : sequence of numbers
        Output shape.  Entries are converted to ``int`` because the default
        contains the float ``1e3``, which current NumPy rejects as a
        dimension for ``randn``.
    block_size : forwarded to ArrayRDD as ``bsize``.

    Returns
    -------
    (X, X_rdd) : the local array and an ArrayRDD over 4 partitions.
    """
    rng = np.random.RandomState(2)  # fixed seed keeps the fixture repeatable
    dims = tuple(int(dim) for dim in shape)
    X = rng.randn(*dims)
    X_rdd = ArrayRDD(self.sc.parallelize(X, 4), bsize=block_size)
    return X, X_rdd
from splearn.grid_search import SparkGridSearchCV

# Data preprocessing: binarise column 0 (<= 5 -> 0, otherwise 1), then drop
# every row that still has a missing value.
# NOTE(review): a missing value in column 0 fails the `<= 5` comparison and is
# mapped to 1 before dropna() runs — confirm that this is intended.
df = pd.read_csv("review.csv", header=None, encoding='latin1')
df[0] = df[0].apply(lambda score: 0 if score <= 5 else 1)
df = df.dropna()

data = df[1]
target = df[0]
# NOTE(review): shadows the builtin `list` and is not used in the visible part
# of the script — verify whether later code relies on it.
list = []

data_train, data_test, target_train, target_test = (
    cross_validation.train_test_split(
        data, target, test_size=0.25, random_state=43))

# Wrap the training split as blocked RDDs and combine them column-wise.
train_x = ArrayRDD(sc.parallelize(data_train))
train_y = ArrayRDD(sc.parallelize(target_train))
Z = DictRDD(
    (train_x, train_y),
    columns=('X', 'y'),
    dtype=[np.ndarray, np.ndarray],
)

# Pipeline: hashed term frequencies -> TF-IDF weighting -> multinomial NB.
dist_pipeline = SparkPipeline((
    ('vect', SparkHashingVectorizer(non_negative=True)),  # hashing TF for NB
    ('tfidf', SparkTfidfTransformer()),  # IDF
    ('clf', SparkMultinomialNB(alpha=0.05)),  # NB
))

# Fit; the classifier step is told the two class labels up front.
dist_pipeline.fit(Z, clf__classes=np.array([0, 1]))
def make_blobs(self, centers, n_samples, blocks=-1):
    """Build a blobs dataset locally and as an ArrayRDD.

    Parameters
    ----------
    centers, n_samples : passed to the generator function.
    blocks : forwarded to ArrayRDD as ``bsize``.  It was previously accepted
        but ignored; the other fixture helpers forward their block size the
        same way.

    Returns
    -------
    (X, y, X_rdd) : the generated local values and an ArrayRDD built from X.
    """
    # NOTE(review): presumably this is a method, so the bare name below
    # refers to a module-level generator function rather than to this
    # definition — confirm against the file's imports.
    X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
    X_rdd = ArrayRDD(self.sc.parallelize(X), bsize=blocks)
    return X, y, X_rdd
def test_dot(self):
    """``ArrayRDD.dot`` agrees with ``numpy.ndarray.dot`` on the local data."""
    left = np.arange(200).reshape(20, 10)
    right = np.arange(200).reshape(10, 20)

    distributed = ArrayRDD(self.sc.parallelize(left)).dot(right)

    assert_array_almost_equal(unpack(distributed), left.dot(right))