Example #1
    def test_auto_dtype(self):
        x = np.arange(80).reshape((40, 2))
        y = tuple(range(40))
        z = list(range(40))
        x_rdd = self.sc.parallelize(x, 4)
        y_rdd = self.sc.parallelize(y, 4)
        z_rdd = self.sc.parallelize(z, 4)

        expected = (np.arange(20).reshape(10, 2), tuple(range(10)),
                    list(range(10)))

        rdd = DictRDD([x_rdd, y_rdd, z_rdd])
        assert_tuple_equal(rdd.first(), expected)
        assert_equal(rdd.dtype, (np.ndarray, tuple, tuple))
        assert_true(check_rdd_dtype(rdd, {0: np.ndarray, 1: tuple, 2: tuple}))

        rdd = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
        assert_tuple_equal(rdd.first(), expected)
        assert_equal(rdd.dtype, (np.ndarray, tuple, tuple))
        assert_true(
            check_rdd_dtype(rdd, {
                'x': np.ndarray,
                'y': tuple,
                'z': tuple
            }))
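The dtype tuple and the check_rdd_dtype assertions above encode a simple per-column contract: every block of a DictRDD is a tuple whose elements must be instances of the declared types, and the expected spec may be given positionally or keyed by column index/name. The snippet below is a minimal, self-contained sketch of that contract for a single block; block_matches and its arguments are hypothetical names used for illustration, not splearn's implementation.

    import numpy as np

    def block_matches(block, columns, expected):
        """Hypothetical per-block type check mirroring the asserts above."""
        if isinstance(expected, dict):
            # Resolve column names to positions, then check each entry.
            index = {name: i for i, name in enumerate(columns or ())}
            return all(isinstance(block[index.get(key, key)], type_)
                       for key, type_ in expected.items())
        # Positional spec: one expected type per column.
        return all(isinstance(value, type_)
                   for value, type_ in zip(block, expected))

    # Mirrors the first block asserted in the test above.
    blk = (np.arange(20).reshape(10, 2), tuple(range(10)), list(range(10)))
    assert block_matches(blk, None, (np.ndarray, tuple, list))
    assert block_matches(blk, ('x', 'y', 'z'), {'x': np.ndarray, 'y': tuple})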
Example #2
    def test_same_prediction(self):
        X, y, Z = self.make_regression(1, 100000)

        local = LinearRegression()
        dist = SparkLinearRegression()

        y_local = local.fit(X, y).predict(X)
        y_dist = dist.fit(Z).predict(Z[:, 'X'])

        assert_true(check_rdd_dtype(y_dist, (np.ndarray, )))
        assert_array_almost_equal(y_local, y_dist.toarray())
Example #3
    def test_same_output_sparse(self):
        X, X_rdd = self.make_dict_dataset()
        local = DictVectorizer(sparse=True)
        dist = SparkDictVectorizer(sparse=True)

        result_local = local.fit_transform(X)
        result_dist = dist.fit_transform(X_rdd)

        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix, )))
        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(result_local.toarray(), result_dist.toarray())
Example #5
    def test_same_prediction(self):
        X, y, Z = self.make_classification(4, 100000, nonnegative=True)

        local = MultinomialNB()
        dist = SparkMultinomialNB()

        y_local = local.fit(X, y).predict(X)
        y_dist = dist.fit(Z, classes=np.unique(y)).predict(Z[:, 'X'])

        assert_true(check_rdd_dtype(y_dist, (np.ndarray,)))
        assert_array_almost_equal(y_local, y_dist.toarray())
Example #7
    def test_block_rdd_dict(self):
        n_partitions = 3
        n_samples = 57
        dicts = [{'a': i, 'b': float(i)**2} for i in range(n_samples)]
        data = self.sc.parallelize(dicts, n_partitions)

        block_data_5 = block(data, bsize=5)
        blocks = block_data_5.collect()
        assert_true(all(len(b) <= 5 for b in blocks))
        assert_array_almost_equal(blocks[0][0], np.arange(5))
        assert_array_almost_equal(blocks[0][1],
                                  np.arange(5, dtype=float)**2)
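block groups the records of each partition into fixed-size chunks; for dict records the test expects each block to expose one array per key, indexable by column position. A rough, Spark-free sketch of that behaviour for a single partition (block_partition is a hypothetical helper, not splearn's block):

    import numpy as np

    def block_partition(records, bsize):
        """Yield blocks of at most `bsize` rows, each block a tuple of
        per-key NumPy arrays (keys in sorted order)."""
        records = list(records)
        for start in range(0, len(records), bsize):
            chunk = records[start:start + bsize]
            keys = sorted(chunk[0])
            yield tuple(np.array([r[k] for r in chunk]) for k in keys)

    dicts = [{'a': i, 'b': float(i) ** 2} for i in range(12)]
    first = next(block_partition(dicts, bsize=5))
    np.testing.assert_array_almost_equal(first[0], np.arange(5))
    np.testing.assert_array_almost_equal(first[1], np.arange(5, dtype=float) ** 2)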
Example #9
    def test_same_transform_result(self):
        X, y, Z_rdd = self.make_classification(4, 1000, -1)
        X_rdd = Z_rdd[:, 'X']

        local = TfidfTransformer()
        dist = SparkTfidfTransformer()

        Z_local = local.fit_transform(X)
        Z_dist = dist.fit_transform(X_rdd)

        assert_true(check_rdd_dtype(Z_dist, sp.spmatrix))
        assert_array_almost_equal(Z_local.toarray(), Z_dist.toarray())
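For context, scikit-learn's TfidfTransformer only rescales a term-count matrix; the test above checks that the distributed wrapper produces the same weights as a purely local fit. A minimal local-only usage on a toy count matrix (no Spark involved):

    import numpy as np
    from sklearn.feature_extraction.text import TfidfTransformer

    # Toy document-term counts: three documents, four terms.
    counts = np.array([[3, 0, 1, 0],
                       [2, 0, 0, 1],
                       [0, 1, 0, 1]])

    tfidf = TfidfTransformer().fit_transform(counts)  # sparse result
    print(np.round(tfidf.toarray(), 3))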
Example #10
    def test_sparse_matrix(self):
        n_partitions = 10
        n_samples = 100
        sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
        data = self.sc.parallelize([sparse_row for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data)
        assert_true(sp.issparse(blocked_data.first()))

        expected_block = sp.vstack([sparse_row] * 10)
        assert_array_almost_equal(expected_block.toarray(),
                                  blocked_data.first().toarray())
Example #12
    def test_array_bsize(self):
        n_partitions = 10
        n_samples = 107
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)

        block_data_5 = block(data, bsize=5)
        blocks = block_data_5.collect()
        assert_true(all(len(b) <= 5 for b in blocks))

        block_data_10 = block(data, bsize=10)
        blocks = block_data_10.collect()
        assert_true(all(len(b) <= 10 for b in blocks))
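Both assertions follow from how blocking interacts with partitioning: records are chunked inside each partition, so no block exceeds bsize rows and the last block of a partition may be smaller. A plain-Python sketch of the expected sizes (expected_block_sizes is a hypothetical helper, assuming the 107 samples split into partitions of 10 or 11 rows):

    def expected_block_sizes(partition_sizes, bsize):
        """Sizes of the blocks obtained by chunking each partition into
        pieces of at most `bsize` rows; blocks never span partitions."""
        sizes = []
        for n in partition_sizes:
            full, rest = divmod(n, bsize)
            sizes.extend([bsize] * full + ([rest] if rest else []))
        return sizes

    partitions = [11] * 7 + [10] * 3   # assumed split of 107 samples
    assert all(s <= 5 for s in expected_block_sizes(partitions, 5))
    assert all(s <= 10 for s in expected_block_sizes(partitions, 10))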
Example #15
    def test_transform_with_dtype(self):
        data1 = np.arange(400).reshape((100, 4))
        data2 = np.arange(200).reshape((100, 2))
        rdd1 = self.sc.parallelize(data1, 4)
        rdd2 = self.sc.parallelize(data2, 4)

        X = DictRDD(rdd1.zip(rdd2), bsize=5)

        X2 = X.transform(lambda x: x ** 2, column=0)
        assert_equal(X2.dtype, (np.ndarray, np.ndarray))

        X2 = X.transform(lambda x: tuple((x ** 2).tolist()), column=0,
                         dtype=tuple)
        assert_equal(X2.dtype, (tuple, np.ndarray))
        assert_true(check_rdd_dtype(X2, {0: tuple, 1: np.ndarray}))

        X2 = X.transform(lambda x: x ** 2, column=1, dtype=list)
        assert_equal(X2.dtype, (np.ndarray, list))
        assert_true(check_rdd_dtype(X2, {0: np.ndarray, 1: list}))

        X2 = X.transform(lambda a, b: (a ** 2, (b ** 0.5).tolist()),
                         column=[0, 1], dtype=(np.ndarray, list))
        assert_true(check_rdd_dtype(X2, {0: np.ndarray, 1: list}))

        X2 = X.transform(lambda b, a: ((b ** 0.5).tolist(), a ** 2),
                         column=[1, 0], dtype=(list, np.ndarray))
        assert_equal(X2.dtype, (np.ndarray, list))
        assert_true(check_rdd_dtype(X2, {0: np.ndarray, 1: list}))
Example #17
    def test_same_prediction(self):
        X, y, Z = self.make_classification(2, 80000)

        local = SGDClassifier(average=True)
        dist = SparkSGDClassifier(average=True)

        local.fit(X, y)
        dist.fit(Z, classes=np.unique(y))

        y_local = local.predict(X)
        y_dist = np.concatenate(dist.predict(Z[:, 'X']).collect())

        mismatch = y_local.shape[0] - np.count_nonzero(y_dist == y_local)
        mismatch_percent = float(mismatch) * 100 / y_local.shape[0]

        assert_true(mismatch_percent <= 1)
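Because the distributed classifier is presumably fitted block by block and then merged, its predictions are not expected to match the local model exactly; the test only requires that at most 1% of the labels disagree. The tolerance check itself is plain NumPy arithmetic:

    import numpy as np

    def mismatch_percent(y_a, y_b):
        """Percentage of positions where two label vectors disagree."""
        mismatch = y_a.shape[0] - np.count_nonzero(y_a == y_b)
        return 100.0 * mismatch / y_a.shape[0]

    y_local = np.array([0, 1, 1, 0, 1] * 20)
    y_dist = y_local.copy()
    y_dist[0] = 1 - y_dist[0]          # flip a single prediction
    assert mismatch_percent(y_local, y_dist) <= 1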
Example #18
    def test_blocks_size(self):
        n_partitions = 10
        n_samples = 1000

        data = [np.array([1, 2]) for i in range(n_samples)]
        rdd = self.sc.parallelize(data, n_partitions)

        shapes = ArrayRDD(rdd).map(lambda x: x.shape[0]).collect()
        assert_true(all(np.array(shapes) == 100))
        shapes = ArrayRDD(rdd, 5).map(lambda x: x.shape[0]).collect()
        assert_true(all(np.array(shapes) == 5))
        shapes = ArrayRDD(rdd, 50).map(lambda x: x.shape[0]).collect()
        assert_true(all(np.array(shapes) == 50))
        shapes = ArrayRDD(rdd, 250).map(lambda x: x.shape[0]).collect()
        assert_true(all(np.array(shapes) == 100))
        shapes = ArrayRDD(rdd, 66).map(lambda x: x.shape[0]).collect()
        assert_true(all(np.in1d(shapes, [66, 34])))
Example #19
    def test_same_prediction(self):
        X, y, Z = self.make_classification(2, 80000)

        local = SGDClassifier(average=True)
        dist = SparkSGDClassifier(average=True)

        local.fit(X, y)
        dist.fit(Z, classes=np.unique(y))

        y_local = local.predict(X)
        y_dist = dist.predict(Z[:, 'X'])

        mismatch = y_local.shape[0] - np.count_nonzero(y_dist.toarray() == y_local)
        mismatch_percent = float(mismatch) * 100 / y_local.shape[0]

        assert_true(mismatch_percent <= 1)
        assert_true(check_rdd_dtype(y_dist, (np.ndarray,)))
Example #22
    def test_same_fit_transforms(self):
        X, X_rdd = self.make_dense_rdd((1e3, 12))

        n_components = 4
        random_state = 42
        tol = 1e-7
        local = TruncatedSVD(n_components, n_iter=5, tol=tol,
                             random_state=random_state)
        dist = SparkTruncatedSVD(n_components, n_iter=50, tol=tol,
                                 random_state=random_state)

        Z_local = local.fit_transform(X)
        Z_dist = dist.fit_transform(X_rdd)
        Z_collected = Z_dist.toarray()
        assert_true(check_rdd_dtype(Z_dist, (np.ndarray,)))

        tol = 1e-1
        assert_array_equal(Z_local.shape, Z_collected.shape)
        assert(np.allclose(+Z_collected[:, 0], Z_local[:, 0], atol=tol) |
               np.allclose(-Z_collected[:, 0], Z_local[:, 0], atol=tol))
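The sign-tolerant comparison at the end is needed because singular vectors are only determined up to sign: negating a left singular vector together with its right counterpart leaves the decomposition unchanged, so a distributed solver may legitimately return the first component flipped. A quick NumPy demonstration:

    import numpy as np

    rng = np.random.RandomState(42)
    X = rng.rand(20, 5)
    U, s, Vt = np.linalg.svd(X, full_matrices=False)

    # Flip the sign of the first component on both sides.
    U_flip, Vt_flip = U.copy(), Vt.copy()
    U_flip[:, 0] *= -1
    Vt_flip[0, :] *= -1

    # The reconstruction is identical, so either sign is a valid answer.
    assert np.allclose(U @ np.diag(s) @ Vt, U_flip @ np.diag(s) @ Vt_flip)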
Example #24
    def test_same_transform_with_treshold(self):
        local = VarianceThreshold(.03)
        dist = SparkVarianceThreshold(.03)

        X_dense, X_dense_rdd = self.make_dense_rdd()
        X_sparse, X_sparse_rdd = self.make_sparse_rdd()
        Z_rdd = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

        result_local = local.fit_transform(X_dense)
        result_dist = dist.fit_transform(X_dense_rdd)
        assert_true(check_rdd_dtype(result_dist, (np.ndarray,)))
        assert_array_almost_equal(result_local, result_dist.toarray())

        result_local = local.fit_transform(X_sparse)
        result_dist = dist.fit_transform(X_sparse_rdd)
        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix,)))
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())

        result_dist = dist.fit_transform(Z_rdd)[:, 'X']
        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix,)))
        assert_array_almost_equal(result_local.toarray(),
                                  result_dist.toarray())
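VarianceThreshold(.03) keeps only the features whose per-column variance exceeds the threshold, so the distributed transformer merely has to agree with scikit-learn on which columns survive. For dense input the selection reduces to a boolean column mask, as in this small sketch:

    import numpy as np
    from sklearn.feature_selection import VarianceThreshold

    rng = np.random.RandomState(0)
    X = rng.rand(100, 6)
    X[:, 2] = 0.5                       # constant, low-variance column

    selected = VarianceThreshold(.03).fit_transform(X)
    manual = X[:, X.var(axis=0) > .03]  # keep columns above the threshold
    np.testing.assert_array_almost_equal(selected, manual)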