def test_same_prediction(self):
        """Distributed MultinomialNB should predict exactly like sklearn's."""
        X, y, Z = self.make_classification(4, 100000, nonnegative=True)

        y_local = MultinomialNB().fit(X, y).predict(X)
        y_dist = (SparkMultinomialNB()
                  .fit(Z, classes=np.unique(y))
                  .predict(Z[:, 'X']))

        # flatten the per-block predictions before comparing
        assert_array_almost_equal(y_local, np.concatenate(y_dist.collect()))
    def test_same_coefs(self):
        """Distributed logistic regression should roughly match sklearn's coefficients."""
        X, y, Z = self.make_classification(2, 10000)

        params = dict(tol=1e-4, C=10)
        local = LogisticRegression(**params)
        dist = SparkLogisticRegression(**params)

        local.fit(X, y)
        dist.fit(Z, classes=np.unique(y))

        # distributed averaging is approximate, hence the loose tolerance
        assert_array_almost_equal(local.coef_, dist.coef_, decimal=1)
    def test_same_coefs(self):
        """Distributed LinearSVC should learn nearly the same coefficients."""
        X, y, Z = self.make_classification(2, 100000)

        local_svc = LinearSVC()
        spark_svc = SparkLinearSVC()

        local_svc.fit(X, y)
        spark_svc.fit(Z, classes=np.unique(y))

        assert_array_almost_equal(local_svc.coef_, spark_svc.coef_, decimal=3)
    def test_same_coefs(self):
        """Distributed linear regression should recover sklearn's exact fit."""
        X, y, Z = self.make_regression(1, 100000)

        reference = LinearRegression().fit(X, y)
        candidate = SparkLinearRegression().fit(Z)

        assert_array_almost_equal(reference.coef_, candidate.coef_)
        assert_array_almost_equal(reference.intercept_, candidate.intercept_)
    def test_same_prediction(self):
        """Distributed MultinomialNB predictions should equal the local ones."""
        X, y, Z = self.make_classification(4, 100000, nonnegative=True)

        expected = MultinomialNB().fit(X, y).predict(X)
        predicted = (SparkMultinomialNB()
                     .fit(Z, classes=np.unique(y))
                     .predict(Z[:, 'X']))

        # the prediction RDD must hold plain ndarray blocks
        assert_true(check_rdd_dtype(predicted, (np.ndarray,)))
        assert_array_almost_equal(expected, predicted.toarray())
    def test_same_coefs(self):
        """Coefficients and intercept should agree with sklearn's fit."""
        X, y, Z = self.make_regression(1, 100000)

        local = LinearRegression()
        local.fit(X, y)

        dist = SparkLinearRegression()
        dist.fit(Z)

        for attr in ('coef_', 'intercept_'):
            assert_array_almost_equal(getattr(local, attr), getattr(dist, attr))
    def test_same_prediction(self):
        """Distributed linear regression should predict like the local model."""
        X, y, Z = self.make_regression(1, 100000)

        y_local = LinearRegression().fit(X, y).predict(X)
        y_dist = SparkLinearRegression().fit(Z).predict(Z[:, 'X'])

        assert_true(check_rdd_dtype(y_dist, (np.ndarray, )))
        assert_array_almost_equal(y_local, y_dist.toarray())
    def test_same_prediction(self):
        """Predictions from the RDD-backed regressor should match sklearn's."""
        X, y, Z = self.make_regression(1, 100000)

        local_model = LinearRegression()
        local_model.fit(X, y)
        dist_model = SparkLinearRegression()
        dist_model.fit(Z)

        y_local = local_model.predict(X)
        y_dist = dist_model.predict(Z[:, 'X'])

        assert_true(check_rdd_dtype(y_dist, (np.ndarray,)))
        assert_array_almost_equal(y_local, y_dist.toarray())
# Example #9 (scraped separator; stray vote count removed)
    def test_sparse_matrix(self):
        """block() should stack sparse rows into sparse per-partition blocks."""
        n_partitions = 10
        n_samples = 100
        row = sp.csr_matrix([[0, 0, 1, 0, 1]])
        rdd = self.sc.parallelize([row for _ in range(n_samples)],
                                  n_partitions)
        blocked = block(rdd)
        first = blocked.first()
        assert_true(sp.issparse(first))

        # each partition holds 100 / 10 == 10 rows
        expected = sp.vstack([row] * 10)
        assert_array_almost_equal(expected.toarray(), first.toarray())
    def test_same_prediction(self):
        """Fitted Gaussian NB statistics should roughly match sklearn's."""
        X, y, Z = self.make_classification(2, 800000, nonnegative=True)

        local_model = GaussianNB().fit(X, y)
        dist_model = SparkGaussianNB().fit(Z, classes=np.unique(y))

        # TODO: investigate the variance further!
        assert_array_almost_equal(local_model.sigma_, dist_model.sigma_, 2)
        assert_array_almost_equal(local_model.theta_, dist_model.theta_, 6)
# Example #11 (scraped separator; stray vote count removed)
    def test_block_rdd_dict(self):
        """block() on an RDD of dicts should yield blocks of at most `bsize` rows.

        The first block's 'a' column must be 0..4 and its 'b' column the
        squares of those values.
        """
        n_partitions = 3
        n_samples = 57
        dicts = [{'a': i, 'b': float(i)**2} for i in range(n_samples)]
        data = self.sc.parallelize(dicts, n_partitions)

        block_data_5 = block(data, bsize=5)
        blocks = block_data_5.collect()
        assert_true(all(len(b) <= 5 for b in blocks))
        assert_array_almost_equal(blocks[0][0], np.arange(5))
        # np.float was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin float is the documented equivalent (float64).
        assert_array_almost_equal(blocks[0][1],
                                  np.arange(5, dtype=float)**2)
 def _test_func_on_axis(self, func, toarray=True):
     """Compare aggregate `func` between a sparse matrix and its RDD twin."""
     X, X_rdd = self.make_sparse_rdd(block_size=100)
     # full reduction first
     assert_almost_equal(getattr(X_rdd, func)(), getattr(X, func)())
     for axis in (0, 1):
         rdd_result = getattr(X_rdd, func)(axis=axis)
         local_result = getattr(X, func)(axis=axis)
         if toarray:
             # sparse results must be densified before comparison
             rdd_result = rdd_result.toarray()
             local_result = local_result.toarray()
         assert_array_almost_equal(rdd_result, local_result)
# Example #13 (scraped separator; stray vote count removed)
 def _test_func_on_axis(self, func, toarray=True):
     """Check that `func` on the RDD matrix agrees with scipy, per axis."""
     X, X_rdd = self.make_sparse_rdd(block_size=100)

     # reduction over the whole matrix
     assert_almost_equal(getattr(X_rdd, func)(), getattr(X, func)())

     for ax in range(2):
         result = getattr(X_rdd, func)(axis=ax)
         expected = getattr(X, func)(axis=ax)
         if toarray:
             assert_array_almost_equal(result.toarray(), expected.toarray())
         else:
             assert_array_almost_equal(result, expected)
# Example #14 (scraped separator; stray vote count removed)
    def test_block_rdd_dict(self):
        """Blocking an RDD of dicts must respect bsize and column values.

        Asserts that no block exceeds five rows and that the first block's
        columns are 0..4 and their squares.
        """
        n_partitions = 3
        n_samples = 57
        dicts = [{'a': i, 'b': float(i) ** 2} for i in range(n_samples)]
        data = self.sc.parallelize(dicts, n_partitions)

        block_data_5 = block(data, bsize=5)
        blocks = block_data_5.collect()
        assert_true(all(len(b) <= 5 for b in blocks))
        assert_array_almost_equal(blocks[0][0], np.arange(5))
        # np.float was removed in NumPy 1.24; builtin float (float64) is
        # the drop-in replacement.
        assert_array_almost_equal(blocks[0][1],
                                  np.arange(5, dtype=float) ** 2)
# Example #15 (scraped separator; stray vote count removed)
    def test_sparse_matrix(self):
        """Blocks built from sparse rows should themselves stay sparse."""
        n_partitions = 10
        n_samples = 100
        sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
        rows = [sparse_row for _ in range(n_samples)]
        blocked_data = block(self.sc.parallelize(rows, n_partitions))

        first_block = blocked_data.first()
        assert_true(sp.issparse(first_block))
        # 100 samples over 10 partitions -> 10 rows per block
        assert_array_almost_equal(sp.vstack([sparse_row] * 10).toarray(),
                                  first_block.toarray())
    def _test_func_on_axis(self, func):
        """Dense-RDD aggregate `func` should agree with numpy on every axis."""
        X, X_rdd = self.make_dense_rdd(block_size=100)
        rdd_func, np_func = getattr(X_rdd, func), getattr(X, func)
        assert_almost_equal(rdd_func(), np_func())
        for ax in (0, 1):
            assert_array_almost_equal(rdd_func(axis=ax), np_func(axis=ax))

        # repeat the check on a 3-dimensional array
        X, X_rdd = self.make_dense_rdd((100, 3, 2))
        rdd_func, np_func = getattr(X_rdd, func), getattr(X, func)
        assert_almost_equal(rdd_func(), np_func())
        for ax in (0, 1, 2):
            assert_array_almost_equal(rdd_func(axis=ax), np_func(axis=ax))
# Example #17 (scraped separator; stray vote count removed)
    def test_same_transform_result(self):
        """Distributed TF-IDF must reproduce sklearn's transform output."""
        X, y, Z_rdd = self.make_classification(4, 1000, -1)
        X_rdd = Z_rdd[:, 'X']

        Z_local = TfidfTransformer().fit_transform(X)
        Z_dist = SparkTfidfTransformer().fit_transform(X_rdd)

        # result RDD must consist of scipy sparse blocks
        assert_true(check_rdd_dtype(Z_dist, sp.spmatrix))
        assert_array_almost_equal(Z_local.toarray(), Z_dist.toarray())
# Example #18 (scraped separator; stray vote count removed)
    def test_same_transform_result(self):
        """Stacked distributed TF-IDF blocks should equal the local result."""
        X, y, Z_rdd = self.make_classification(4, 1000, None)
        X_rdd = Z_rdd[:, 'X']

        local = TfidfTransformer()
        dist = SparkTfidfTransformer()

        expected = local.fit_transform(X)
        # collect the per-block sparse results and stack them back together
        result = sp.vstack(dist.fit_transform(X_rdd).collect())

        assert_array_almost_equal(expected.toarray(), result.toarray())
    def test_same_centroids(self):
        """Seeded k-means should find (almost) the same centers distributed."""
        X, y, X_rdd = self.make_blobs(centers=4, n_samples=200000)

        kwargs = dict(n_clusters=4, init='k-means++', random_state=42)
        local = KMeans(**kwargs)
        dist = SparkKMeans(**kwargs)

        local.fit(X)
        dist.fit(X_rdd)

        # sort columns so that center ordering does not matter
        local_centers = np.sort(local.cluster_centers_, axis=0)
        dist_centers = np.sort(dist.cluster_centers_, axis=0)

        assert_array_almost_equal(local_centers, dist_centers, decimal=4)
    def test_same_centroids(self):
        """Cluster centers from SparkKMeans should match sklearn's KMeans."""
        X, y, X_rdd = self.make_blobs(centers=4, n_samples=200000)

        local = KMeans(n_clusters=4, init='k-means++', random_state=42)
        dist = SparkKMeans(n_clusters=4, init='k-means++', random_state=42)

        local.fit(X)
        dist.fit(X_rdd)

        # sorting removes any dependence on center ordering
        centers = [np.sort(m.cluster_centers_, axis=0) for m in (local, dist)]

        assert_array_almost_equal(centers[0], centers[1], decimal=4)
    def test_same_transform_result(self):
        """TF-IDF computed block-wise must equal sklearn's result."""
        X, y, Z_rdd = self.make_classification(4, 1000, -1)
        X_rdd = Z_rdd[:, 'X']

        local_result = TfidfTransformer().fit_transform(X)
        dist_result = SparkTfidfTransformer().fit_transform(X_rdd)

        assert_true(check_rdd_dtype(dist_result, sp.spmatrix))
        assert_array_almost_equal(local_result.toarray(),
                                  dist_result.toarray())
# Example #22 (scraped separator; stray vote count removed)
    def _test_func_on_axis(self, func):
        """Compare `func` between dense RDD matrices and numpy on all axes."""
        for shape in (None, (100, 3, 2)):
            if shape is None:
                # 2-D case, built from the default shape in 100-row blocks
                X, X_rdd = self.make_dense_rdd(block_size=100)
                axes = (0, 1)
            else:
                # 3-D case
                X, X_rdd = self.make_dense_rdd(shape)
                axes = (0, 1, 2)
            assert_almost_equal(getattr(X_rdd, func)(), getattr(X, func)())
            for ax in axes:
                assert_array_almost_equal(getattr(X_rdd, func)(axis=ax),
                                          getattr(X, func)(axis=ax))
 def test_svd(self):
     """Rank-1 distributed SVD should agree with scipy's dense SVD up to sign."""
     X, X_rdd = self.make_dense_rdd()
     u_dist, s_dist, v_dist = svd(X_rdd, 1)
     # collect the blocked left singular vector into one flat array
     u_vec = np.squeeze(np.concatenate(np.array(u_dist.collect()))).T
     u_ref, s_ref, v_ref = ln.svd(X)
     assert_array_almost_equal(v_dist[0], match_sign(v_dist[0], v_ref[0, :]))
     assert_array_almost_equal(s_dist[0], s_ref[0])
     assert_array_almost_equal(u_vec, match_sign(u_vec, u_ref[:, 0]))
# Example #24 (scraped separator; stray vote count removed)
 def test_svd(self):
     """The leading singular triplet from svd() should match scipy's."""
     X, X_rdd = self.make_dense_rdd()
     u, s, v = svd(X_rdd, 1)
     # flatten the distributed left singular vector
     u = np.squeeze(np.concatenate(np.array(u.collect()))).T
     expected_u, expected_s, expected_v = ln.svd(X)
     # singular vectors are only defined up to sign, hence match_sign
     assert_array_almost_equal(v[0], match_sign(v[0], expected_v[0, :]))
     assert_array_almost_equal(s[0], expected_s[0])
     assert_array_almost_equal(u, match_sign(u, expected_u[:, 0]))
# Example #25 (scraped separator; stray vote count removed)
    def test_block_rdd_tuple(self):
        """block() on an RDD of tuples should block each field separately.

        Each 10-sample partition becomes one (dense block, label block,
        sparse block) tuple; totals over all blocks must equal n_samples.
        """
        n_partitions = 10
        n_samples = 100
        sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
        data = self.sc.parallelize([(np.array([1., 2.]), 0, sparse_row)
                                    for i in range(n_samples)], n_partitions)
        blocked_data = block(data)

        expected_first_block = np.array([[1., 2.]] * 10)
        # np.int was removed in NumPy 1.24; builtin int is the documented
        # drop-in replacement.
        expected_second_block = np.zeros(10, dtype=int)
        expected_third_block = sp.vstack([sparse_row] * 10)

        first_block_tuple = blocked_data.first()
        assert_array_almost_equal(expected_first_block, first_block_tuple[0])
        assert_array_almost_equal(expected_second_block, first_block_tuple[1])
        assert_array_almost_equal(expected_third_block.toarray(),
                                  first_block_tuple[2].toarray())

        tuple_blocks = blocked_data.collect()
        assert_equal(len(tuple_blocks), n_partitions)
        assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples)
        assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples)
# Example #26 (scraped separator; stray vote count removed)
    def test_block_rdd_tuple(self):
        """Blocking tuple records must preserve per-field values and counts.

        Verifies the first block's dense, integer, and sparse components
        and that the block row counts sum to n_samples.
        """
        n_partitions = 10
        n_samples = 100
        sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
        data = self.sc.parallelize(
            [(np.array([1., 2.]), 0, sparse_row) for i in range(n_samples)],
            n_partitions)
        blocked_data = block(data)

        expected_first_block = np.array([[1., 2.]] * 10)
        # np.int was removed in NumPy 1.24; use the builtin int instead.
        expected_second_block = np.zeros(10, dtype=int)
        expected_third_block = sp.vstack([sparse_row] * 10)

        first_block_tuple = blocked_data.first()
        assert_array_almost_equal(expected_first_block, first_block_tuple[0])
        assert_array_almost_equal(expected_second_block, first_block_tuple[1])
        assert_array_almost_equal(expected_third_block.toarray(),
                                  first_block_tuple[2].toarray())

        tuple_blocks = blocked_data.collect()
        assert_equal(len(tuple_blocks), n_partitions)
        assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples)
        assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples)
# Example #27 (scraped separator; stray vote count removed)
 def test_dot(self):
     """Sparse RDD-matrix dot products should match scipy in both orders."""
     A, A_rdd = self.make_sparse_rdd((20, 10))
     B, B_rdd = self.make_sparse_rdd((10, 20))
     for rdd_lhs, lhs, rhs in ((A_rdd, A, B), (B_rdd, B, A)):
         assert_array_almost_equal(rdd_lhs.dot(rhs).toarray(),
                                   lhs.dot(rhs).toarray())
 def test_dot(self):
     """A.dot(B) and B.dot(A) on sparse RDD matrices should match scipy."""
     left, left_rdd = self.make_sparse_rdd((20, 10))
     right, right_rdd = self.make_sparse_rdd((10, 20))
     assert_array_almost_equal(left_rdd.dot(right).toarray(),
                               left.dot(right).toarray())
     assert_array_almost_equal(right_rdd.dot(left).toarray(),
                               right.dot(left).toarray())
# Example #29 (scraped separator; stray vote count removed)
 def test_dot(self):
     """Dense ArrayRDD dot with a local matrix should equal numpy's dot."""
     lhs = np.arange(200).reshape(20, 10)
     rhs = np.arange(200).reshape(10, 20)
     lhs_rdd = ArrayRDD(self.sc.parallelize(lhs))
     assert_array_almost_equal(unpack(lhs_rdd.dot(rhs)), lhs.dot(rhs))
# Example #30 (scraped separator; stray vote count removed)
 def test_dot_sparse(self):
     """Sparse ArrayRDD dot with a local sparse matrix should match scipy."""
     lhs, lhs_rdd = self.generate_sparse_dataset(shape=(10, 20))
     rhs = sp.rand(20, 10, random_state=2, density=0.1)
     expected = lhs.dot(rhs).toarray()
     assert_array_almost_equal(unpack(lhs_rdd.dot(rhs)).toarray(), expected)
# Example #31 (scraped separator; stray vote count removed)
 def test_mean_sparse(self):
     """ArrayRDD.mean should agree with the sparse matrix's mean on every axis."""
     data, rdd = self.generate_sparse_dataset()
     assert_almost_equal(ArrayRDD(rdd).mean(), data.mean())
     for ax in (0, 1):
         assert_array_almost_equal(ArrayRDD(rdd).mean(axis=ax),
                                   data.mean(axis=ax))