コード例 #1
0
    def test_scaler_1d(self):
        """Test scaling of dataset along single axis"""
        rng = np.random.RandomState(0)
        X = rng.randn(5)
        X_orig_copy = X.copy()

        scaler = StandardScaler()
        X_scaled = scaler.fit(X).transform(X, copy=False)
        assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
        assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

        # check inverse transform
        X_scaled_back = scaler.inverse_transform(X_scaled)
        assert_array_almost_equal(X_scaled_back, X_orig_copy)

        # Test with 1D list
        X = [0., 1., 2, 0.4, 1.]
        scaler = StandardScaler()
        X_scaled = scaler.fit(X).transform(X, copy=False)
        assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
        assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

        X_scaled = scale(X)
        assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
        assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

        # Test with sparse list
        X = scipy.sparse.coo_matrix((np.random.random((10,)),
                                     ([i**2 for i in range(10)],
                                      [0 for i in range(10)])))
        X = X.tocsr()
        scaler = StandardScaler()
        X_scaled = scaler.fit(X).transform(X, copy=False)

        self.assertFalse(np.any(np.isnan(X_scaled.data)))
        self.assertAlmostEqual(X_scaled.mean(), 0)
        self.assertAlmostEqual(np.sqrt(X_scaled.data.var()), 1)

        # Check that X has not been copied
        # self.assertTrue(X_scaled is X)
        # Check that the matrix is still sparse
        self.assertEqual(len(X.indices), 10)
コード例 #2
0
    def test_scaler_2d_arrays(self):
        """Test scaling of 2d array along first axis"""
        rng = np.random.RandomState(0)
        X = rng.randn(4, 5)
        X[:, 0] = 0.0  # first feature is always of zero

        scaler = StandardScaler()
        X_scaled = scaler.fit(X).transform(X, copy=True)
        self.assertFalse(np.any(np.isnan(X_scaled)))

        assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
        assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
        # Check that X has been copied
        self.assertTrue(X_scaled is not X)

        # check inverse transform
        X_scaled_back = scaler.inverse_transform(X_scaled)
        self.assertTrue(X_scaled_back is not X)
        self.assertTrue(X_scaled_back is not X_scaled)
        assert_array_almost_equal(X_scaled_back, X)

        X_scaled = scale(X, axis=1, with_std=False)
        self.assertFalse(np.any(np.isnan(X_scaled)))
        assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
        X_scaled = scale(X, axis=1, with_std=True)
        self.assertFalse(np.any(np.isnan(X_scaled)))
        assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
        assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
        # Check that the data hasn't been modified
        self.assertTrue(X_scaled is not X)

        X_scaled = scaler.fit(X).transform(X, copy=False)
        self.assertFalse(np.any(np.isnan(X_scaled)))
        assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
        assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
        # Check that X has not been copied
        self.assertTrue(X_scaled is X)

        X = rng.randn(4, 5)
        X[:, 0] = 1.0  # first feature is a constant, non zero feature
        scaler = StandardScaler()
        X_scaled = scaler.fit(X).transform(X, copy=True)
        self.assertFalse(np.any(np.isnan(X_scaled)))
        assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
        assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
        # Check that X has not been copied
        self.assertTrue(X_scaled is not X)

        # Same thing for sparse matrices...
        X = scipy.sparse.coo_matrix((np.random.random((12,)),
                                     ([i for i in range(12)],
                                      [int(i / 3) for i in range(12)])))
        X = X.tocsr()
        scaler = StandardScaler()
        X_scaled = scaler.fit(X).transform(X, copy=False)

        self.assertFalse(np.any(np.isnan(X_scaled.data)))
        assert_array_almost_equal(
            [X_scaled.data[X_scaled.indptr[i]:X_scaled.indptr[i + 1]].mean()
             for i in range(X_scaled.shape[1])],
                                  np.zeros((4, ), dtype=np.float64))
        assert_array_almost_equal(np.sqrt([
            X_scaled.data[X_scaled.indptr[i]:X_scaled.indptr[i + 1]].var()
            for i in range(X_scaled.shape[1])]),
                                  np.ones((4, ), dtype=np.float64))

        # Because we change the sparse format to csc, we cannot assert that
        # the matrix did not change!
        # self.assertTrue(X_scaled is X)
        # Check that the matrix is still sparse
        self.assertEqual(len(X.indices), 12)