def test_scaler_1d():
    """Scaling a 1d dataset along a single axis yields zero mean / unit std."""
    rng = np.random.RandomState(0)
    data = rng.randn(5)
    untouched = data.copy()
    scaler = StandardScaler()
    scaled = scaler.fit(data).transform(data, copy=False)
    assert_array_almost_equal(scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(scaled.std(axis=0), 1.0)

    # round-tripping through inverse_transform recovers the original values
    restored = scaler.inverse_transform(scaled)
    assert_array_almost_equal(restored, untouched)

    # same checks starting from a plain 1D Python list
    data = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    scaled = scaler.fit(data).transform(data, copy=False)
    assert_array_almost_equal(scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(scaled.std(axis=0), 1.0)

    # the scale() convenience function must behave like the estimator
    scaled = scale(data)
    assert_array_almost_equal(scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(scaled.std(axis=0), 1.0)

    # constant input with with_mean=False must come back unchanged
    constant = np.ones(5)
    assert_array_equal(scale(constant, with_mean=False), constant)
def test_scale_function_without_centering():
    """scale(..., with_mean=False) must agree across dense, CSR and CSC input."""
    rng = np.random.RandomState(42)
    dense = rng.randn(4, 5)
    dense[:, 0] = 0.0  # first feature is identically zero
    csr = sparse.csr_matrix(dense)

    dense_scaled = scale(dense, with_mean=False)
    assert_false(np.any(np.isnan(dense_scaled)))

    csr_scaled = scale(csr, with_mean=False)
    assert_false(np.any(np.isnan(csr_scaled.data)))

    # CSC input must produce the same result as dense input
    csc_scaled = scale(csr.tocsc(), with_mean=False)
    assert_array_almost_equal(dense_scaled, csc_scaled.toarray())

    # scaling a sparse matrix along axis != 0 is not supported
    assert_raises(ValueError, scale, csr, with_mean=False, axis=1)

    assert_array_almost_equal(dense_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(dense_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    # the input array itself must not have been modified in place
    assert_true(dense_scaled is not dense)

    # sparse result statistics must match the dense ones
    csr_mean, csr_std = mean_variance_axis0(csr_scaled)
    assert_array_almost_equal(csr_mean, dense_scaled.mean(axis=0))
    assert_array_almost_equal(csr_std, dense_scaled.std(axis=0))
def _bootstrap_pool(X, Y, X_saliences, Y_saliences, n_components, procrustes, algorithm, boot_i):
    """Run one bootstrap iteration of the PLS salience estimation.

    Designed for parallel execution with ``multiprocessing.Pool``: each call
    resamples the rows of ``X``/``Y`` with replacement, recomputes the SVD of
    the cross-covariance, and rotates the bootstrap saliences onto the
    reference saliences.

    Parameters
    ----------
    X, Y : 2d arrays with the same number of rows (samples).
    X_saliences, Y_saliences : reference saliences to rotate onto.
    n_components : number of SVD components.
    procrustes : unused here; kept for signature compatibility with callers.
    algorithm : passed through to ``TruncatedSVD``.
    boot_i : bootstrap index, mixed into the RNG seed.

    Returns
    -------
    (X_bootstraps, Y_bootstraps) : rotated bootstrap saliences.
    """
    # Re-seed per call: Pool workers inherit the parent's RNG state, so
    # without this every worker would draw the same resample. Mixing in
    # boot_i keeps simultaneous workers distinct.
    np.random.seed(int(time()) + boot_i)

    # Resample rows with replacement, sample of the same size.
    # (Pass the int directly instead of materializing range(n).)
    n_samples = X.shape[0]
    sample_indices = np.random.choice(n_samples, size=n_samples, replace=True)

    X_boot_scaled = scale(X[sample_indices, :])
    Y_boot_scaled = scale(Y[sample_indices, :])
    covariance_boot = np.dot(Y_boot_scaled.T, X_boot_scaled)

    # NOTE: relies on the private sklearn API TruncatedSVD._fit, which
    # returns (U, Sigma, VT) — fragile across sklearn versions.
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_boot, _, X_saliences_boot = svd._fit(covariance_boot)
    X_saliences_boot = X_saliences_boot.T

    # It does not matter which side is used to compute the Procrustes
    # rotation; pick the smaller one for speed and apply the same rotation
    # to the other side.
    if len(X_saliences_boot) > len(Y_saliences_boot):
        Y_bootstraps, rotation_matrix = _procrustes_rotation(Y_saliences, Y_saliences_boot)
        X_bootstraps = np.dot(X_saliences_boot, rotation_matrix)
    else:
        X_bootstraps, rotation_matrix = _procrustes_rotation(X_saliences, X_saliences_boot)
        Y_bootstraps = np.dot(Y_saliences_boot, rotation_matrix)

    return X_bootstraps, Y_bootstraps
def test_scaler_1d():
    # Scaling of a dataset along a single axis.
    rng = np.random.RandomState(0)
    sample = rng.randn(5)
    backup = sample.copy()
    std_scaler = StandardScaler()
    standardized = std_scaler.fit(sample).transform(sample, copy=False)
    assert_array_almost_equal(standardized.mean(axis=0), 0.0)
    assert_array_almost_equal(standardized.std(axis=0), 1.0)
    # inverse transform must restore the original data
    assert_array_almost_equal(std_scaler.inverse_transform(standardized), backup)

    # repeat with a 1D Python list as input
    sample = [0., 1., 2, 0.4, 1.]
    std_scaler = StandardScaler()
    standardized = std_scaler.fit(sample).transform(sample, copy=False)
    assert_array_almost_equal(standardized.mean(axis=0), 0.0)
    assert_array_almost_equal(standardized.std(axis=0), 1.0)

    # the functional API must standardize the same way
    standardized = scale(sample)
    assert_array_almost_equal(standardized.mean(axis=0), 0.0)
    assert_array_almost_equal(standardized.std(axis=0), 1.0)

    # all-ones input is returned untouched when centering is disabled
    ones = np.ones(5)
    assert_array_equal(scale(ones, with_mean=False), ones)
def store_anm(progress_counter, list_data):
    """Fit a GP and an ANM model on each dataset file and append the results
    to module-level result lists.

    Parameters
    ----------
    progress_counter : int
        Starting value for the progress display (percentage of datasets done).
    list_data : list of str
        Dataset identifiers; each maps to ``datasets/<id>.txt``.

    Side effects
    ------------
    Appends to ``anm_result``, ``y_predict_result``, ``y_predict_std`` and
    ``y_predict_cov`` — presumably module-level lists; verify they are
    initialized before calling (TODO confirm).
    """
    # For loop for calculating the result
    for data_num in list_data:
        file_name = "datasets/" + data_num + ".txt"
        # progress is reported before incrementing, so the first file prints 
        # the starting percentage
        print(file_name + " In Progress (" + str(
            float("{0:.2f}".format(progress_counter / len(list_data) * 100))) + " % Done)")
        progress_counter += 1
        # whitespace-delimited two-column file: x then y
        df = pd.read_csv(file_name, delim_whitespace=True, header=None)
        df.columns = ["x", "y"]
        # standardize each variable and reshape to a single-feature column
        x = sk.scale(df['x'].tolist()).reshape((-1, 1))
        y = sk.scale(df['y'].tolist()).reshape((-1, 1))
        gp = sk_gp.GaussianProcessRegressor().fit(x, y)
        anm = anm_store.ANM_store()
        # ANM causal-direction probability on the raw (unscaled) columns
        anm_result.append(anm.predict_proba(df['x'].tolist(), df['y'].tolist()))
        # indepscoreX_result.append(anm.anm_score(sk.scale(df['x'].tolist()).reshape((-1, 1)), sk.scale(df['y'].tolist()).reshape((-1, 1)))) # x -> y direction
        # indepscoreY_result.append(anm.anm_score(sk.scale(df['y'].tolist()).reshape((-1, 1)),
        # sk.scale(df['x'].tolist()).reshape((-1, 1)))) # y -> x direction
        # NOTE(review): the GP was fitted as x -> y, but predictions are also
        # collected with y as the input — looks intentional (both directions
        # stored pairwise), but confirm against the consumer of these lists.
        y_predict_result.append(y_predict_calculator(x, gp))
        y_predict_result.append(y_predict_calculator(y, gp))
        # third/fourth positional flags presumably select std-dev vs
        # covariance output of y_predict_calculator — TODO confirm
        y_predict_std.append(y_predict_calculator(x, gp, True, False))
        y_predict_std.append(y_predict_calculator(y, gp, True, False))
        y_predict_cov.append(y_predict_calculator(x, gp, False, True))
        y_predict_cov.append(y_predict_calculator(y, gp, False, True))
def test_scaler_2d_arrays():
    # Scaling of a 2d array along the first axis.
    rng = np.random.RandomState(0)
    data = rng.randn(4, 5)
    data[:, 0] = 0.0  # first feature is identically zero
    scaler = StandardScaler()
    result = scaler.fit(data).transform(data, copy=True)
    assert_false(np.any(np.isnan(result)))
    assert_array_almost_equal(result.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(result.std(axis=0), [0., 1., 1., 1., 1.])
    # copy=True must hand back a new array
    assert_true(result is not data)

    # inverse transform: fresh array carrying the original values
    restored = scaler.inverse_transform(result)
    assert_true(restored is not data)
    assert_true(restored is not result)
    assert_array_almost_equal(restored, data)

    # row-wise centering only
    result = scale(data, axis=1, with_std=False)
    assert_false(np.any(np.isnan(result)))
    assert_array_almost_equal(result.mean(axis=1), 4 * [0.0])

    # row-wise centering and scaling
    result = scale(data, axis=1, with_std=True)
    assert_false(np.any(np.isnan(result)))
    assert_array_almost_equal(result.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(result.std(axis=1), 4 * [1.0])
    # the input must not have been modified
    assert_true(result is not data)

    # copy=False scales in place and returns the very same array
    result = scaler.fit(data).transform(data, copy=False)
    assert_false(np.any(np.isnan(result)))
    assert_array_almost_equal(result.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(result.std(axis=0), [0., 1., 1., 1., 1.])
    assert_true(result is data)

    # a constant, non-zero feature must also end up with zero mean / zero std
    data = rng.randn(4, 5)
    data[:, 0] = 1.0
    scaler = StandardScaler()
    result = scaler.fit(data).transform(data, copy=True)
    assert_false(np.any(np.isnan(result)))
    assert_array_almost_equal(result.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(result.std(axis=0), [0., 1., 1., 1., 1.])
    assert_true(result is not data)
def _boostrap(X, Y, X_saliences, Y_saliences, X_saliences_bootstraps,
              Y_saliences_bootstraps, bootstrap_i, n_components,
              algorithm="randomized"):
    """Perform one bootstrap resample and write the Procrustes-rotated
    saliences into slice ``bootstrap_i`` of the preallocated output arrays.
    """
    # draw a same-size sample of row indices, with replacement
    rows = np.random.choice(list(range(X.shape[0])), size=X.shape[0],
                            replace=True)
    scaled_X = scale(X[rows, :])
    scaled_Y = scale(Y[rows, :])

    # SVD of the bootstrap cross-covariance (uses sklearn's private _fit,
    # which returns U, Sigma, VT)
    boot_cov = np.dot(scaled_Y.T, scaled_X)
    decomposer = TruncatedSVD(n_components, algorithm=algorithm)
    Y_boot, _, X_boot = decomposer._fit(boot_cov)
    X_boot = X_boot.T

    # Either side yields the same rotated singular values; compute the
    # rotation on the smaller side and apply it to the other.
    if len(X_boot) > len(Y_boot):
        Y_saliences_bootstraps[:, :, bootstrap_i], rotation = \
            _procrustes_rotation(Y_saliences, Y_boot)
        X_saliences_bootstraps[:, :, bootstrap_i] = np.dot(X_boot, rotation)
    else:
        X_saliences_bootstraps[:, :, bootstrap_i], rotation = \
            _procrustes_rotation(X_saliences, X_boot)
        Y_saliences_bootstraps[:, :, bootstrap_i] = np.dot(Y_boot, rotation)
def test_scaler_1d(self):
    """Scaling of a dataset along a single axis, for dense and sparse input."""
    rng = np.random.RandomState(0)
    vec = rng.randn(5)
    vec_backup = vec.copy()
    scaler = StandardScaler()
    scaled = scaler.fit(vec).transform(vec, copy=False)
    assert_array_almost_equal(scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(scaled.std(axis=0), 1.0)
    # inverse transform must recover the original values
    assert_array_almost_equal(scaler.inverse_transform(scaled), vec_backup)

    # same checks with a 1D Python list
    vec = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    scaled = scaler.fit(vec).transform(vec, copy=False)
    assert_array_almost_equal(scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(scaled.std(axis=0), 1.0)

    # functional API
    scaled = scale(vec)
    assert_array_almost_equal(scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(scaled.std(axis=0), 1.0)

    # sparse input: a single column with 10 nonzero entries
    vec = scipy.sparse.coo_matrix((np.random.random((10,)),
                                   ([i ** 2 for i in range(10)],
                                    [0 for i in range(10)])))
    vec = vec.tocsr()
    scaler = StandardScaler()
    scaled = scaler.fit(vec).transform(vec, copy=False)
    self.assertFalse(np.any(np.isnan(scaled.data)))
    self.assertAlmostEqual(scaled.mean(), 0)
    self.assertAlmostEqual(np.sqrt(scaled.data.var()), 1)
    # Check that X has not been copied
    # self.assertTrue(scaled is vec)
    # the matrix must still be sparse with all 10 entries intact
    self.assertEqual(len(vec.indices), 10)
def test_standard_scaler_numerical_stability():
    """Numerical stability of scale() on near-constant and extreme inputs."""
    # np.log(1e-5) is chosen because its floating point representation was
    # empirically found to cause numerical problems with np.mean & np.std.
    vals = np.zeros(8, dtype=np.float64) + np.log(1e-5, dtype=np.float64)
    if LooseVersion(np.__version__) >= LooseVersion('1.9'):
        # On recent numpy, 8 samples are too few to trigger the problem,
        # so no warning is expected.
        vals_scaled = assert_no_warnings(scale, vals)
        assert_array_almost_equal(scale(vals), np.zeros(8))
    else:
        msg = "standard deviation of the data is probably very close to 0"
        vals_scaled = assert_warns_message(UserWarning, msg, scale, vals)
        assert_array_almost_equal(vals_scaled, np.zeros(8))

    # with 2 more samples, the std computation runs into numerical issues
    vals = np.zeros(10, dtype=np.float64) + np.log(1e-5, dtype=np.float64)
    msg = "standard deviation of the data is probably very close to 0"
    vals_scaled = assert_warns_message(UserWarning, msg, scale, vals)
    assert_array_almost_equal(vals_scaled, np.zeros(10))

    # tiny values scale cleanly, without warnings
    tiny = np.ones(10, dtype=np.float64) * 1e-100
    tiny_scaled = assert_no_warnings(scale, tiny)
    assert_array_almost_equal(tiny_scaled, np.zeros(10))

    # Large values can cause (often recoverable) numerical stability issues:
    huge = np.ones(10, dtype=np.float64) * 1e100
    msg = "Dataset may contain too large values"
    huge_scaled = assert_warns_message(UserWarning, msg, scale, huge)
    assert_array_almost_equal(huge_scaled, np.zeros(10))
    assert_array_almost_equal(huge_scaled, tiny_scaled)

    huge_centered = assert_warns_message(UserWarning, msg, scale, huge,
                                         with_std=False)
    assert_array_almost_equal(huge_centered, np.zeros(10))
    assert_array_almost_equal(huge_centered, tiny_scaled)
def pca_handler(data):
    """Label-encode, cast to float, standardize and PCA-fit a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns are label-encoded column by column, so any dtype is accepted.

    Returns
    -------
    sklearn.decomposition.PCA
        The PCA estimator fitted on the standardized, encoded data.
    """
    # encode each column's values as integer labels, then work in float64
    data = data.apply(preprocessing.LabelEncoder().fit_transform)
    data = data.astype('float64')
    # NOTE(review): the original code also ran KMeans(n_clusters=5).fit(data)
    # here and discarded the result — dead, expensive computation removed.
    pca_dataset = PCA().fit(scale(data))
    return pca_dataset
def test_scaler_2d_arrays(self):
    """Test scaling of 2d array along first axis, for dense and sparse input."""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always of zero
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    # the constant column keeps std 0; the others are scaled to std 1
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied (copy=True must return a new array)
    self.assertTrue(X_scaled is not X)
    # check inverse transform: a fresh array carrying the original values
    X_scaled_back = scaler.inverse_transform(X_scaled)
    self.assertTrue(X_scaled_back is not X)
    self.assertTrue(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)
    # row-wise centering only
    X_scaled = scale(X, axis=1, with_std=False)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    # row-wise centering and scaling
    X_scaled = scale(X, axis=1, with_std=True)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    self.assertTrue(X_scaled is not X)
    # copy=False must scale in place and return the very same array
    X_scaled = scaler.fit(X).transform(X, copy=False)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    self.assertTrue(X_scaled is X)
    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    self.assertTrue(X_scaled is not X)
    # Same thing for sparse matrices...
    # 12 nonzeros spread over 4 columns (3 entries per column)
    X = scipy.sparse.coo_matrix((np.random.random((12,)),
                                 ([i for i in range(12)],
                                  [int(i / 3) for i in range(12)])))
    X = X.tocsr()
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    self.assertFalse(np.any(np.isnan(X_scaled.data)))
    # per-column mean of the stored values is 0 after scaling
    # NOTE(review): indexing indptr per column assumes X_scaled is CSC here
    # (indptr over columns) — confirm against the scaler implementation
    assert_array_almost_equal(
        [X_scaled.data[X_scaled.indptr[i]:X_scaled.indptr[i + 1]].mean()
         for i in range(X_scaled.shape[1])],
        np.zeros((4, ), dtype=np.float64))
    # per-column std of the stored values is 1 after scaling
    assert_array_almost_equal(np.sqrt([
        X_scaled.data[X_scaled.indptr[i]:X_scaled.indptr[i + 1]].var()
        for i in range(X_scaled.shape[1])]),
        np.ones((4, ), dtype=np.float64))
    # Because we change the sparse format to csc, we cannot assert that
    # the matrix did not change!
    # self.assertTrue(X_scaled is X)
    # Check that the matrix is still sparse
    self.assertEqual(len(X.indices), 12)
def test_scaler_2d_arrays(self):
    """Test scaling of 2d array along first axis, for dense and sparse input."""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always of zero
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    # the constant column keeps std 0; the others are scaled to std 1
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied (copy=True must return a new array)
    self.assertTrue(X_scaled is not X)
    # check inverse transform: a fresh array carrying the original values
    X_scaled_back = scaler.inverse_transform(X_scaled)
    self.assertTrue(X_scaled_back is not X)
    self.assertTrue(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)
    # row-wise centering only
    X_scaled = scale(X, axis=1, with_std=False)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    # row-wise centering and scaling
    X_scaled = scale(X, axis=1, with_std=True)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    self.assertTrue(X_scaled is not X)
    # copy=False must scale in place and return the very same array
    X_scaled = scaler.fit(X).transform(X, copy=False)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    self.assertTrue(X_scaled is X)
    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    self.assertTrue(X_scaled is not X)
    # Same thing for sparse matrices...
    # 12 nonzeros spread over 4 columns (3 entries per column)
    X = scipy.sparse.coo_matrix((np.random.random(
        (12, )), ([i for i in range(12)], [int(i / 3) for i in range(12)])))
    X = X.tocsr()
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    self.assertFalse(np.any(np.isnan(X_scaled.data)))
    # per-column mean of the stored values is 0 after scaling
    # NOTE(review): indexing indptr per column assumes X_scaled is CSC here
    # (indptr over columns) — confirm against the scaler implementation
    assert_array_almost_equal([
        X_scaled.data[X_scaled.indptr[i]:X_scaled.indptr[i + 1]].mean()
        for i in range(X_scaled.shape[1])
    ], np.zeros((4, ), dtype=np.float64))
    # per-column std of the stored values is 1 after scaling
    assert_array_almost_equal(
        np.sqrt([
            X_scaled.data[X_scaled.indptr[i]:X_scaled.indptr[i + 1]].var()
            for i in range(X_scaled.shape[1])
        ]), np.ones((4, ), dtype=np.float64))
    # Because we change the sparse format to csc, we cannot assert that
    # the matrix did not change!
    # self.assertTrue(X_scaled is X)
    # Check that the matrix is still sparse
    self.assertEqual(len(X.indices), 12)