def test_deprecations():
    # Each legacy helper must raise a FutureWarning mentioning the 0.22
    # deprecation when called.
    deprecated_helpers = (gaussian_random_matrix, sparse_random_matrix)
    for make_matrix in deprecated_helpers:
        with pytest.warns(FutureWarning, match="deprecated in 0.22"):
            make_matrix(10, 100)
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch.
    n_samples = 100
    imputer_step = ("imputer", Imputer(missing_values=0))
    tree_step = ("tree", tree.DecisionTreeRegressor(random_state=0))
    pipeline = Pipeline([imputer_step, tree_step])

    param_grid = {
        "imputer__strategy": ["mean", "median", "most_frequent"],
        "imputer__axis": [0, 1],
    }

    # The imputer is configured with missing_values=0, so the zero entries
    # of the sparse random matrix are what gets imputed.
    X = sparse_random_matrix(n_samples, n_samples, density=0.10)
    Y = sparse_random_matrix(n_samples, 1, density=0.10).toarray()

    # Smoke test: nothing is asserted beyond the search running through.
    search = grid_search.GridSearchCV(pipeline, param_grid)
    search.fit(X, Y)
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch.
    imputer_step = ('imputer', SimpleImputer(missing_values=0))
    tree_step = ('tree', tree.DecisionTreeRegressor(random_state=0))
    pipeline = Pipeline([imputer_step, tree_step])

    strategies = ["mean", "median", "most_frequent"]
    param_grid = {'imputer__strategy': strategies}

    # The imputer is configured with missing_values=0, so the zero entries
    # of the sparse random matrix are what gets imputed.
    X = sparse_random_matrix(100, 100, density=0.10)
    Y = sparse_random_matrix(100, 1, density=0.10).toarray()

    # Smoke test: nothing is asserted beyond the search running through.
    search = GridSearchCV(pipeline, param_grid)
    search.fit(X, Y)
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch.
    imputer_step = ('imputer', SimpleImputer(missing_values=0))
    tree_step = ('tree', tree.DecisionTreeRegressor(random_state=0))
    pipeline = Pipeline([imputer_step, tree_step])

    strategies = ["mean", "median", "most_frequent"]
    param_grid = {'imputer__strategy': strategies}

    # The imputer is configured with missing_values=0, so the zero entries
    # of the sparse random matrix are what gets imputed.
    X = sparse_random_matrix(100, 100, density=0.10)
    Y = sparse_random_matrix(100, 1, density=0.10).toarray()

    # Smoke test: nothing is asserted beyond the search running through.
    search = GridSearchCV(pipeline, param_grid)
    search.fit(X, Y)
def test_imputation_pipeline_grid_search(self):
    """Test imputation within a pipeline + gridsearch."""
    n_samples = 100
    imputer_step = ('imputer', Imputer(missing_values=0))
    tree_step = ('tree', tree.DecisionTreeRegressor(random_state=0))
    pipeline = Pipeline([imputer_step, tree_step])

    param_grid = {
        'imputer__strategy': ["mean", "median", "most_frequent"],
        'imputer__axis': [0, 1],
    }

    # The imputer is configured with missing_values=0, so the zero entries
    # of the sparse random matrix are what gets imputed.
    X = sparse_random_matrix(n_samples, n_samples, density=0.10)
    Y = sparse_random_matrix(n_samples, 1, density=0.10).toarray()

    # Smoke test: nothing is asserted beyond the search running through.
    search = grid_search.GridSearchCV(pipeline, param_grid)
    search.fit(X, Y)
def test_imputation_pipeline_grid_search():
    """Test imputation within a pipeline + gridsearch."""
    n_samples = 100
    imputer_step = ('imputer', Imputer(missing_values=0))
    tree_step = ('tree', tree.DecisionTreeRegressor(random_state=0))
    pipeline = Pipeline([imputer_step, tree_step])

    param_grid = {
        'imputer__strategy': ["mean", "median", "most_frequent"],
        'imputer__axis': [0, 1],
    }

    # The imputer is configured with missing_values=0, so the zero entries
    # of the sparse random matrix are what gets imputed.
    X = sparse_random_matrix(n_samples, n_samples, density=0.10)
    Y = sparse_random_matrix(n_samples, 1, density=0.10).todense()

    # Smoke test: nothing is asserted beyond the search running through.
    search = grid_search.GridSearchCV(pipeline, param_grid)
    search.fit(X, Y)
def test_mice_imputation_order(imputation_order):
    # Check the order in which MICEImputer visits the features for the
    # given ``imputation_order``.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by MICEImputer

    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          n_nearest_features=5,
                          min_value=0,
                          max_value=1,
                          verbose=False,
                          imputation_order=imputation_order,
                          random_state=rng)
    imputer.fit_transform(X)
    ordered_idx = [step.feat_idx for step in imputer.imputation_sequence_]

    # Column 0 holds no zeros (i.e. no missing values), so one round is
    # expected to cover the remaining n_features - 1 columns.
    n_imputed = n_features - 1
    first_round = ordered_idx[:n_imputed]

    if imputation_order == 'roman':
        assert np.all(first_round == np.arange(1, n_features))
    elif imputation_order == 'arabic':
        assert np.all(first_round == np.arange(n_imputed, 0, -1))
    elif imputation_order == 'random':
        second_round = ordered_idx[n_imputed:]
        assert first_round != second_round
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == 2 * n_imputed
def test_iterative_imputer_imputation_order(imputation_order):
    # Check the order in which IterativeImputer visits the features for the
    # given ``imputation_order``.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    max_iter = 2
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by IterativeImputer

    imputer = IterativeImputer(missing_values=0,
                               max_iter=max_iter,
                               n_nearest_features=5,
                               sample_posterior=False,
                               min_value=0,
                               max_value=1,
                               verbose=1,
                               imputation_order=imputation_order,
                               random_state=rng)
    imputer.fit_transform(X)
    ordered_idx = [step.feat_idx for step in imputer.imputation_sequence_]

    features_per_round = len(ordered_idx) // imputer.n_iter_
    assert features_per_round == imputer.n_features_with_missing_

    # Column 0 holds no zeros (i.e. no missing values), so one round is
    # expected to cover the remaining n_features - 1 columns.
    n_imputed = n_features - 1
    first_round = ordered_idx[:n_imputed]

    if imputation_order == 'roman':
        assert np.all(first_round == np.arange(1, n_features))
    elif imputation_order == 'arabic':
        assert np.all(first_round == np.arange(n_imputed, 0, -1))
    elif imputation_order == 'random':
        second_round = ordered_idx[n_imputed:]
        assert first_round != second_round
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == max_iter * n_imputed
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
                                          n_samples=20, random_state=0):
    """Check that ``lrap_score`` agrees with ``_my_lrap`` on random data.

    Parameters
    ----------
    lrap_score : callable
        Scorer under test, called as ``lrap_score(y_true, y_score)``.
    n_classes : int, default=5
        Number of labels of the generated multilabel problem.
    n_samples : int, default=20
        Number of samples of the generated multilabel problem.
    random_state : int or RandomState, default=0
        Seed used for the label generation and for both score matrices.

    Raises
    ------
    AssertionError
        If the two implementations disagree on either score matrix.
    """
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    # Score with ties
    y_score = sparse_random_matrix(n_components=y_true.shape[0],
                                   n_features=y_true.shape[1],
                                   random_state=random_state)
    if hasattr(y_score, "toarray"):
        y_score = y_score.toarray()
    # Use the scorer that was passed in; the parameter used to be ignored
    # in favour of a hard-coded label_ranking_average_precision_score.
    score_lrap = lrap_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)

    # Uniform score
    random_state = check_random_state(random_state)
    y_score = random_state.uniform(size=(n_samples, n_classes))
    score_lrap = lrap_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)
def test3(rnd_stream):
    """Run the 'test3' comparison and return the number of errors.

    Builds a tile coding over three inputs, projects it with a sparse random
    matrix, fits the generated GPs and sums the errors reported by
    ``compare_nll`` and ``compare_mean_var``.
    """
    test_name = 'test3'
    print('##########################################################')
    print("Starting test '{0}'".format(test_name))

    sigma, tiles_per_dim, tiling_count = 0.1, 10, 500
    unit_cube = [[0] * 3, [1.] * 3]

    phi = TileCoding(input_indices=[[0, 1, 2]],
                     ntiles=[tiles_per_dim],
                     ntilings=[tiling_count],
                     state_range=unit_cube,
                     rnd_stream=np.random,
                     bias_term=False,
                     hashing=None)

    X, y, _f, test_x = create_test_data3(rnd_stream)

    projection = rp.sparse_random_matrix(100, phi.size)
    gps = generate_rp_tilecoding_gps(X, y,
                                     random_proj=projection,
                                     sigma=sigma,
                                     tilecoding=phi,
                                     include_sparse=False)
    fit_gps(test_name, gps)

    errors = compare_nll(gps) + compare_mean_var(test_x, gps)
    print("Ending test '{0}' with {1} errors".format(test_name, errors))
    return errors
def fit(self, X, y):
    """Draw random hidden-layer parameters and fit the output weights.

    Parameters
    ----------
    X : array-like
        Training inputs; ``X.shape[1]`` is taken as the number of features.
    y : array-like
        Target labels, binarized one-of-K against ``unique_labels(y)``.

    Returns
    -------
    self
        The estimator, with ``classes_``, ``biases_``, ``beta_`` and either
        ``weights_`` or ``weights_u_``/``weights_v_`` set.
    """
    if self.activation is None:
        # Useful to quantify the impact of the non-linearity
        self._activate = lambda x: x
    else:
        self._activate = self.activations[self.activation]

    rng = check_random_state(self.random_state)

    # one-of-K coding for output values
    self.classes_ = unique_labels(y)
    # ``classes`` is keyword-only in current scikit-learn; passing it by
    # name works on old and new releases alike.
    Y = label_binarize(y, classes=self.classes_)

    # set hidden layer parameters randomly
    n_features = X.shape[1]
    if self.rank is None:
        if self.density == 1:
            self.weights_ = rng.randn(n_features, self.n_hidden)
        else:
            # sparse_random_matrix returns (n_hidden, n_features); transpose
            # so the layout matches the dense branch.
            self.weights_ = sparse_random_matrix(
                self.n_hidden, n_features, density=self.density,
                random_state=rng).T
    else:
        # Low rank weight matrix
        self.weights_u_ = rng.randn(n_features, self.rank)
        self.weights_v_ = rng.randn(self.rank, self.n_hidden)
    self.biases_ = rng.randn(self.n_hidden)

    # map the input data through the hidden layer
    H = self.transform(X)

    # fit the linear model on the hidden layer activation
    self.beta_ = np.dot(pinv2(H), Y)
    return self
def test_mice_imputation_order(imputation_order):
    # Check the order in which MICEImputer visits the features for the
    # given ``imputation_order``.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by MICEImputer

    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          n_nearest_features=5,
                          min_value=0,
                          max_value=1,
                          verbose=False,
                          imputation_order=imputation_order,
                          random_state=rng)
    imputer.fit_transform(X)
    ordered_idx = [step.feat_idx for step in imputer.imputation_sequence_]

    # Column 0 holds no zeros (i.e. no missing values), so one round is
    # expected to cover the remaining n_features - 1 columns.
    n_imputed = n_features - 1
    first_round = ordered_idx[:n_imputed]

    if imputation_order == 'roman':
        assert np.all(first_round == np.arange(1, n_features))
    elif imputation_order == 'arabic':
        assert np.all(first_round == np.arange(n_imputed, 0, -1))
    elif imputation_order == 'random':
        second_round = ordered_idx[n_imputed:]
        assert first_round != second_round
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == 2 * n_imputed
def test_iterative_imputer_imputation_order(imputation_order):
    # Check the order in which IterativeImputer visits the features for the
    # given ``imputation_order``.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    max_iter = 2
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by IterativeImputer

    imputer = IterativeImputer(missing_values=0,
                               max_iter=max_iter,
                               n_nearest_features=5,
                               sample_posterior=False,
                               min_value=0,
                               max_value=1,
                               verbose=1,
                               imputation_order=imputation_order,
                               random_state=rng)
    imputer.fit_transform(X)
    ordered_idx = [step.feat_idx for step in imputer.imputation_sequence_]

    features_per_round = len(ordered_idx) // imputer.n_iter_
    assert features_per_round == imputer.n_features_with_missing_

    # Column 0 holds no zeros (i.e. no missing values), so one round is
    # expected to cover the remaining n_features - 1 columns.
    n_imputed = n_features - 1
    first_round = ordered_idx[:n_imputed]

    if imputation_order == 'roman':
        assert np.all(first_round == np.arange(1, n_features))
    elif imputation_order == 'arabic':
        assert np.all(first_round == np.arange(n_imputed, 0, -1))
    elif imputation_order == 'random':
        second_round = ordered_idx[n_imputed:]
        assert first_round != second_round
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == max_iter * n_imputed
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
                                          n_samples=20, random_state=0):
    """Check that ``lrap_score`` agrees with ``_my_lrap`` on random data.

    Parameters
    ----------
    lrap_score : callable
        Scorer under test, called as ``lrap_score(y_true, y_score)``.
    n_classes : int, default=5
        Number of labels of the generated multilabel problem.
    n_samples : int, default=20
        Number of samples of the generated multilabel problem.
    random_state : int or RandomState, default=0
        Seed used for the label generation and for both score matrices.

    Raises
    ------
    AssertionError
        If the two implementations disagree on either score matrix.
    """
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    # Score with ties
    y_score = sparse_random_matrix(n_components=y_true.shape[0],
                                   n_features=y_true.shape[1],
                                   random_state=random_state)
    if hasattr(y_score, "toarray"):
        y_score = y_score.toarray()
    # Use the scorer that was passed in; the parameter used to be ignored
    # in favour of a hard-coded label_ranking_average_precision_score.
    score_lrap = lrap_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)

    # Uniform score
    random_state = check_random_state(random_state)
    y_score = random_state.uniform(size=(n_samples, n_classes))
    score_lrap = lrap_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)
def test_mice_imputation_order():
    # Check the feature visiting order of MICEImputer for every supported
    # ``imputation_order`` value.
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10).toarray()
    X[:, 0] = 1  # this column shouldn't be ever used

    n_imputed = n_features - 1
    orders = ['random', 'roman', 'monotone', 'revmonotone', 'arabic']

    for imputation_order in orders:
        imputer = MICEImputer(missing_values=0,
                              n_imputations=1,
                              n_burn_in=1,
                              n_nearest_features=5,
                              min_value=0,
                              max_value=1,
                              verbose=False,
                              imputation_order=imputation_order)
        imputer.fit_transform(X)
        ordered_idx = [step.feat_idx
                       for step in imputer.imputation_sequence_]
        first_round = ordered_idx[:n_imputed]

        if imputation_order == 'roman':
            assert np.all(first_round == np.arange(1, n_features))
        elif imputation_order == 'arabic':
            assert np.all(first_round == np.arange(n_imputed, 0, -1))
        elif imputation_order == 'random':
            second_round = ordered_idx[n_imputed:]
            assert first_round != second_round
def test_as_float_array():
    # Test function for as_float_array
    X = np.ones((3, 10), dtype=np.int32) + np.arange(10, dtype=np.int32)

    # Checks that the return type is ok
    converted = as_float_array(X, copy=False)
    np.testing.assert_equal(converted.dtype, np.float32)

    # Another test
    X = X.astype(np.int64)
    converted = as_float_array(X, copy=True)
    # Checking that the array wasn't overwritten
    assert_true(as_float_array(X, False) is not X)
    # Checking that the new type is ok
    np.testing.assert_equal(converted.dtype, np.float64)

    # Here, X is of the right type, it shouldn't be modified
    X = np.ones((3, 2), dtype=np.float32)
    assert_true(as_float_array(X, copy=False) is X)

    # Test that if X is fortran ordered it stays
    X = np.asfortranarray(X)
    assert_true(np.isfortran(as_float_array(X, copy=True)))

    # Test the copy parameter with some matrices
    matrices = [
        np.matrix(np.arange(5)),
        sp.csc_matrix(np.arange(5)).toarray(),
        sparse_random_matrix(10, 10, density=0.10).toarray(),
    ]
    for source in matrices:
        copied = as_float_array(source, copy=True)
        # Writing into the copy must leave the source untouched.
        copied[0, 0] = np.nan
        assert_false(np.isnan(source).any())
def fit(self, X, y):
    """Draw random hidden-layer parameters and fit the output weights.

    Parameters
    ----------
    X : array-like
        Training inputs; ``X.shape[1]`` is taken as the number of features.
    y : array-like
        Target labels, binarized one-of-K against ``unique_labels(y)``.

    Returns
    -------
    self
        The estimator, with ``classes_``, ``biases_``, ``beta_`` and either
        ``weights_`` or ``weights_u_``/``weights_v_`` set.
    """
    if self.activation is None:
        # Useful to quantify the impact of the non-linearity
        self._activate = lambda x: x
    else:
        self._activate = self.activations[self.activation]

    rng = check_random_state(self.random_state)

    # one-of-K coding for output values
    self.classes_ = unique_labels(y)
    # ``classes`` is keyword-only in current scikit-learn; passing it by
    # name works on old and new releases alike.
    Y = label_binarize(y, classes=self.classes_)

    # set hidden layer parameters randomly
    n_features = X.shape[1]
    if self.rank is None:
        if self.density == 1:
            self.weights_ = rng.randn(n_features, self.n_hidden)
        else:
            # sparse_random_matrix returns (n_hidden, n_features); transpose
            # so the layout matches the dense branch.
            self.weights_ = sparse_random_matrix(
                self.n_hidden, n_features, density=self.density,
                random_state=rng).T
    else:
        # Low rank weight matrix
        self.weights_u_ = rng.randn(n_features, self.rank)
        self.weights_v_ = rng.randn(self.rank, self.n_hidden)
    self.biases_ = rng.randn(self.n_hidden)

    # map the input data through the hidden layer
    H = self.transform(X)

    # fit the linear model on the hidden layer activation
    self.beta_ = np.dot(pinv2(H), Y)
    return self
def __init__(self, n_features=784, n_components=20, random_state=None):
    """Store the dimensions and build the ``decode`` tensor.

    ``decode`` is a float tensor made from a sparse random matrix created
    with ``(n_components, n_features)`` and the given ``random_state``.
    """
    super().__init__()
    self.n_features = n_features
    self.n_components = n_components

    projection = sparse_random_matrix(self.n_components,
                                      self.n_features,
                                      random_state=random_state)
    dense_projection = projection.todense()
    self.decode = torch.Tensor(dense_projection).float()
def test_imputation_copy():
    # Test imputation with copy
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    def mean_impute(data, **imputer_params):
        # Fit a mean-strategy Imputer on ``data`` and return its transform.
        imputer = Imputer(strategy="mean", **imputer_params)
        return imputer.fit(data).transform(data)

    # copy=True, dense => copy
    X = X_orig.copy().toarray()
    Xt = mean_impute(X, missing_values=0, copy=True)
    Xt[0, 0] = -1
    assert_false(np.all(X == Xt))

    # copy=True, sparse csr => copy
    X = X_orig.copy()
    Xt = mean_impute(X, missing_values=X.data[0], copy=True)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, dense => no copy
    X = X_orig.copy().toarray()
    Xt = mean_impute(X, missing_values=0, copy=False)
    Xt[0, 0] = -1
    assert_true(np.all(X == Xt))

    # copy=False, sparse csr, axis=1 => no copy
    X = X_orig.copy()
    Xt = mean_impute(X, missing_values=X.data[0], copy=False, axis=1)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=0 => no copy
    X = X_orig.copy().tocsc()
    Xt = mean_impute(X, missing_values=X.data[0], copy=False, axis=0)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=0 => copy
    X = X_orig.copy()
    Xt = mean_impute(X, missing_values=X.data[0], copy=False, axis=0)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=1 => copy
    X = X_orig.copy().tocsc()
    Xt = mean_impute(X, missing_values=X.data[0], copy=False, axis=1)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=1, missing_values=0 => copy
    X = X_orig.copy()
    Xt = mean_impute(X, missing_values=0, copy=False, axis=1)
    assert_false(sparse.issparse(Xt))
def test_calc_k_mean_sparse(self):
    """Check the output shapes of calc_k_mean on a sparse random matrix."""
    n_samples, n_features, n_components = 1000, 30, 20
    data = sparse_random_matrix(n_samples, n_features,
                                density=0.01, random_state=42)

    result = calc_k_mean(data, n_clusters=n_components, sparse=True)

    # First element: one entry per sample; second: one row per cluster.
    expected_shapes = ((n_samples,), (n_components, n_features))
    for position, expected_shape in enumerate(expected_shapes):
        self.assertEqual(result[position].shape, expected_shape)
    print(result[1])
def test_deprecated_imputer_axis():
    # Fitting an Imputer built with an explicit ``axis`` must emit the
    # deprecation message, whichever axis value is used.
    depr_message = ("Parameter 'axis' has been deprecated in 0.20 and will "
                    "be removed in 0.22. Future (and default) behavior is "
                    "equivalent to 'axis=0' (impute along columns). Row-wise "
                    "imputation can be performed with FunctionTransformer.")
    X = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    for axis in (0, 1):
        imputer = Imputer(missing_values=0, axis=axis)
        assert_warns_message(DeprecationWarning, depr_message,
                             imputer.fit, X)
def test_deprecated_imputer_axis():
    # Fitting an Imputer built with an explicit ``axis`` must emit the
    # deprecation message, whichever axis value is used.
    depr_message = ("Parameter 'axis' has been deprecated in 0.20 and will "
                    "be removed in 0.22. Future (and default) behavior is "
                    "equivalent to 'axis=0' (impute along columns). Row-wise "
                    "imputation can be performed with FunctionTransformer.")
    X = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    for axis in (0, 1):
        imputer = Imputer(missing_values=0, axis=axis)
        assert_warns_message(DeprecationWarning, depr_message,
                             imputer.fit, X)
def test_iterative_imputer_verbose():
    # Smoke test: fit and transform must run at both verbosity levels.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 3
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    for verbosity in (1, 2):
        imputer = IterativeImputer(missing_values=0, max_iter=1,
                                   verbose=verbosity)
        imputer.fit(X)
        imputer.transform(X)
def test_iterative_imputer_verbose():
    # Smoke test: fit and transform must run at both verbosity levels.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 3
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    for verbosity in (1, 2):
        imputer = IterativeImputer(missing_values=0, max_iter=1,
                                   verbose=verbosity)
        imputer.fit(X)
        imputer.transform(X)
def get_tilegp(self):
    """Build a tile-coding feature map, wrap it in a GP and fit it.

    One tile-coding input group is created per distinct value in ``self.z``.
    ``self.gp_type`` selects the model: ``'sk'`` (sparse kernel), ``'sf'``
    (sparse feature), ``'dk'`` (dense kernel); any other value uses a sparse
    random projection of the tile coding with dense features.

    Returns
    -------
    gp
        The GP object, after ``gp.fit()`` has been called.
    """
    nlayers = self.options['nlayers']
    indices = []
    ntiles = []
    hashing = None
    all_cat = np.unique(self.z)

    if self.tilecap:
        # Split the tile budget evenly across categories and layers.
        # NOTE(review): under Python 3 this is true division and yields a
        # float -- confirm rp.UNH accepts that.
        hashing_mem = self.tilecap / len(all_cat) / nlayers
        # ``range`` instead of ``xrange``: identical here on Python 2 and
        # does not raise NameError on Python 3.
        hashing = [rp.UNH(hashing_mem) for _ in range(len(all_cat))]

    for a in all_cat:
        inds = helper.find(self.z == a)
        indices.append(inds)
        ntiles.append(self.k[inds])

    phi = TileCoding(
        input_indices=indices,
        # ntiles = input dim x number of layers x tilings
        ntiles=ntiles,
        ntilings=[nlayers] * len(indices),
        hashing=hashing,
        state_range=self.x_range,
        rnd_stream=np.random,
        bias_term=False)

    if self.gp_type == 'sk':
        # densekern > sparsekern \approx sparserp
        sparsekern = SparseKernel(phi, normalize=True)
        gp = SparseKernelGP(self.X, self.y, sigma=0.1, kern=sparsekern)
    elif self.gp_type == 'sf':
        # too slow
        sparsephi = IndexToBinarySparse(phi, normalize=True)
        gp = SparseFeatureGP(self.X, self.y, sigma=0.1, phi=sparsephi)
    elif self.gp_type == 'dk':
        densekern = DenseKernel(phi, normalize=True)
        gp = DenseKernelGP(self.X, self.y, sigma=self.sigma,
                           kern=densekern)
    else:
        random_proj = rp.sparse_random_matrix(300, phi.size,
                                              random_state=np.random)
        densephi = SparseRPTilecoding(phi,
                                      random_proj=random_proj,
                                      normalize=True,
                                      output_dense=True)
        gp = DenseFeatureGP(self.X, self.y, sigma=self.sigma, phi=densephi)

    gp.fit()
    return gp
def test_mice_predictors():
    # Smoke test: MICEImputer must run with different predictor estimators.
    from sklearn.dummy import DummyRegressor
    from sklearn.linear_model import BayesianRidge

    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10).toarray()

    for predictor_cls in (DummyRegressor, BayesianRidge):
        imputer = MICEImputer(missing_values=0,
                              n_imputations=1,
                              n_burn_in=1,
                              predictor=predictor_cls())
        imputer.fit_transform(X)
def main():
    """
    Create low-dimensional and sparse random matrix from vocabulary file.
    """

    # Get the arguments
    args = docopt('''Create low-dimensional and sparse random matrix from vocabulary file.

    Usage:
        random.py <vocabFile> <outPath> <dim>

        <vocabFile> = row and column vocabulary
        <outPath> = output path for random matrix
        <dim> = dimensionality for random vectors

    Note:
        Calculates number of seeds automatically as proposed in [1,2]

    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    ''')

    #np.random.seed(0) # uncomment for reproducibility

    vocab_path = args['<vocabFile>']
    out_path = args['<outPath>']
    dim = int(args['<dim>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load vocabulary
    logging.info("Loading vocabulary")
    with open(vocab_path, 'r', encoding='utf-8') as f_in:
        vocabulary = [line.strip() for line in f_in]

    # Generate random vectors: one row per vocabulary entry, ``dim`` columns
    projection = sparse_random_matrix(dim, len(vocabulary))
    random_matrix = projection.toarray().T

    # Store random matrix
    space = Space(matrix=random_matrix, rows=vocabulary, columns=[])
    space.save(out_path)

    logging.info("--- %s seconds ---", time.time() - start_time)
def test_imputation_copy():
    # Test imputation with copy
    #
    # Each section writes into the transformed output and then checks
    # whether the input changed too (no copy) or stayed intact (copy).
    # The former "sparse csr, axis=1" section is gone: SimpleImputer takes
    # no ``axis`` argument, so constructing it that way raises TypeError.
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # copy=True, dense => copy
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert not np.all(X == Xt)

    # copy=True, sparse csr => copy
    X = X_orig.copy()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)

    # copy=False, dense => no copy
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_array_almost_equal(X, Xt)

    # copy=False, sparse csc => no copy
    X = X_orig.copy().tocsc()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_array_almost_equal(X.data, Xt.data)

    # copy=False, sparse csr => copy
    X = X_orig.copy()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean",
                            copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)
def test_sparse_random_matrix():
    """Check some statistical properties of sparse random matrix"""
    n_components = 100
    n_features = 500

    for density in [0.3, 1.]:
        s = 1 / density
        A = densify(sparse_random_matrix(n_components,
                                         n_features,
                                         density=density,
                                         random_state=0))
        magnitude = np.sqrt(s) / np.sqrt(n_components)

        # Check possible values
        values = np.unique(A)
        assert_in(magnitude, values)
        assert_in(-magnitude, values)

        if density != 1.0:
            assert_in(0., values)
            assert_equal(np.size(values), 3)
        else:
            assert_equal(np.size(values), 2)

        # Check that the random matrix follow the proper distribution.
        # Let's say that each element of a_{ij} of A is taken from
        #
        # - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        # -  0                              with probability 1 - 1 / s
        # - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        #
        p_zero = 1 - 1 / s
        p_signed = 1 / (2 * s)
        var_zero = p_zero * 1 / s
        var_signed = (1 - p_signed) * 1 / (2 * s)

        # (matrix value, expected frequency, expected indicator variance)
        expectations = [
            (0.0, p_zero, var_zero),
            (magnitude, p_signed, var_signed),
            (-magnitude, p_signed, var_signed),
        ]
        for value, frequency, _ in expectations:
            assert_almost_equal(np.mean(A == value), frequency, decimal=2)
        for value, _, variance in expectations:
            assert_almost_equal(np.var(A == value, ddof=1), variance,
                                decimal=2)
def test_iterative_imputer_clip():
    # Imputed entries must span exactly [min_value, max_value]; observed
    # entries must come back unchanged.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               min_value=0.1,
                               max_value=0.2,
                               random_state=rng)
    Xt = imputer.fit_transform(X)

    missing = X == 0
    observed = ~missing
    assert_allclose(np.min(Xt[missing]), 0.1)
    assert_allclose(np.max(Xt[missing]), 0.2)
    assert_allclose(Xt[observed], X[observed])
def test_iterative_imputer_transform_stochasticity():
    pytest.importorskip("scipy", minversion="0.17.0")
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng1).toarray()

    # when sample_posterior=True, two transforms shouldn't be equal
    imputer = IterativeImputer(missing_values=0, max_iter=1,
                               sample_posterior=True, random_state=rng1)
    imputer.fit(X)
    first_draw = imputer.transform(X)
    second_draw = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(first_draw) != pytest.approx(np.mean(second_draw))

    # when sample_posterior=False, and n_nearest_features=None
    # and imputation_order is not random
    # the two transforms should be identical even if rng are different
    deterministic_params = dict(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending')
    imputer1 = IterativeImputer(random_state=rng1, **deterministic_params)
    imputer2 = IterativeImputer(random_state=rng2, **deterministic_params)
    imputer1.fit(X)
    imputer2.fit(X)

    X_fitted_1a = imputer1.transform(X)
    X_fitted_1b = imputer1.transform(X)
    X_fitted_2 = imputer2.transform(X)

    assert_allclose(X_fitted_1a, X_fitted_1b)
    assert_allclose(X_fitted_1a, X_fitted_2)
def test_iterative_imputer_transform_stochasticity():
    pytest.importorskip("scipy", minversion="0.17.0")
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng1).toarray()

    # when sample_posterior=True, two transforms shouldn't be equal
    imputer = IterativeImputer(missing_values=0, max_iter=1,
                               sample_posterior=True, random_state=rng1)
    imputer.fit(X)
    first_draw = imputer.transform(X)
    second_draw = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(first_draw) != pytest.approx(np.mean(second_draw))

    # when sample_posterior=False, and n_nearest_features=None
    # and imputation_order is not random
    # the two transforms should be identical even if rng are different
    deterministic_params = dict(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending')
    imputer1 = IterativeImputer(random_state=rng1, **deterministic_params)
    imputer2 = IterativeImputer(random_state=rng2, **deterministic_params)
    imputer1.fit(X)
    imputer2.fit(X)

    X_fitted_1a = imputer1.transform(X)
    X_fitted_1b = imputer1.transform(X)
    X_fitted_2 = imputer2.transform(X)

    assert_allclose(X_fitted_1a, X_fitted_1b)
    assert_allclose(X_fitted_1a, X_fitted_2)
def test_sparse_random_matrix():
    # Check some statistical properties of sparse random matrix
    n_components = 100
    n_features = 500

    for density in [0.3, 1.]:
        s = 1 / density
        A = densify(sparse_random_matrix(n_components,
                                         n_features,
                                         density=density,
                                         random_state=0))
        magnitude = np.sqrt(s) / np.sqrt(n_components)

        # Check possible values
        values = np.unique(A)
        assert_in(magnitude, values)
        assert_in(-magnitude, values)

        if density != 1.0:
            assert_in(0., values)
            assert_equal(np.size(values), 3)
        else:
            assert_equal(np.size(values), 2)

        # Check that the random matrix follow the proper distribution.
        # Let's say that each element of a_{ij} of A is taken from
        #
        # - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        # -  0                              with probability 1 - 1 / s
        # - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        #
        p_zero = 1 - 1 / s
        p_signed = 1 / (2 * s)
        var_zero = p_zero * 1 / s
        var_signed = (1 - p_signed) * 1 / (2 * s)

        # (matrix value, expected frequency, expected indicator variance)
        expectations = [
            (0.0, p_zero, var_zero),
            (magnitude, p_signed, var_signed),
            (-magnitude, p_signed, var_signed),
        ]
        for value, frequency, _ in expectations:
            assert_almost_equal(np.mean(A == value), frequency, decimal=2)
        for value, _, variance in expectations:
            assert_almost_equal(np.var(A == value, ddof=1), variance,
                                decimal=2)
def test_imputation_pickle():
    """Test for pickling imputers."""
    import pickle

    size = 100
    X = sparse_random_matrix(size, size, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        # Round-trip the fitted imputer through pickle.
        restored = pickle.loads(pickle.dumps(imputer))

        failure_msg = ("Fail to transform the data after pickling "
                       "(strategy = %s)" % (strategy))
        assert_array_equal(imputer.transform(X.copy()),
                           restored.transform(X.copy()),
                           failure_msg)
def test_mice_transform_stochasticity():
    # Two transforms from the same fitted MICEImputer should differ.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          random_state=rng)
    imputer.fit(X)

    first_draw = imputer.transform(X)
    second_draw = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(first_draw) != pytest.approx(np.mean(second_draw))
def test_iterative_imputer_clip():
    # Imputed entries must span exactly [min_value, max_value]; observed
    # entries must come back unchanged.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               min_value=0.1,
                               max_value=0.2,
                               random_state=rng)
    Xt = imputer.fit_transform(X)

    missing = X == 0
    observed = ~missing
    assert_allclose(np.min(Xt[missing]), 0.1)
    assert_allclose(np.max(Xt[missing]), 0.2)
    assert_allclose(Xt[observed], X[observed])
def test_imputation_pickle():
    # Test for pickling imputers.
    import pickle

    size = 100
    X = sparse_random_matrix(size, size, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        # Round-trip the fitted imputer through pickle.
        restored = pickle.loads(pickle.dumps(imputer))

        failure_msg = ("Fail to transform the data after pickling "
                       "(strategy = %s)" % (strategy))
        assert_array_equal(imputer.transform(X.copy()),
                           restored.transform(X.copy()),
                           failure_msg)
def test_mice_transform_stochasticity():
    # Two transforms from the same fitted MICEImputer should differ.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          random_state=rng)
    imputer.fit(X)

    first_draw = imputer.transform(X)
    second_draw = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(first_draw) != pytest.approx(np.mean(second_draw))
def test_as_float_array():
    # Test function for as_float_array
    X = np.ones((3, 10), dtype=np.int32)
    X = X + np.arange(10, dtype=np.int32)
    X2 = as_float_array(X, copy=False)
    assert_equal(X2.dtype, np.float32)

    # Another test
    X = X.astype(np.int64)
    X2 = as_float_array(X, copy=True)
    # Checking that the array wasn't overwritten
    assert as_float_array(X, False) is not X
    assert_equal(X2.dtype, np.float64)

    # Test int dtypes <= 32bit
    # The builtin ``bool`` replaces the ``np.bool`` alias, which was
    # deprecated in NumPy 1.20 and removed in 1.24; the dtype is the same.
    tested_dtypes = [
        bool, np.int8, np.int16, np.int32, np.uint8, np.uint16, np.uint32
    ]
    for dtype in tested_dtypes:
        X = X.astype(dtype)
        X2 = as_float_array(X)
        assert_equal(X2.dtype, np.float32)

    # Test object dtype
    X = X.astype(object)
    X2 = as_float_array(X, copy=True)
    assert_equal(X2.dtype, np.float64)

    # Here, X is of the right type, it shouldn't be modified
    X = np.ones((3, 2), dtype=np.float32)
    assert as_float_array(X, copy=False) is X

    # Test that if X is fortran ordered it stays
    X = np.asfortranarray(X)
    assert np.isfortran(as_float_array(X, copy=True))

    # Test the copy parameter with some matrices
    matrices = [
        np.matrix(np.arange(5)),
        sp.csc_matrix(np.arange(5)).toarray(),
        sparse_random_matrix(10, 10, density=0.10).toarray()
    ]
    for M in matrices:
        N = as_float_array(M, copy=True)
        # Writing into the copy must leave the source untouched.
        N[0, 0] = np.nan
        assert not np.isnan(M).any()
def test_mice_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch.
    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          random_state=0)
    regressor = tree.DecisionTreeRegressor(random_state=0)
    pipeline = Pipeline([('imputer', imputer), ('tree', regressor)])

    param_grid = {
        'imputer__initial_strategy': ["mean", "median", "most_frequent"],
    }

    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.50).toarray()
    Y = np.random.random((n_samples, n_features))

    # Smoke test: nothing is asserted beyond the search running through.
    search = GridSearchCV(pipeline, param_grid)
    search.fit(X, Y)
def test_as_float_array():
    # Test function for as_float_array
    X = np.ones((3, 10), dtype=np.int32)
    X = X + np.arange(10, dtype=np.int32)
    X2 = as_float_array(X, copy=False)
    assert_equal(X2.dtype, np.float32)

    # Another test
    X = X.astype(np.int64)
    X2 = as_float_array(X, copy=True)
    # Checking that the array wasn't overwritten
    assert as_float_array(X, False) is not X
    assert_equal(X2.dtype, np.float64)

    # Test int dtypes <= 32bit
    # The builtin ``bool`` replaces the ``np.bool`` alias, which was
    # deprecated in NumPy 1.20 and removed in 1.24; the dtype is the same.
    tested_dtypes = [bool, np.int8, np.int16, np.int32,
                     np.uint8, np.uint16, np.uint32]
    for dtype in tested_dtypes:
        X = X.astype(dtype)
        X2 = as_float_array(X)
        assert_equal(X2.dtype, np.float32)

    # Test object dtype
    X = X.astype(object)
    X2 = as_float_array(X, copy=True)
    assert_equal(X2.dtype, np.float64)

    # Here, X is of the right type, it shouldn't be modified
    X = np.ones((3, 2), dtype=np.float32)
    assert as_float_array(X, copy=False) is X

    # Test that if X is fortran ordered it stays
    X = np.asfortranarray(X)
    assert np.isfortran(as_float_array(X, copy=True))

    # Test the copy parameter with some matrices
    matrices = [
        np.matrix(np.arange(5)),
        sp.csc_matrix(np.arange(5)).toarray(),
        sparse_random_matrix(10, 10, density=0.10).toarray()
    ]
    for M in matrices:
        N = as_float_array(M, copy=True)
        # Writing into the copy must leave the source untouched.
        N[0, 0] = np.nan
        assert not np.isnan(M).any()
def test_iterative_imputer_clip_truncnorm():
    # With posterior sampling enabled, imputed entries must span exactly
    # [min_value, max_value] and observed entries must be left untouched.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()
    # First column is fully observed (zeros are the missing marker).
    X[:, 0] = 1
    imputer = IterativeImputer(
        missing_values=0,
        max_iter=2,
        n_nearest_features=5,
        sample_posterior=True,
        min_value=0.1,
        max_value=0.2,
        verbose=1,
        imputation_order='random',
        random_state=rng,
    )
    Xt = imputer.fit_transform(X)

    missing = X == 0
    observed = ~missing
    imputed_values = Xt[missing]
    assert_allclose(np.min(imputed_values), 0.1)
    assert_allclose(np.max(imputed_values), 0.2)
    assert_allclose(Xt[observed], X[observed])
def test_iterative_imputer_clip_truncnorm():
    # Imputed values (sampled from the posterior) are expected to hit both
    # clipping bounds; values that were observed must come back unchanged.
    rng = np.random.RandomState(0)
    n_rows, n_cols = 100, 10
    sparse_X = sparse_random_matrix(n_rows, n_cols, density=0.10,
                                    random_state=rng)
    X = sparse_X.toarray()
    # Make the first feature fully observed; 0 marks a missing entry.
    X[:, 0] = 1
    lower, upper = 0.1, 0.2
    imputer = IterativeImputer(missing_values=0, max_iter=2,
                               n_nearest_features=5, sample_posterior=True,
                               min_value=lower, max_value=upper, verbose=1,
                               imputation_order='random', random_state=rng)
    Xt = imputer.fit_transform(X)

    was_missing = X == 0
    assert_allclose(np.min(Xt[was_missing]), lower)
    assert_allclose(np.max(Xt[was_missing]), upper)
    was_observed = ~was_missing
    assert_allclose(Xt[was_observed], X[was_observed])
def test_imputation_copy():
    # Check when SimpleImputer.transform returns a copy and when it works
    # in place, for dense, CSR and CSC inputs. Each case mutates the
    # transformed output and then looks at whether the input changed too.
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # copy=True, dense => copy
    dense = X_orig.copy().toarray()
    fitted = SimpleImputer(missing_values=0, strategy="mean",
                           copy=True).fit(dense)
    out = fitted.transform(dense)
    out[0, 0] = -1
    assert not np.all(dense == out)

    # copy=True, sparse csr => copy
    csr = X_orig.copy()
    fitted = SimpleImputer(missing_values=csr.data[0], strategy="mean",
                           copy=True).fit(csr)
    out = fitted.transform(csr)
    out.data[0] = -1
    assert not np.all(csr.data == out.data)

    # copy=False, dense => no copy
    dense = X_orig.copy().toarray()
    fitted = SimpleImputer(missing_values=0, strategy="mean",
                           copy=False).fit(dense)
    out = fitted.transform(dense)
    out[0, 0] = -1
    assert_array_almost_equal(dense, out)

    # copy=False, sparse csc => no copy
    csc = X_orig.copy().tocsc()
    fitted = SimpleImputer(missing_values=csc.data[0], strategy="mean",
                           copy=False).fit(csc)
    out = fitted.transform(csc)
    out.data[0] = -1
    assert_array_almost_equal(csc.data, out.data)

    # copy=False, sparse csr => copy
    csr = X_orig.copy()
    fitted = SimpleImputer(missing_values=csr.data[0], strategy="mean",
                           copy=False).fit(csr)
    out = fitted.transform(csr)
    out.data[0] = -1
    assert not np.all(csr.data == out.data)
def test_mice_predictors(predictor):
    # Fit a MICEImputer with the given predictor and check that every step
    # of the fitted imputation sequence holds its own predictor object.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    imputer = MICEImputer(missing_values=0, n_imputations=1, n_burn_in=1,
                          predictor=predictor, random_state=rng)
    imputer.fit_transform(X)

    # each step must carry a (truthy) predictor
    predictor_ids = []
    for step in imputer.imputation_sequence_:
        fitted_predictor = step.predictor
        assert fitted_predictor
        predictor_ids.append(id(fitted_predictor))

    # no predictor object may be shared between steps
    assert len(predictor_ids) == len(set(predictor_ids))
def test_mice_predictors(predictor):
    # After fitting, each entry of imputation_sequence_ must expose a
    # predictor, and all of those predictors must be distinct objects.
    rng = np.random.RandomState(0)
    n_rows = 100
    n_cols = 10
    projection = sparse_random_matrix(n_rows, n_cols, density=0.10,
                                      random_state=rng)
    X = projection.toarray()
    imputer = MICEImputer(
        missing_values=0,
        n_imputations=1,
        n_burn_in=1,
        predictor=predictor,
        random_state=rng,
    )
    imputer.fit_transform(X)

    seen = []
    for triplet in imputer.imputation_sequence_:
        current = triplet.predictor
        # a predictor must be present on every triplet
        assert current
        seen.append(id(current))
    # identity-based uniqueness check
    assert len(seen) == len(set(seen))
def test_iterative_imputer_estimators(estimator):
    # Fit an IterativeImputer with the given estimator and check that each
    # step of the imputation sequence holds a distinct estimator of the
    # expected type.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0, max_iter=1,
                               estimator=estimator, random_state=rng)
    imputer.fit_transform(X)

    # When no estimator is passed, the test expects a BayesianRidge.
    if estimator is None:
        expected_type = type(BayesianRidge())
    else:
        expected_type = type(estimator)

    estimator_ids = []
    for step in imputer.imputation_sequence_:
        fitted_estimator = step.estimator
        assert isinstance(fitted_estimator, expected_type)
        estimator_ids.append(id(fitted_estimator))

    # every step must own a separate estimator object
    assert len(estimator_ids) == len(set(estimator_ids))
def test_calc_pca_time(self):
    # Compare the mean wall-clock time of calc_sparse_pca and calc_pca on
    # the same densified random matrix; calc_pca is expected to be slower.
    n_samples, n_features = 10000, 1000
    n_components = 20
    num_iter = 10
    data = sparse_random_matrix(n_samples, n_features, density=0.01,
                                random_state=42).toarray()

    def mean_runtime(func):
        # Average duration of one call to func over num_iter runs.
        started = time.time()
        for _ in range(num_iter):
            func(data, n_components=n_components)
        return (time.time() - started) / num_iter

    sparse_pca_elapsed_time = mean_runtime(calc_sparse_pca)
    print("Elapsed time for calc_sparse_pca is %f" % (sparse_pca_elapsed_time))

    pca_elapsed_time = mean_runtime(calc_pca)
    print("Elapsed time for calc_pca is %f" % (pca_elapsed_time))

    self.assertGreater(pca_elapsed_time, sparse_pca_elapsed_time)
def test_iterative_imputer_estimators(estimator):
    # Each triplet of the fitted imputation sequence must hold an
    # estimator of the requested type (BayesianRidge when None is given),
    # and no estimator object may appear twice.
    rng = np.random.RandomState(0)
    n_rows = 100
    n_cols = 10
    projection = sparse_random_matrix(n_rows, n_cols, density=0.10,
                                      random_state=rng)
    X = projection.toarray()
    imputer = IterativeImputer(
        missing_values=0,
        max_iter=1,
        estimator=estimator,
        random_state=rng,
    )
    imputer.fit_transform(X)

    # The expected type does not depend on the triplet, so resolve it once.
    reference = BayesianRidge() if estimator is None else estimator
    expected_type = type(reference)

    seen = []
    for triplet in imputer.imputation_sequence_:
        assert isinstance(triplet.estimator, expected_type)
        seen.append(id(triplet.estimator))
    # identity-based uniqueness check
    assert len(seen) == len(set(seen))
def test_imputation_pickle():
    # A fitted imputer must transform data the same way before and after
    # a pickle round-trip, for every supported strategy.
    import pickle

    n = 100
    X = sparse_random_matrix(n, n, density=0.10).todense()

    for strategy in ["mean", "median", "most_frequent", "mice"]:
        # "mice" is not an Imputer strategy; it selects MICEImputer.
        if strategy != 'mice':
            imputer = Imputer(missing_values=0, strategy=strategy)
        else:
            imputer = MICEImputer(missing_values=0, n_imputations=1,
                                  n_burn_in=1)
        imputer.fit(X)

        restored = pickle.loads(pickle.dumps(imputer))

        # Fresh copies of X keep the two transforms independent.
        expected = imputer.transform(X.copy())
        actual = restored.transform(X.copy())
        assert_array_almost_equal(
            expected, actual,
            err_msg="Fail to transform the data after pickling "
                    "(strategy = %s)" % (strategy))
def test_imputation_copy():
    """Check that Imputer.transform does not write into its input.

    Runs with the default parameters and with an explicit ``copy=True``.
    """
    size = 5
    for extra_params in ({}, {'copy': True}):
        X_sparse = sparse_random_matrix(size, size, density=0.75,
                                        random_state=0)

        # Sparse input: mutate the output, then compare with the input.
        sparse_imputer = Imputer(missing_values=0, strategy="mean",
                                 **extra_params)
        Xt = sparse_imputer.fit(X_sparse).transform(X_sparse)
        Xt[0, 0] = np.nan
        # The output must be a different object with its own buffer.
        assert_false(np.all(X_sparse.todense() == Xt))

        # Densified input: same check on the result of todense().
        dense_imputer = Imputer(missing_values=0, strategy="mean",
                                **extra_params)
        X_dense = X_sparse.todense()
        Xt = dense_imputer.fit(X_dense).transform(X_dense)
        Xt[0, 0] = np.nan
        # The output must be a different object with its own buffer.
        assert_false(np.all(X_dense == Xt))
def test_iterative_imputer_zero_iters():
    # With zero iterations (requested up front or forced after fitting),
    # IterativeImputer must return exactly the initial imputation.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(n_samples, n_features, density=0.10,
                             random_state=rng).toarray()
    # Turn every zero entry into a missing value.
    X[X == 0] = np.nan

    imputer = IterativeImputer(max_iter=0)
    X_imputed = imputer.fit_transform(X)
    # with max_iter=0, only initial imputation is performed
    initial_only = imputer.initial_imputer_.transform(X)
    assert_allclose(X_imputed, initial_only)

    # repeat but force n_iter_ to 0
    imputer = IterativeImputer(max_iter=5).fit(X)
    # transformed should not be equal to initial imputation
    iterated = imputer.transform(X)
    initial_only = imputer.initial_imputer_.transform(X)
    assert not np.all(iterated == initial_only)

    imputer.n_iter_ = 0
    # now they should be equal as only initial imputation is done
    assert_allclose(imputer.transform(X),
                    imputer.initial_imputer_.transform(X))
# Reorder the boston samples and targets with the same index array `perm`
# (defined before this point) so rows and targets stay aligned.
boston.data = boston.data[perm]
boston.target = boston.target[perm]

# Load the digits dataset and shuffle samples and targets together using a
# permutation drawn from `rng` (defined before this point).
digits = datasets.load_digits()
perm = rng.permutation(digits.target.size)
digits.data = digits.data[perm]
digits.target = digits.target[perm]

# Seeded random state used for the synthetic arrays below.
random_state = check_random_state(0)

# Multilabel problem with 30 samples and 10 features.
X_multilabel, y_multilabel = datasets.make_multilabel_classification(
    random_state=0, n_samples=30, n_features=10)

# 20 x 5 array of uniform draws in which every entry <= 0.8 is set to zero,
# leaving only a minority of non-zero, positive entries.
X_sparse_pos = random_state.uniform(size=(20, 5))
X_sparse_pos[X_sparse_pos <= 0.8] = 0.

# 20 random integer labels drawn from {0, 1, 2, 3}.
y_random = random_state.randint(0, 4, size=(20, ))

# 20 x 10 random projection matrix with density 0.25.
# NOTE(review): the "sparse-mix" key suggests mixed-sign entries -- confirm.
X_sparse_mix = sparse_random_matrix(20, 10, density=0.25, random_state=0)

# Registry mapping a dataset name to its {"X": data, "y": target} pair.
# iris, X, y, X_small, y_small and y_small_reg are defined before this point.
DATASETS = {
    "iris": {"X": iris.data, "y": iris.target},
    "boston": {"X": boston.data, "y": boston.target},
    "digits": {"X": digits.data, "y": digits.target},
    "toy": {"X": X, "y": y},
    "clf_small": {"X": X_small, "y": y_small},
    "reg_small": {"X": X_small, "y": y_small_reg},
    "multilabel": {"X": X_multilabel, "y": y_multilabel},
    "sparse-pos": {"X": X_sparse_pos, "y": y_random},
    # Same sparsity pattern as "sparse-pos" with the sign flipped.
    "sparse-neg": {"X": - X_sparse_pos, "y": y_random},
    "sparse-mix": {"X": X_sparse_mix, "y": y_random},
    "zeros": {"X": np.zeros((20, 3)), "y": y_random}
}
# NOTE(review): TruncatedSVD is not used in the visible part of this script;
# the import is kept in case later code relies on it.
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

# Build a 100 x 100 random projection matrix with density 0.01, using a
# fixed seed so the printed output is reproducible.
X = sparse_random_matrix(100, 100, density=0.01, random_state=42)

# The original `print X` statement is a SyntaxError on Python 3. Calling
# print() with a single argument produces the same output on Python 2 and 3.
print(X)
'''
Created on Apr 25, 2013

@author: Nguyen Huu Hiep
'''
import numpy as np
import sklearn.random_projection as rp

# The original script used Python 2 `print` statements, which are a
# SyntaxError on Python 3. Every print below takes a single argument, so
# the print(...) form produces the same output on Python 2 and Python 3.

# Small 2 x 2 example: show the matrix and the norm of each column.
a = np.matrix([[1.0, 2.0], [3.0, 4.0]])
print(a)
print(np.linalg.norm(a[:, 0]))
print(np.linalg.norm(a[:, 1]))

# Random projection matrix with n_components rows and n_features columns.
n_components = 2000
n_features = 8192
phi = rp.sparse_random_matrix(n_components, n_features)
print(type(phi))

# Absolute value of phi^T * phi.
# NOTE(review): phi is presumably a scipy sparse matrix; wrapping it in
# np.matrix may yield a 1 x 1 object matrix instead of a dense copy, in
# which case prod.max() is not a scalar -- confirm the intent.
prod = np.abs(np.dot(np.matrix(phi.T), np.matrix(phi)))
print(type(prod))
print(type(prod.max()))
#print "max prod_ij =", prod.max().max()