Beispiel #1
0
def test_deprecations():
    """Both random-matrix helpers must emit the 0.22 FutureWarning."""
    # Each helper is invoked with the same positional arguments (10, 100).
    for deprecated_helper in (gaussian_random_matrix, sparse_random_matrix):
        with pytest.warns(FutureWarning, match="deprecated in 0.22"):
            deprecated_helper(10, 100)
Beispiel #2
0
def test_imputation_pipeline_grid_search():
    """Smoke-test an Imputer step inside a grid-searched pipeline."""
    steps = [
        ("imputer", Imputer(missing_values=0)),
        ("tree", tree.DecisionTreeRegressor(random_state=0)),
    ]
    param_grid = {
        "imputer__strategy": ["mean", "median", "most_frequent"],
        "imputer__axis": [0, 1],
    }

    # Square sparse feature matrix and a dense single-column target.
    size = 100
    features = sparse_random_matrix(size, size, density=0.10)
    target = sparse_random_matrix(size, 1, density=0.10).toarray()

    search = grid_search.GridSearchCV(Pipeline(steps), param_grid)
    search.fit(features, target)
def test_imputation_pipeline_grid_search():
    """Run a grid search over imputation strategies inside a pipeline."""
    imputer_step = ('imputer', SimpleImputer(missing_values=0))
    tree_step = ('tree', tree.DecisionTreeRegressor(random_state=0))
    model = Pipeline([imputer_step, tree_step])

    strategies = ["mean", "median", "most_frequent"]
    search_space = {'imputer__strategy': strategies}

    # 100 x 100 sparse features, dense 100 x 1 target.
    n_rows = 100
    inputs = sparse_random_matrix(n_rows, n_rows, density=0.10)
    outputs = sparse_random_matrix(n_rows, 1, density=0.10).toarray()

    # The test only checks that fitting completes without raising.
    GridSearchCV(model, search_space).fit(inputs, outputs)
def test_imputation_pipeline_grid_search():
    """Fit a GridSearchCV over a SimpleImputer + decision-tree pipeline."""
    estimator = Pipeline([
        ('imputer', SimpleImputer(missing_values=0)),
        ('tree', tree.DecisionTreeRegressor(random_state=0)),
    ])
    grid = dict([('imputer__strategy', ["mean", "median", "most_frequent"])])

    # Random sparse design matrix and a dense column of targets.
    design = sparse_random_matrix(100, 100, density=0.10)
    response = sparse_random_matrix(100, 1, density=0.10).toarray()

    searcher = GridSearchCV(estimator, grid)
    searcher.fit(design, response)
    def test_imputation_pipeline_grid_search(self):
        """Test imputation within a pipeline + gridsearch."""
        n_rows = 100
        steps = [
            ('imputer', Imputer(missing_values=0)),
            ('tree', tree.DecisionTreeRegressor(random_state=0)),
        ]
        search_space = {
            'imputer__strategy': ["mean", "median", "most_frequent"],
            'imputer__axis': [0, 1],
        }

        # Sparse square features and a dense single-column target.
        features = sparse_random_matrix(n_rows, n_rows, density=0.10)
        target = sparse_random_matrix(n_rows, 1, density=0.10).toarray()

        searcher = grid_search.GridSearchCV(Pipeline(steps), search_space)
        searcher.fit(features, target)
def test_imputation_pipeline_grid_search():
    """Test imputation within a pipeline + gridsearch.

    The test passes as long as fitting the grid search over the imputer's
    ``strategy`` and ``axis`` parameters completes without raising.
    """
    pipeline = Pipeline([('imputer', Imputer(missing_values=0)),
                         ('tree', tree.DecisionTreeRegressor(random_state=0))])

    parameters = {
        'imputer__strategy': ["mean", "median", "most_frequent"],
        'imputer__axis': [0, 1]
    }

    n_samples = 100
    X = sparse_random_matrix(n_samples, n_samples, density=0.10)
    # Use ``toarray`` (plain ndarray) rather than ``todense`` (numpy.matrix),
    # matching the other variants of this test.
    Y = sparse_random_matrix(n_samples, 1, density=0.10).toarray()
    gs = grid_search.GridSearchCV(pipeline, parameters)
    gs.fit(X, Y)
def test_mice_imputation_order(imputation_order):
    """Check the feature visiting order recorded by MICEImputer.

    ``imputation_order`` is supplied by the caller (presumably a pytest
    parametrization that is not part of this block).
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    data = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng).toarray()
    data[:, 0] = 1  # this column should not be discarded by MICEImputer

    mice = MICEImputer(missing_values=0,
                       n_imputations=1,
                       n_burn_in=1,
                       n_nearest_features=5,
                       min_value=0,
                       max_value=1,
                       verbose=False,
                       imputation_order=imputation_order,
                       random_state=rng)
    mice.fit_transform(data)
    visited = [step.feat_idx for step in mice.imputation_sequence_]

    # Column 0 contains no zeros, so only the other columns are expected
    # in the recorded sequence.
    n_imputed = n_features - 1
    first_round = visited[:n_imputed]
    if imputation_order == 'roman':
        assert np.all(first_round == np.arange(1, n_features))
    elif imputation_order == 'arabic':
        assert np.all(first_round == np.arange(n_features - 1, 0, -1))
    elif imputation_order == 'random':
        second_round = visited[n_imputed:]
        assert first_round != second_round
    elif 'ending' in imputation_order:
        assert len(visited) == 2 * n_imputed
Beispiel #8
0
def test_iterative_imputer_imputation_order(imputation_order):
    """Check the feature visiting order recorded by IterativeImputer.

    ``imputation_order`` is supplied by the caller (presumably a pytest
    parametrization that is not part of this block).
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    max_iter = 2
    data = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng).toarray()
    data[:, 0] = 1  # this column should not be discarded by IterativeImputer

    imputer = IterativeImputer(missing_values=0,
                               max_iter=max_iter,
                               n_nearest_features=5,
                               sample_posterior=False,
                               min_value=0,
                               max_value=1,
                               verbose=1,
                               imputation_order=imputation_order,
                               random_state=rng)
    imputer.fit_transform(data)
    visited = [step.feat_idx for step in imputer.imputation_sequence_]

    # Every iteration must visit each feature that has missing values.
    steps_per_iter = len(visited) // imputer.n_iter_
    assert steps_per_iter == imputer.n_features_with_missing_

    n_imputed = n_features - 1
    first_round = visited[:n_imputed]
    if imputation_order == 'roman':
        assert np.all(first_round == np.arange(1, n_features))
    elif imputation_order == 'arabic':
        assert np.all(first_round == np.arange(n_features - 1, 0, -1))
    elif imputation_order == 'random':
        second_round = visited[n_imputed:]
        assert first_round != second_round
    elif 'ending' in imputation_order:
        assert len(visited) == max_iter * n_imputed
Beispiel #9
0
def check_alternative_lrap_implementation(lrap_score,
                                          n_classes=5,
                                          n_samples=20,
                                          random_state=0):
    """Compare ``lrap_score`` with the reference ``_my_lrap`` implementation.

    Parameters
    ----------
    lrap_score : callable
        Scoring function under test, called as ``lrap_score(y_true, y_score)``.
        Previously this argument was ignored and
        ``label_ranking_average_precision_score`` was always used.
    n_classes : int
        Number of labels in the generated multilabel problem.
    n_samples : int
        Number of samples in the generated multilabel problem.
    random_state : int or RandomState
        Seed forwarded to the data generators.
    """
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    # Score with ties
    y_score = sparse_random_matrix(n_components=y_true.shape[0],
                                   n_features=y_true.shape[1],
                                   random_state=random_state)

    if hasattr(y_score, "toarray"):
        y_score = y_score.toarray()
    score_lrap = lrap_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)

    # Uniform score
    random_state = check_random_state(random_state)
    y_score = random_state.uniform(size=(n_samples, n_classes))
    score_lrap = lrap_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)
def test3(rnd_stream):
    """Run the 'test3' scenario and return the number of errors found.

    Builds a 3-input tile coding, projects it with a sparse random matrix,
    fits the generated GPs and sums the error counts returned by
    ``compare_nll`` and ``compare_mean_var``.
    """
    test_name = 'test3'
    sigma, ntiles, ntilings = 0.1, 10, 500

    print('##########################################################')
    print("Starting test '{0}'".format(test_name))

    # NOTE(review): the tile coding draws from the global ``np.random`` rather
    # than from ``rnd_stream`` -- confirm that this is intended.
    tilecoding = TileCoding(input_indices=[[0, 1, 2]],
                            ntiles=[ntiles],
                            ntilings=[ntilings],
                            state_range=[[0] * 3, [1.] * 3],
                            rnd_stream=np.random,
                            bias_term=False,
                            hashing=None)

    X, y, f, test_x = create_test_data3(rnd_stream)
    projection = rp.sparse_random_matrix(100, tilecoding.size)
    gps = generate_rp_tilecoding_gps(X,
                                     y,
                                     random_proj=projection,
                                     sigma=sigma,
                                     tilecoding=tilecoding,
                                     include_sparse=False)

    fit_gps(test_name, gps)

    nll_errors = compare_nll(gps)
    errors = nll_errors + compare_mean_var(test_x, gps)

    print("Ending test '{0}' with {1} errors".format(test_name, errors))
    return errors
Beispiel #11
0
    def fit(self, X, y):
        """Draw random hidden-layer parameters and solve the output weights.

        Parameters
        ----------
        X : array with features along axis 1 (``X.shape[1]`` is read)
            Training inputs, forwarded to ``self.transform``.
        y : array-like of class labels
            Binarized one-vs-all before the linear solve.

        Returns
        -------
        self
        """
        if self.activation is None:
            # Useful to quantify the impact of the non-linearity
            self._activate = lambda x: x
        else:
            self._activate = self.activations[self.activation]
        rng = check_random_state(self.random_state)

        # one-of-K coding for output values; ``classes`` is passed by keyword
        # because newer scikit-learn releases reject it as a positional argument
        self.classes_ = unique_labels(y)
        Y = label_binarize(y, classes=self.classes_)

        # set hidden layer parameters randomly
        n_features = X.shape[1]
        if self.rank is None:
            if self.density == 1:
                self.weights_ = rng.randn(n_features, self.n_hidden)
            else:
                # generated as (n_hidden, n_features), then transposed
                self.weights_ = sparse_random_matrix(self.n_hidden,
                                                     n_features,
                                                     density=self.density,
                                                     random_state=rng).T
        else:
            # Low rank weight matrix
            self.weights_u_ = rng.randn(n_features, self.rank)
            self.weights_v_ = rng.randn(self.rank, self.n_hidden)
        self.biases_ = rng.randn(self.n_hidden)

        # map the input data through the hidden layer
        H = self.transform(X)

        # fit the linear model on the hidden layer activation
        # NOTE(review): ``pinv2`` is deprecated/removed in recent SciPy
        # releases -- confirm against the SciPy version this file targets.
        self.beta_ = np.dot(pinv2(H), Y)
        return self
def test_mice_imputation_order(imputation_order):
    """Verify the order in which MICEImputer visits the features.

    ``imputation_order`` is provided by the caller (presumably through a
    pytest parametrization outside this block).
    """
    rng = np.random.RandomState(0)
    rows = 100
    cols = 10
    sparse_X = sparse_random_matrix(rows, cols, density=0.10,
                                    random_state=rng)
    X = sparse_X.toarray()
    X[:, 0] = 1  # this column should not be discarded by MICEImputer

    settings = dict(missing_values=0,
                    n_imputations=1,
                    n_burn_in=1,
                    n_nearest_features=5,
                    min_value=0,
                    max_value=1,
                    verbose=False,
                    imputation_order=imputation_order,
                    random_state=rng)
    imputer = MICEImputer(**settings)
    imputer.fit_transform(X)
    sequence = [entry.feat_idx for entry in imputer.imputation_sequence_]

    # Only the cols - 1 columns that contain zeros are expected per round.
    round_len = cols - 1
    if imputation_order == 'roman':
        ascending = np.arange(1, cols)
        assert np.all(sequence[:round_len] == ascending)
    elif imputation_order == 'arabic':
        descending = np.arange(cols - 1, 0, -1)
        assert np.all(sequence[:round_len] == descending)
    elif imputation_order == 'random':
        assert sequence[:round_len] != sequence[round_len:]
    elif 'ending' in imputation_order:
        assert len(sequence) == 2 * (cols - 1)
Beispiel #13
0
def test_iterative_imputer_imputation_order(imputation_order):
    """Verify the order in which IterativeImputer visits the features.

    ``imputation_order`` is provided by the caller (presumably through a
    pytest parametrization outside this block).
    """
    rng = np.random.RandomState(0)
    rows = 100
    cols = 10
    max_iter = 2
    sparse_X = sparse_random_matrix(rows, cols, density=0.10,
                                    random_state=rng)
    X = sparse_X.toarray()
    X[:, 0] = 1  # this column should not be discarded by IterativeImputer

    settings = dict(missing_values=0,
                    max_iter=max_iter,
                    n_nearest_features=5,
                    sample_posterior=False,
                    min_value=0,
                    max_value=1,
                    verbose=1,
                    imputation_order=imputation_order,
                    random_state=rng)
    imputer = IterativeImputer(**settings)
    imputer.fit_transform(X)
    sequence = [entry.feat_idx for entry in imputer.imputation_sequence_]

    # Each iteration must contribute one step per feature with missing values.
    per_iteration = len(sequence) // imputer.n_iter_
    assert per_iteration == imputer.n_features_with_missing_

    round_len = cols - 1
    if imputation_order == 'roman':
        ascending = np.arange(1, cols)
        assert np.all(sequence[:round_len] == ascending)
    elif imputation_order == 'arabic':
        descending = np.arange(cols - 1, 0, -1)
        assert np.all(sequence[:round_len] == descending)
    elif imputation_order == 'random':
        assert sequence[:round_len] != sequence[round_len:]
    elif 'ending' in imputation_order:
        assert len(sequence) == max_iter * (cols - 1)
Beispiel #14
0
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
                                          n_samples=20, random_state=0):
    """Check that ``lrap_score`` agrees with the reference ``_my_lrap``.

    Parameters
    ----------
    lrap_score : callable
        Scoring function under test, called as ``lrap_score(y_true, y_score)``.
        It used to be ignored in favour of a hard-coded call to
        ``label_ranking_average_precision_score``.
    n_classes : int
        Number of labels in the generated multilabel problem.
    n_samples : int
        Number of samples in the generated multilabel problem.
    random_state : int or RandomState
        Seed forwarded to the data generators.
    """
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    # Score with ties
    y_score = sparse_random_matrix(n_components=y_true.shape[0],
                                   n_features=y_true.shape[1],
                                   random_state=random_state)

    if hasattr(y_score, "toarray"):
        y_score = y_score.toarray()
    score_lrap = lrap_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)

    # Uniform score
    random_state = check_random_state(random_state)
    y_score = random_state.uniform(size=(n_samples, n_classes))
    score_lrap = lrap_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)
Beispiel #15
0
def test_mice_imputation_order():
    """Check MICEImputer's visiting order for several ``imputation_order``s."""
    n_samples, n_features = 100, 10
    data = sparse_random_matrix(n_samples, n_features, density=0.10).toarray()
    data[:, 0] = 1  # this column shouldn't be ever used

    orders = ('random', 'roman', 'monotone', 'revmonotone', 'arabic')
    n_imputed = n_features - 1
    for order in orders:
        mice = MICEImputer(missing_values=0,
                           n_imputations=1,
                           n_burn_in=1,
                           n_nearest_features=5,
                           min_value=0,
                           max_value=1,
                           verbose=False,
                           imputation_order=order)
        mice.fit_transform(data)
        visited = [step.feat_idx for step in mice.imputation_sequence_]
        first_round = visited[:n_imputed]

        # 'monotone' and 'revmonotone' are only checked for running cleanly.
        if order == 'roman':
            assert np.all(first_round == np.arange(1, n_features))
        elif order == 'arabic':
            assert np.all(first_round == np.arange(n_features - 1, 0, -1))
        elif order == 'random':
            second_round = visited[n_imputed:]
            assert first_round != second_round
def test_as_float_array():
    """Exercise as_float_array: dtype promotion, copying and memory order."""
    int32_data = np.ones((3, 10), dtype=np.int32)
    int32_data = int32_data + np.arange(10, dtype=np.int32)
    # int32 input must come back as float32
    converted = as_float_array(int32_data, copy=False)
    np.testing.assert_equal(converted.dtype, np.float32)

    # int64 input: converted below, dtype checked after the identity check
    int64_data = int32_data.astype(np.int64)
    converted = as_float_array(int64_data, copy=True)
    # Integer input can never be returned as the same object
    assert_true(as_float_array(int64_data, False) is not int64_data)
    # int64 input must come back as float64
    np.testing.assert_equal(converted.dtype, np.float64)

    # Float input with copy=False must be handed back untouched
    float_data = np.ones((3, 2), dtype=np.float32)
    assert_true(as_float_array(float_data, copy=False) is float_data)
    # Fortran ordering must survive the conversion
    fortran_data = np.asfortranarray(float_data)
    assert_true(np.isfortran(as_float_array(fortran_data, copy=True)))

    # With copy=True, writing NaN into the result must not reach the source
    sources = [
        np.matrix(np.arange(5)),
        sp.csc_matrix(np.arange(5)).toarray(),
        sparse_random_matrix(10, 10, density=0.10).toarray()
    ]
    for source in sources:
        duplicate = as_float_array(source, copy=True)
        duplicate[0, 0] = np.nan
        assert_false(np.isnan(source).any())
Beispiel #17
0
    def fit(self, X, y):
        """Fit the random hidden layer, then the linear output layer.

        Parameters
        ----------
        X : array with features along axis 1 (``X.shape[1]`` is read)
            Training inputs, forwarded to ``self.transform``.
        y : array-like of class labels
            Binarized one-vs-all before the linear solve.

        Returns
        -------
        self
        """
        if self.activation is None:
            # Useful to quantify the impact of the non-linearity
            self._activate = lambda x: x
        else:
            self._activate = self.activations[self.activation]
        rng = check_random_state(self.random_state)

        # one-of-K coding for output values; ``classes`` goes by keyword since
        # newer scikit-learn releases no longer accept it positionally
        self.classes_ = unique_labels(y)
        Y = label_binarize(y, classes=self.classes_)

        # set hidden layer parameters randomly
        n_features = X.shape[1]
        if self.rank is None:
            if self.density == 1:
                self.weights_ = rng.randn(n_features, self.n_hidden)
            else:
                # generated as (n_hidden, n_features), then transposed
                self.weights_ = sparse_random_matrix(
                    self.n_hidden, n_features, density=self.density,
                    random_state=rng).T
        else:
            # Low rank weight matrix
            self.weights_u_ = rng.randn(n_features, self.rank)
            self.weights_v_ = rng.randn(self.rank, self.n_hidden)
        self.biases_ = rng.randn(self.n_hidden)

        # map the input data through the hidden layer
        H = self.transform(X)

        # fit the linear model on the hidden layer activation
        # NOTE(review): ``pinv2`` is deprecated/removed in recent SciPy
        # releases -- confirm against the SciPy version this file targets.
        self.beta_ = np.dot(pinv2(H), Y)
        return self
 def __init__(self, n_features=784, n_components=20, random_state=None):
     """Store the dimensions and build the ``decode`` projection tensor.

     ``decode`` is a float tensor created from a sparse random matrix
     generated with ``(n_components, n_features, random_state=...)``.
     """
     super().__init__()
     self.n_features = n_features
     self.n_components = n_components
     projection = sparse_random_matrix(self.n_components,
                                       self.n_features,
                                       random_state=random_state)
     dense_projection = projection.todense()
     self.decode = torch.Tensor(dense_projection).float()
Beispiel #19
0
def test_imputation_copy():
    """Check when Imputer.transform returns a copy and when it works in place.

    Each scenario imputes, overwrites one entry of the result, and then
    checks whether the input changed as well.
    """
    base = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # dense input, copy=True: result is independent of the input
    data = base.copy().toarray()
    imp = Imputer(missing_values=0, strategy="mean", copy=True)
    out = imp.fit(data).transform(data)
    out[0, 0] = -1
    assert_false(np.all(data == out))

    # sparse csr input, copy=True: result is independent of the input
    data = base.copy()
    imp = Imputer(missing_values=data.data[0], strategy="mean", copy=True)
    out = imp.fit(data).transform(data)
    out.data[0] = -1
    assert_false(np.all(data.data == out.data))

    # dense input, copy=False: result shares memory with the input
    data = base.copy().toarray()
    imp = Imputer(missing_values=0, strategy="mean", copy=False)
    out = imp.fit(data).transform(data)
    out[0, 0] = -1
    assert_true(np.all(data == out))

    # sparse csr input, axis=1, copy=False: no copy
    data = base.copy()
    imp = Imputer(missing_values=data.data[0], strategy="mean",
                  copy=False, axis=1)
    out = imp.fit(data).transform(data)
    out.data[0] = -1
    assert_true(np.all(data.data == out.data))

    # sparse csc input, axis=0, copy=False: no copy
    data = base.copy().tocsc()
    imp = Imputer(missing_values=data.data[0], strategy="mean",
                  copy=False, axis=0)
    out = imp.fit(data).transform(data)
    out.data[0] = -1
    assert_true(np.all(data.data == out.data))

    # sparse csr input, axis=0, copy=False: a copy is made anyway
    data = base.copy()
    imp = Imputer(missing_values=data.data[0], strategy="mean",
                  copy=False, axis=0)
    out = imp.fit(data).transform(data)
    out.data[0] = -1
    assert_false(np.all(data.data == out.data))

    # sparse csc input, axis=1, copy=False: a copy is made anyway
    data = base.copy().tocsc()
    imp = Imputer(missing_values=data.data[0], strategy="mean",
                  copy=False, axis=1)
    out = imp.fit(data).transform(data)
    out.data[0] = -1
    assert_false(np.all(data.data == out.data))

    # sparse csr input, axis=1, missing_values=0, copy=False:
    # the result is expected to be a (dense) copy
    data = base.copy()
    imp = Imputer(missing_values=0, strategy="mean", copy=False, axis=1)
    out = imp.fit(data).transform(data)
    assert_false(sparse.issparse(out))
Beispiel #20
0
 def test_calc_k_mean_sparse(self):
     """calc_k_mean on sparse input must return outputs of the expected shapes."""
     n_samples, n_features, n_components = 1000, 30, 20
     sparse_data = sparse_random_matrix(n_samples, n_features,
                                        density=0.01, random_state=42)
     result = calc_k_mean(sparse_data, n_clusters=n_components, sparse=True)

     # First output: one entry per sample; second: one row per cluster.
     per_sample = result[0]
     per_cluster = result[1]
     self.assertEqual(per_sample.shape, (n_samples,))
     self.assertEqual(per_cluster.shape, (n_components, n_features))
     print(result[1])
Beispiel #21
0
def test_deprecated_imputer_axis():
    """Fitting Imputer with an explicit ``axis`` must warn about deprecation."""
    expected = ("Parameter 'axis' has been deprecated in 0.20 and will "
                "be removed in 0.22. Future (and default) behavior is "
                "equivalent to 'axis=0' (impute along columns). Row-wise "
                "imputation can be performed with FunctionTransformer.")
    data = sparse_random_matrix(5, 5, density=0.75, random_state=0)
    # Both axis values must trigger the same warning message.
    for axis in (0, 1):
        imputer = Imputer(missing_values=0, axis=axis)
        assert_warns_message(DeprecationWarning, expected, imputer.fit, data)
Beispiel #22
0
def test_deprecated_imputer_axis():
    """Check the DeprecationWarning raised when ``axis`` is given to Imputer."""
    warning_text = ("Parameter 'axis' has been deprecated in 0.20 and will "
                    "be removed in 0.22. Future (and default) behavior is "
                    "equivalent to 'axis=0' (impute along columns). Row-wise "
                    "imputation can be performed with FunctionTransformer.")
    matrix = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # One imputer per axis value, built in the order axis=0 then axis=1.
    imputers = [Imputer(missing_values=0, axis=0)]
    assert_warns_message(DeprecationWarning, warning_text,
                         imputers[0].fit, matrix)
    imputers.append(Imputer(missing_values=0, axis=1))
    assert_warns_message(DeprecationWarning, warning_text,
                         imputers[1].fit, matrix)
Beispiel #23
0
def test_iterative_imputer_verbose():
    """Smoke-test IterativeImputer fit/transform at verbosity 1 and 2."""
    rng = np.random.RandomState(0)

    n_samples, n_features = 100, 3
    data = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng).toarray()

    # The test only requires that neither verbosity level raises.
    for verbosity in (1, 2):
        imputer = IterativeImputer(missing_values=0, max_iter=1,
                                   verbose=verbosity)
        imputer.fit(data)
        imputer.transform(data)
Beispiel #24
0
def test_iterative_imputer_verbose():
    """Run IterativeImputer with verbose=1 and verbose=2 without asserting."""
    rng = np.random.RandomState(0)

    rows = 100
    cols = 3
    sparse_X = sparse_random_matrix(rows, cols, density=0.10,
                                    random_state=rng)
    X = sparse_X.toarray()

    def fit_then_transform(verbose):
        # Build a fresh imputer for the requested verbosity and run it once.
        imputer = IterativeImputer(missing_values=0, max_iter=1,
                                   verbose=verbose)
        imputer.fit(X)
        imputer.transform(X)

    fit_then_transform(1)
    fit_then_transform(2)
    def get_tilegp(self):
        """Build a tile-coding based GP for ``self.gp_type``, fit and return it.

        Input dimensions are grouped by the distinct values in ``self.z``;
        each group gets its own tiling layout and, when ``self.tilecap`` is
        set, its own hashing object.
        """
        nlayers = self.options['nlayers']
        categories = np.unique(self.z)
        n_categories = len(categories)

        if self.tilecap:
            # memory budget per hashing object
            hashing_mem = self.tilecap / n_categories / nlayers
            hashing = [rp.UNH(hashing_mem) for _ in xrange(n_categories)]
        else:
            hashing = None

        # one index group (and matching tile counts) per category
        indices = [helper.find(self.z == category) for category in categories]
        ntiles = [self.k[group] for group in indices]

        phi = TileCoding(
            input_indices=indices,
            # ntiles = input dim x number of layers x tilings
            ntiles=ntiles,
            ntilings=[nlayers] * len(indices),
            hashing=hashing,
            state_range=self.x_range,
            rnd_stream=np.random,
            bias_term=False)

        gp_type = self.gp_type
        if gp_type == 'sk':
            # densekern > sparsekern \approx sparserp
            kernel = SparseKernel(phi, normalize=True)
            gp = SparseKernelGP(self.X, self.y, sigma=0.1, kern=kernel)
        elif gp_type == 'sf':
            # too slow
            features = IndexToBinarySparse(phi, normalize=True)
            gp = SparseFeatureGP(self.X, self.y, sigma=0.1, phi=features)
        elif gp_type == 'dk':
            kernel = DenseKernel(phi, normalize=True)
            gp = DenseKernelGP(self.X,
                               self.y,
                               sigma=self.sigma,
                               kern=kernel)
        else:
            # any other gp_type: dense features via sparse random projection
            projection = rp.sparse_random_matrix(300,
                                                 phi.size,
                                                 random_state=np.random)
            features = SparseRPTilecoding(phi,
                                          random_proj=projection,
                                          normalize=True,
                                          output_dense=True)
            gp = DenseFeatureGP(self.X, self.y, sigma=self.sigma, phi=features)

        gp.fit()
        return gp
Beispiel #26
0
def test_mice_predictors():
    """Smoke-test MICEImputer with two different predictor estimators."""
    from sklearn.dummy import DummyRegressor
    from sklearn.linear_model import BayesianRidge

    n_samples, n_features = 100, 10
    data = sparse_random_matrix(n_samples, n_features, density=0.10).toarray()

    # A fresh predictor instance is created for each imputer.
    for predictor_cls in (DummyRegressor, BayesianRidge):
        mice = MICEImputer(missing_values=0,
                           n_imputations=1,
                           n_burn_in=1,
                           predictor=predictor_cls())
        mice.fit_transform(data)
Beispiel #27
0
def main():
    """
    Build a sparse random matrix with one row per vocabulary entry and save it.

    Command-line arguments are parsed with docopt from the usage text below.
    """

    # Usage text; docopt derives the argument names from it.
    usage = '''Create low-dimensional and sparse random matrix from vocabulary file.

    Usage:
        random.py <vocabFile> <outPath> <dim>

        <vocabFile> = row and column vocabulary
        <outPath> = output path for random matrix
        <dim> = dimensionality for random vectors

    Note:
        Calculates number of seeds automatically as proposed in [1,2]

    References:
        [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf
        [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf

    '''
    args = docopt(usage)
    #np.random.seed(0) # uncomment for reproducibility

    vocab_path = args['<vocabFile>']
    out_path = args['<outPath>']
    dim = int(args['<dim>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load vocabulary: one stripped entry per line of the file
    logging.info("Loading vocabulary")
    with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
        vocabulary = [entry.strip() for entry in vocab_file]

    # Generate random vectors: (dim x vocab) projection, transposed so that
    # each vocabulary entry gets one dim-sized row
    projection = sparse_random_matrix(dim, len(vocabulary))
    random_matrix = projection.toarray().T

    # Store random matrix
    space = Space(matrix=random_matrix, rows=vocabulary, columns=[])
    space.save(out_path)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Beispiel #28
0
def test_imputation_copy():
    """Check when SimpleImputer.transform copies and when it works in place.

    Each scenario imputes, overwrites one entry of the result, and then
    checks whether the input changed as well.
    """
    base = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # dense input, copy=True: result is independent of the input
    data = base.copy().toarray()
    imp = SimpleImputer(missing_values=0, strategy="mean", copy=True)
    out = imp.fit(data).transform(data)
    out[0, 0] = -1
    assert_false(np.all(data == out))

    # sparse csr input, copy=True: result is independent of the input
    data = base.copy()
    imp = SimpleImputer(missing_values=data.data[0], strategy="mean",
                        copy=True)
    out = imp.fit(data).transform(data)
    out.data[0] = -1
    assert_false(np.all(data.data == out.data))

    # dense input, copy=False: result shares memory with the input
    data = base.copy().toarray()
    imp = SimpleImputer(missing_values=0, strategy="mean", copy=False)
    out = imp.fit(data).transform(data)
    out[0, 0] = -1
    assert_array_almost_equal(data, out)

    # sparse csr input, axis=1, copy=False: no copy
    # NOTE(review): ``axis`` is passed to SimpleImputer here -- confirm that
    # the targeted scikit-learn version still accepts it.
    data = base.copy()
    imp = SimpleImputer(missing_values=data.data[0], strategy="mean",
                        copy=False, axis=1)
    out = imp.fit(data).transform(data)
    out.data[0] = -1
    assert_array_almost_equal(data.data, out.data)

    # sparse csc input, copy=False: no copy
    data = base.copy().tocsc()
    imp = SimpleImputer(missing_values=data.data[0], strategy="mean",
                        copy=False)
    out = imp.fit(data).transform(data)
    out.data[0] = -1
    assert_array_almost_equal(data.data, out.data)

    # sparse csr input, copy=False: a copy is made anyway
    data = base.copy()
    imp = SimpleImputer(missing_values=data.data[0], strategy="mean",
                        copy=False)
    out = imp.fit(data).transform(data)
    out.data[0] = -1
    assert_false(np.all(data.data == out.data))
def test_sparse_random_matrix():
    """Check some statistical properties of sparse random matrix"""
    n_components = 100
    n_features = 500

    for density in [0.3, 1.]:
        s = 1 / density
        # magnitude of the non-zero entries
        magnitude = np.sqrt(s) / np.sqrt(n_components)

        A = densify(sparse_random_matrix(n_components,
                                         n_features,
                                         density=density,
                                         random_state=0))

        # Check possible values
        values = np.unique(A)
        assert_in(magnitude, values)
        assert_in(-magnitude, values)

        if density == 1.0:
            assert_equal(np.size(values), 2)
        else:
            assert_in(0., values)
            assert_equal(np.size(values), 3)

        # Check that the random matrix follow the proper distribution.
        # Let's say that each element of a_{ij} of A is taken from
        #
        # - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        # -  0                              with probability 1 - 1 / s
        # - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        #
        is_zero = A == 0.0
        is_positive = A == magnitude
        is_negative = A == -magnitude

        # empirical frequencies of the three possible values
        assert_almost_equal(np.mean(is_zero), 1 - 1 / s, decimal=2)
        assert_almost_equal(np.mean(is_positive), 1 / (2 * s), decimal=2)
        assert_almost_equal(np.mean(is_negative), 1 / (2 * s), decimal=2)

        # variances of the corresponding indicator variables
        assert_almost_equal(np.var(is_zero, ddof=1),
                            (1 - 1 / s) * 1 / s,
                            decimal=2)
        assert_almost_equal(np.var(is_positive, ddof=1),
                            (1 - 1 / (2 * s)) * 1 / (2 * s),
                            decimal=2)
        assert_almost_equal(np.var(is_negative, ddof=1),
                            (1 - 1 / (2 * s)) * 1 / (2 * s),
                            decimal=2)
Beispiel #30
0
def test_iterative_imputer_clip():
    """Imputed values must be clipped to [min_value, max_value]."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    data = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng).toarray()

    lower, upper = 0.1, 0.2
    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               min_value=lower,
                               max_value=upper,
                               random_state=rng)

    imputed = imputer.fit_transform(data)
    # The mask is taken after fitting, as in the original assertions.
    was_missing = data == 0
    assert_allclose(np.min(imputed[was_missing]), lower)
    assert_allclose(np.max(imputed[was_missing]), upper)
    # Observed entries must pass through unchanged.
    assert_allclose(imputed[~was_missing], data[~was_missing])
Beispiel #31
0
def test_iterative_imputer_transform_stochasticity():
    """transform is stochastic only when posterior sampling is enabled."""
    pytest.importorskip("scipy", minversion="0.17.0")
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    n_samples, n_features = 100, 10
    data = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng1).toarray()

    # when sample_posterior=True, two transforms shouldn't be equal
    sampling_imputer = IterativeImputer(missing_values=0,
                                        max_iter=1,
                                        sample_posterior=True,
                                        random_state=rng1)
    sampling_imputer.fit(data)

    first_draw = sampling_imputer.transform(data)
    second_draw = sampling_imputer.transform(data)

    # sufficient to assert that the means are not the same
    assert np.mean(first_draw) != pytest.approx(np.mean(second_draw))

    # when sample_posterior=False, and n_nearest_features=None
    # and imputation_order is not random
    # the two transforms should be identical even if rng are different
    deterministic = dict(missing_values=0,
                         max_iter=1,
                         sample_posterior=False,
                         n_nearest_features=None,
                         imputation_order='ascending')
    imputer_a = IterativeImputer(random_state=rng1, **deterministic)
    imputer_b = IterativeImputer(random_state=rng2, **deterministic)
    imputer_a.fit(data)
    imputer_b.fit(data)

    a_first = imputer_a.transform(data)
    a_second = imputer_a.transform(data)
    b_only = imputer_b.transform(data)

    assert_allclose(a_first, a_second)
    assert_allclose(a_first, b_only)
Beispiel #32
0
def test_iterative_imputer_transform_stochasticity():
    """Contrast stochastic and deterministic ``IterativeImputer`` transforms."""
    pytest.importorskip("scipy", minversion="0.17.0")
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    shape = (100, 10)
    X = sparse_random_matrix(shape[0], shape[1], density=0.10,
                             random_state=rng1).toarray()

    # Posterior sampling enabled: repeated transforms are expected to differ.
    stochastic = IterativeImputer(missing_values=0, max_iter=1,
                                  sample_posterior=True, random_state=rng1)
    stochastic.fit(X)
    draws = [stochastic.transform(X) for _ in range(2)]

    # Differing means are sufficient evidence that the draws differ.
    assert np.mean(draws[0]) != pytest.approx(np.mean(draws[1]))

    # Posterior sampling disabled, all features used and a fixed imputation
    # order: the result should be the same whatever the random state is.
    imputer_a, imputer_b = [
        IterativeImputer(missing_values=0,
                         max_iter=1,
                         sample_posterior=False,
                         n_nearest_features=None,
                         imputation_order='ascending',
                         random_state=seed_source)
        for seed_source in (rng1, rng2)
    ]
    for imputer in (imputer_a, imputer_b):
        imputer.fit(X)

    repeat_1 = imputer_a.transform(X)
    repeat_2 = imputer_a.transform(X)
    other = imputer_b.transform(X)

    assert_allclose(repeat_1, repeat_2)
    assert_allclose(repeat_1, other)
def test_sparse_random_matrix():
    """Check some statistical properties of the sparse random matrix."""
    n_components = 100
    n_features = 500

    for density in [0.3, 1.]:
        s = 1 / density
        # Magnitude shared by every non-zero entry.
        scale = np.sqrt(s) / np.sqrt(n_components)

        A = densify(sparse_random_matrix(n_components,
                                         n_features,
                                         density=density,
                                         random_state=0))

        # Only -scale, +scale and (unless fully dense) 0 may occur.
        values = np.unique(A)
        assert_in(scale, values)
        assert_in(-scale, values)

        if density == 1.0:
            assert_equal(np.size(values), 2)
        else:
            assert_in(0., values)
            assert_equal(np.size(values), 3)

        # Each element a_{ij} of A is expected to be drawn as
        #
        # - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        # -  0                              with probability 1 - 1 / s
        # - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        #
        # so each indicator below is a Bernoulli variable with mean p and
        # variance (1 - p) * p.
        p_zero = 1 - 1 / s
        p_nonzero = 1 / (2 * s)
        indicators = [
            (A == 0.0, p_zero, p_zero / s),
            (A == scale, p_nonzero, (1 - p_nonzero) / (2 * s)),
            (A == -scale, p_nonzero, (1 - p_nonzero) / (2 * s)),
        ]

        for mask, expected_mean, _ in indicators:
            assert_almost_equal(np.mean(mask), expected_mean, decimal=2)

        for mask, _, expected_var in indicators:
            assert_almost_equal(np.var(mask, ddof=1), expected_var,
                                decimal=2)
def test_imputation_pickle():
    """Check that a fitted imputer transforms the same after pickling."""
    import pickle

    size = 100
    X = sparse_random_matrix(size, size, density=0.10)

    for strategy in ("mean", "median", "most_frequent"):
        fitted = Imputer(missing_values=0, strategy=strategy)
        fitted.fit(X)

        # Round trip through pickle and compare against the original.
        restored = pickle.loads(pickle.dumps(fitted))

        expected = fitted.transform(X.copy())
        actual = restored.transform(X.copy())
        message = ("Fail to transform the data after pickling "
                   "(strategy = %s)" % (strategy))
        assert_array_equal(expected, actual, message)
def test_mice_transform_stochasticity():
    """Two successive ``MICEImputer.transform`` calls should differ."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng
    ).toarray()

    mice = MICEImputer(
        missing_values=0, n_imputations=1, n_burn_in=1, random_state=rng
    )
    mice.fit(X)

    first_mean = np.mean(mice.transform(X))
    second_mean = np.mean(mice.transform(X))

    # Different means are sufficient to show the outputs are not equal.
    assert first_mean != pytest.approx(second_mean)
Beispiel #36
0
def test_iterative_imputer_clip():
    """Imputed values must span exactly ``[min_value, max_value]``."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng
    ).toarray()

    clipping_imputer = IterativeImputer(
        missing_values=0,
        max_iter=1,
        min_value=0.1,
        max_value=0.2,
        random_state=rng
    )
    Xt = clipping_imputer.fit_transform(X)

    # Zeros mark the missing entries; everything else was observed.
    missing = X == 0
    observed = X != 0

    assert_allclose(np.min(Xt[missing]), 0.1)
    assert_allclose(np.max(Xt[missing]), 0.2)
    # Observed entries must pass through unchanged.
    assert_allclose(Xt[observed], X[observed])
Beispiel #37
0
def test_imputation_pickle():
    """Pickling and unpickling a fitted ``Imputer`` must keep its output."""
    import pickle

    n_rows = n_cols = 100
    X = sparse_random_matrix(n_rows, n_cols, density=0.10)

    strategies = ["mean", "median", "most_frequent"]
    for strategy in strategies:
        original = Imputer(missing_values=0, strategy=strategy)
        original.fit(X)

        clone = pickle.loads(pickle.dumps(original))

        # Each imputer gets its own copy of X, as in a fresh transform call.
        out_original = original.transform(X.copy())
        out_clone = clone.transform(X.copy())
        failure_text = ("Fail to transform the data after pickling "
                        "(strategy = %s)" % (strategy))
        assert_array_equal(out_original, out_clone, failure_text)
def test_mice_transform_stochasticity():
    """Repeated transforms of a fitted ``MICEImputer`` should not agree."""
    rng = np.random.RandomState(0)
    shape = (100, 10)
    X = sparse_random_matrix(shape[0], shape[1], density=0.10,
                             random_state=rng).toarray()

    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          random_state=rng)
    imputer.fit(X)

    # Transform the same data twice with the same fitted imputer.
    draws = [imputer.transform(X) for _ in range(2)]

    # It is sufficient to assert that the means are not the same.
    assert np.mean(draws[0]) != pytest.approx(np.mean(draws[1]))
Beispiel #39
0
def test_as_float_array():
    """Exercise ``as_float_array``: dtype conversion, copying and layout."""
    # int32 input is expected to come back as float32.
    X = np.ones((3, 10), dtype=np.int32)
    X = X + np.arange(10, dtype=np.int32)
    X2 = as_float_array(X, copy=False)
    assert_equal(X2.dtype, np.float32)
    # int64 input is expected to come back as float64.
    X = X.astype(np.int64)
    X2 = as_float_array(X, copy=True)
    # Checking that the array wasn't overwritten
    assert as_float_array(X, False) is not X
    assert_equal(X2.dtype, np.float64)
    # Test int dtypes <= 32bit.
    # The builtin ``bool`` is used instead of ``np.bool``: that alias was
    # deprecated in NumPy 1.20 and removed in 1.24, where it raises
    # AttributeError.  ``bool`` names the same dtype on every version.
    tested_dtypes = [
        bool, np.int8, np.int16, np.int32, np.uint8, np.uint16, np.uint32
    ]
    for dtype in tested_dtypes:
        X = X.astype(dtype)
        X2 = as_float_array(X)
        assert_equal(X2.dtype, np.float32)

    # Test object dtype
    X = X.astype(object)
    X2 = as_float_array(X, copy=True)
    assert_equal(X2.dtype, np.float64)

    # Here, X is of the right type, it shouldn't be modified
    X = np.ones((3, 2), dtype=np.float32)
    assert as_float_array(X, copy=False) is X
    # Test that if X is fortran ordered it stays
    X = np.asfortranarray(X)
    assert np.isfortran(as_float_array(X, copy=True))

    # Test the copy parameter with some matrices: writing NaN into the
    # returned copy must leave the input free of NaN.
    matrices = [
        np.matrix(np.arange(5)),
        sp.csc_matrix(np.arange(5)).toarray(),
        sparse_random_matrix(10, 10, density=0.10).toarray()
    ]
    for M in matrices:
        N = as_float_array(M, copy=True)
        N[0, 0] = np.nan
        assert not np.isnan(M).any()
Beispiel #40
0
def test_mice_pipeline_grid_search():
    """Smoke test: grid-search a ``MICEImputer`` + decision tree pipeline."""
    n_samples, n_features = 100, 10

    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          random_state=0)
    regressor = tree.DecisionTreeRegressor(random_state=0)
    pipeline = Pipeline([('imputer', imputer), ('tree', regressor)])

    # Only the strategy of the initial imputation is searched over.
    param_grid = {
        'imputer__initial_strategy': ["mean", "median", "most_frequent"]
    }

    X = sparse_random_matrix(n_samples, n_features, density=0.50).toarray()
    Y = np.random.random((n_samples, n_features))

    search = GridSearchCV(pipeline, param_grid)
    search.fit(X, Y)
def test_as_float_array():
    """Check dtype, copy and memory-order handling of ``as_float_array``."""
    # 32-bit integers are expected to be converted to float32.
    X = np.ones((3, 10), dtype=np.int32)
    X = X + np.arange(10, dtype=np.int32)
    X2 = as_float_array(X, copy=False)
    assert_equal(X2.dtype, np.float32)
    # 64-bit integers are expected to be converted to float64.
    X = X.astype(np.int64)
    X2 = as_float_array(X, copy=True)
    # Checking that the array wasn't overwritten
    assert as_float_array(X, False) is not X
    assert_equal(X2.dtype, np.float64)
    # Test int dtypes <= 32bit.
    # ``np.bool`` is replaced by the builtin ``bool``: the NumPy alias was
    # deprecated in 1.20 and removed in 1.24 (AttributeError), while the
    # builtin designates the same dtype on all versions.
    tested_dtypes = [bool,
                     np.int8, np.int16, np.int32,
                     np.uint8, np.uint16, np.uint32]
    for dtype in tested_dtypes:
        X = X.astype(dtype)
        X2 = as_float_array(X)
        assert_equal(X2.dtype, np.float32)

    # Test object dtype
    X = X.astype(object)
    X2 = as_float_array(X, copy=True)
    assert_equal(X2.dtype, np.float64)

    # Here, X is of the right type, it shouldn't be modified
    X = np.ones((3, 2), dtype=np.float32)
    assert as_float_array(X, copy=False) is X
    # Test that if X is fortran ordered it stays
    X = np.asfortranarray(X)
    assert np.isfortran(as_float_array(X, copy=True))

    # Test the copy parameter with some matrices: NaN written to the copy
    # must not show up in the original.
    matrices = [
        np.matrix(np.arange(5)),
        sp.csc_matrix(np.arange(5)).toarray(),
        sparse_random_matrix(10, 10, density=0.10).toarray()
    ]
    for M in matrices:
        N = as_float_array(M, copy=True)
        N[0, 0] = np.nan
        assert not np.isnan(M).any()
Beispiel #42
0
def test_iterative_imputer_clip_truncnorm():
    """Clipping bounds must also hold when sampling from the posterior."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng
    ).toarray()
    # Make the first column fully observed.
    X[:, 0] = 1

    sampling_imputer = IterativeImputer(
        missing_values=0,
        max_iter=2,
        n_nearest_features=5,
        sample_posterior=True,
        min_value=0.1,
        max_value=0.2,
        verbose=1,
        imputation_order='random',
        random_state=rng
    )
    Xt = sampling_imputer.fit_transform(X)

    # Zeros mark the missing entries; everything else was observed.
    missing = X == 0
    observed = X != 0

    assert_allclose(np.min(Xt[missing]), 0.1)
    assert_allclose(np.max(Xt[missing]), 0.2)
    assert_allclose(Xt[observed], X[observed])
Beispiel #43
0
def test_iterative_imputer_clip_truncnorm():
    """Check ``min_value``/``max_value`` with ``sample_posterior=True``."""
    rng = np.random.RandomState(0)
    shape = (100, 10)
    X = sparse_random_matrix(shape[0], shape[1], density=0.10,
                             random_state=rng).toarray()
    # Column 0 contains no zeros, i.e. no missing values.
    X[:, 0] = 1

    settings = dict(missing_values=0,
                    max_iter=2,
                    n_nearest_features=5,
                    sample_posterior=True,
                    min_value=0.1,
                    max_value=0.2,
                    verbose=1,
                    imputation_order='random',
                    random_state=rng)
    Xt = IterativeImputer(**settings).fit_transform(X)

    imputed = Xt[X == 0]
    # The imputed values must reach, and stay within, both bounds.
    assert_allclose(np.min(imputed), 0.1)
    assert_allclose(np.max(imputed), 0.2)
    # Entries that were not missing are returned untouched.
    assert_allclose(Xt[X != 0], X[X != 0])
Beispiel #44
0
def test_imputation_copy():
    """Check when ``SimpleImputer.transform`` copies its input.

    After each transform the first stored value of the output is overwritten
    with -1.  The input then differs from the output only if the two do not
    share the same buffer.
    """
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # Dense input with copy=True: the output is a separate array.
    X = X_orig.copy().toarray()
    dense_copying = SimpleImputer(missing_values=0, strategy="mean",
                                  copy=True)
    dense_copying.fit(X)
    Xt = dense_copying.transform(X)
    Xt[0, 0] = -1
    assert not np.all(X == Xt)

    # Sparse CSR input with copy=True: the output is a separate matrix.
    X = X_orig.copy()
    csr_copying = SimpleImputer(missing_values=X.data[0], strategy="mean",
                                copy=True)
    csr_copying.fit(X)
    Xt = csr_copying.transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)

    # Dense input with copy=False: the output aliases the input.
    X = X_orig.copy().toarray()
    dense_inplace = SimpleImputer(missing_values=0, strategy="mean",
                                  copy=False)
    dense_inplace.fit(X)
    Xt = dense_inplace.transform(X)
    Xt[0, 0] = -1
    assert_array_almost_equal(X, Xt)

    # Sparse CSC input with copy=False: the output aliases the input.
    X = X_orig.copy().tocsc()
    csc_inplace = SimpleImputer(missing_values=X.data[0], strategy="mean",
                                copy=False)
    csc_inplace.fit(X)
    Xt = csc_inplace.transform(X)
    Xt.data[0] = -1
    assert_array_almost_equal(X.data, Xt.data)

    # Sparse CSR input with copy=False: a copy is still made.
    X = X_orig.copy()
    csr_inplace = SimpleImputer(missing_values=X.data[0], strategy="mean",
                                copy=False)
    csr_inplace.fit(X)
    Xt = csr_inplace.transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)
def test_mice_predictors(predictor):
    """Fit ``MICEImputer`` with ``predictor`` and inspect the fitted steps.

    Every entry of ``imputation_sequence_`` must hold a truthy predictor and
    no predictor object may be shared between entries.
    """
    rng = np.random.RandomState(0)

    n_samples, n_features = 100, 10
    X = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng
    ).toarray()

    mice = MICEImputer(
        missing_values=0,
        n_imputations=1,
        n_burn_in=1,
        predictor=predictor,
        random_state=rng
    )
    mice.fit_transform(X)

    # Record the identity of the predictor stored at each step.
    predictor_ids = []
    for step in mice.imputation_sequence_:
        assert step.predictor
        predictor_ids.append(id(step.predictor))

    # All ids distinct means every step owns its own predictor instance.
    assert len(predictor_ids) == len(set(predictor_ids))
def test_mice_predictors(predictor):
    """Check the predictors kept in ``MICEImputer.imputation_sequence_``."""
    rng = np.random.RandomState(0)

    shape = (100, 10)
    X = sparse_random_matrix(shape[0], shape[1], density=0.10,
                             random_state=rng).toarray()

    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          predictor=predictor,
                          random_state=rng)
    imputer.fit_transform(X)

    fitted = [step.predictor for step in imputer.imputation_sequence_]

    # Each step must carry a (truthy) predictor ...
    assert all(fitted)

    # ... and each of those predictors must be a distinct object.
    distinct_ids = {id(model) for model in fitted}
    assert len(distinct_ids) == len(fitted)
Beispiel #47
0
def test_iterative_imputer_estimators(estimator):
    """Check type and uniqueness of the estimators fitted per step.

    When ``estimator`` is None the fitted estimators are expected to be
    ``BayesianRidge`` instances.
    """
    rng = np.random.RandomState(0)

    n_samples, n_features = 100, 10
    X = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng
    ).toarray()

    iterative = IterativeImputer(
        missing_values=0, max_iter=1, estimator=estimator, random_state=rng
    )
    iterative.fit_transform(X)

    # The expected type is the same for every step, so compute it once.
    if estimator is None:
        expected_type = type(BayesianRidge())
    else:
        expected_type = type(estimator)

    estimator_ids = []
    for step in iterative.imputation_sequence_:
        assert isinstance(step.estimator, expected_type)
        estimator_ids.append(id(step.estimator))

    # Distinct ids mean no estimator object is reused between steps.
    assert len(estimator_ids) == len(set(estimator_ids))
Beispiel #48
0
    def test_calc_pca_time(self):
        """Check that ``calc_sparse_pca`` is faster on average than ``calc_pca``.

        Both functions are called ``num_iter`` times on the same densified
        sparse random matrix and their mean elapsed time per call is
        compared.
        """
        n_samples = 10000
        n_features = 1000
        n_components = 20
        num_iter = 10
        data = sparse_random_matrix(n_samples, n_features, density=0.01, random_state=42)
        data = data.toarray()

        def mean_elapsed(func):
            # perf_counter() is a monotonic high-resolution clock, so the
            # measurement is not disturbed by system clock adjustments the
            # way time.time() can be.  The return value of ``func`` is not
            # needed and is discarded.
            start_time = time.perf_counter()
            for _ in range(num_iter):
                func(data, n_components=n_components)
            return (time.perf_counter() - start_time) / num_iter

        sparse_pca_elapsed_time = mean_elapsed(calc_sparse_pca)
        print("Elapsed time for calc_sparse_pca is %f" % (sparse_pca_elapsed_time))

        pca_elapsed_time = mean_elapsed(calc_pca)
        print("Elapsed time for calc_pca is %f" % (pca_elapsed_time))

        self.assertGreater(pca_elapsed_time, sparse_pca_elapsed_time)
Beispiel #49
0
def test_iterative_imputer_estimators(estimator):
    """Verify the per-step estimators stored by ``IterativeImputer``."""
    rng = np.random.RandomState(0)

    shape = (100, 10)
    X = sparse_random_matrix(shape[0], shape[1], density=0.10,
                             random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               estimator=estimator,
                               random_state=rng)
    imputer.fit_transform(X)

    # With no estimator given, a BayesianRidge is the expected default.
    reference = BayesianRidge() if estimator is None else estimator
    expected_type = type(reference)

    fitted = [step.estimator for step in imputer.imputation_sequence_]

    # check that types are correct for estimators
    for model in fitted:
        assert isinstance(model, expected_type)

    # check that each estimator is unique
    assert len({id(model) for model in fitted}) == len(fitted)
Beispiel #50
0
def test_imputation_pickle():
    """Imputers must give the same transform after a pickle round trip.

    Covers the three ``Imputer`` strategies plus ``MICEImputer``.
    """
    import pickle

    size = 100
    X = sparse_random_matrix(size, size, density=0.10).todense()

    for strategy in ["mean", "median", "most_frequent", "mice"]:
        # "mice" is not an Imputer strategy; it selects MICEImputer instead.
        if strategy != 'mice':
            fitted = Imputer(missing_values=0, strategy=strategy)
        else:
            fitted = MICEImputer(missing_values=0,
                                 n_imputations=1,
                                 n_burn_in=1)
        fitted.fit(X)

        restored = pickle.loads(pickle.dumps(fitted))

        before = fitted.transform(X.copy())
        after = restored.transform(X.copy())
        assert_array_almost_equal(
            before,
            after,
            err_msg="Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy))
Beispiel #51
0
def test_imputation_copy():
    """Check that the imputer output does not share memory with its input.

    Runs once with default parameters and once with an explicit
    ``copy=True``.
    """
    size = 5

    for extra_kwargs in [{}, {'copy': True}]:
        X = sparse_random_matrix(size, size, density=0.75, random_state=0)

        # Sparse input
        sparse_imputer = Imputer(missing_values=0, strategy="mean",
                                 **extra_kwargs)
        Xt = sparse_imputer.fit(X).transform(X)
        Xt[0, 0] = np.nan
        # The NaN written to the output must not appear in the input,
        # i.e. the two objects do not use the same buffer.
        assert_false(np.all(X.todense() == Xt))

        # Dense input
        dense_imputer = Imputer(missing_values=0, strategy="mean",
                                **extra_kwargs)
        X = X.todense()
        Xt = dense_imputer.fit(X).transform(X)
        Xt[0, 0] = np.nan
        # Same check: input and output must be different buffers.
        assert_false(np.all(X == Xt))
Beispiel #52
0
def test_iterative_imputer_zero_iters():
    """With zero iterations only the initial imputation should be applied."""
    rng = np.random.RandomState(0)

    n_samples, n_features = 100, 10
    X = sparse_random_matrix(
        n_samples, n_features, density=0.10, random_state=rng
    ).toarray()
    # Turn every zero into NaN so that it counts as missing.
    X[X == 0] = np.nan

    # max_iter=0: the result must equal the initial imputation.
    no_iter_imputer = IterativeImputer(max_iter=0)
    X_imputed = no_iter_imputer.fit_transform(X)
    assert_allclose(X_imputed, no_iter_imputer.initial_imputer_.transform(X))

    # Fit with real iterations: the result must differ from the initial
    # imputation.
    imputer = IterativeImputer(max_iter=5).fit(X)
    iterated = imputer.transform(X)
    initial_only = imputer.initial_imputer_.transform(X)
    assert not np.all(iterated == initial_only)

    # Forcing n_iter_ to 0 afterwards must fall back to the initial
    # imputation only.
    imputer.n_iter_ = 0
    assert_allclose(imputer.transform(X),
                    imputer.initial_imputer_.transform(X))
Beispiel #53
0
# Reorder the boston samples and targets with the index array ``perm``
# (presumably a permutation computed above this excerpt -- confirm).
boston.data = boston.data[perm]
boston.target = boston.target[perm]

# Load the digits dataset and shuffle data and target with one shared
# permutation drawn from ``rng``.
digits = datasets.load_digits()
perm = rng.permutation(digits.target.size)
digits.data = digits.data[perm]
digits.target = digits.target[perm]

# Synthetic multilabel problem: 30 samples, 10 features, fixed seed.
random_state = check_random_state(0)
X_multilabel, y_multilabel = datasets.make_multilabel_classification(
    random_state=0, n_samples=30, n_features=10)

# Mostly-zero non-negative data: uniform draws with every value <= 0.8
# replaced by 0, plus random integer targets in [0, 4).
X_sparse_pos = random_state.uniform(size=(20, 5))
X_sparse_pos[X_sparse_pos <= 0.8] = 0.
y_random = random_state.randint(0, 4, size=(20, ))
# Sparse random matrix generated with density 0.25 and a fixed seed.
X_sparse_mix = sparse_random_matrix(20, 10, density=0.25, random_state=0)


# Named (X, y) fixtures.  iris, X, y, X_small, y_small and y_small_reg are
# defined outside this excerpt.
DATASETS = {
    "iris": {"X": iris.data, "y": iris.target},
    "boston": {"X": boston.data, "y": boston.target},
    "digits": {"X": digits.data, "y": digits.target},
    "toy": {"X": X, "y": y},
    "clf_small": {"X": X_small, "y": y_small},
    "reg_small": {"X": X_small, "y": y_small_reg},
    "multilabel": {"X": X_multilabel, "y": y_multilabel},
    "sparse-pos": {"X": X_sparse_pos, "y": y_random},
    "sparse-neg": {"X": - X_sparse_pos, "y": y_random},
    "sparse-mix": {"X": X_sparse_mix, "y": y_random},
    "zeros": {"X": np.zeros((20, 3)), "y": y_random}
}
Beispiel #54
0
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
# Build a 100 x 100 sparse random matrix with a fixed seed and display it.
X = sparse_random_matrix(100, 100, density=0.01, random_state=42)
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare ``print X`` statement is a SyntaxError on Python 3.
print(X)
Beispiel #55
0
'''
Created on Apr 25, 2013

@author: Nguyen Huu Hiep
'''

import numpy as np
import sklearn.random_projection as rp

# Small dense example: show the matrix and the norm of each of its columns.
# Every print uses the single-argument call form, which behaves the same on
# Python 2 and 3 (the bare print statement is a SyntaxError on Python 3).
a = np.matrix([[1.0, 2.0], [3.0, 4.0]])
print(a)
print(np.linalg.norm(a[:, 0]))
print(np.linalg.norm(a[:, 1]))


# Random projection matrix of shape (n_components, n_features).
n_components = 2000
n_features = 8192
phi = rp.sparse_random_matrix(n_components, n_features)

print(type(phi))

# Absolute value of the product phi^T * phi.
# NOTE(review): phi is presumably a scipy sparse matrix; wrapping it in
# np.matrix may not densify it (the printed types below are the way to
# check) -- confirm this is the intended computation.
prod = np.abs(np.dot(np.matrix(phi.T), np.matrix(phi)))

print(type(prod))
print(type(prod.max()))

#print "max prod_ij =", prod.max().max()