def test_safe_indexing_1d_container_mask(array_type, indices_type):
    # Index a 1D container along axis 0 with a boolean mask in which only
    # positions 1 and 2 are True; the values 2 and 3 must be selected.
    values = list(range(1, 10))
    mask = [position in (1, 2) for position in range(len(values))]

    container = _convert_container(values, array_type)
    mask = _convert_container(mask, indices_type)

    selected = _safe_indexing(container, mask, axis=0)
    expected = _convert_container([2, 3], array_type)
    assert_allclose_dense_sparse(selected, expected)
def test_check_array_force_all_finite_valid(value, force_all_finite, retype):
    # Inject `value` into a 2x2 float matrix (wrapped by `retype`) and check
    # that check_array, called with the given `force_all_finite` setting and
    # accept_sparse=True, returns data equal to its input.
    #
    # np.float64 is spelled out explicitly: the `np.float` alias was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    X = retype(np.arange(4).reshape(2, 2).astype(np.float64))
    X[0, 0] = value
    X_checked = check_array(X, force_all_finite=force_all_finite,
                            accept_sparse=True)
    assert_allclose_dense_sparse(X, X_checked)
def test_safe_indexing_1d_container(array_type, indices_type):
    # Select positions 1 and 2 of a 1D container along axis 0 and expect the
    # values 2 and 3 back.
    bounds = [1, 2]
    bump_last = indices_type == 'slice' and isinstance(bounds[1], int)
    if bump_last:
        # presumably needed because a slice stop is exclusive -- confirm
        # against _convert_container
        bounds[1] = bounds[1] + 1

    container = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    selector = _convert_container(bounds, indices_type)

    selected = _safe_indexing(container, selector, axis=0)
    expected = _convert_container([2, 3], array_type)
    assert_allclose_dense_sparse(selected, expected)
def test_tfidf_transformer_sparse():
    # Fitting TfidfTransformer on the CSC and on the CSR version of the same
    # random sparse matrix must give equal values in the same sparse format.
    X = sparse.rand(10, 20000, dtype=np.float64, random_state=42)
    as_csc = sparse.csc_matrix(X)
    as_csr = sparse.csr_matrix(X)

    from_csc, from_csr = (
        TfidfTransformer().fit_transform(matrix)
        for matrix in (as_csc, as_csr)
    )

    assert_allclose_dense_sparse(from_csc, from_csr)
    assert from_csc.format == from_csr.format
def test_safe_indexing_2d_mask(array_type, indices_type, axis,
                               expected_subset):
    # Apply the boolean mask [False, True, True] to a 3x3 container along
    # `axis` and compare the result with `expected_subset`.
    columns_name = ['col_0', 'col_1', 'col_2']
    data = [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
    container = _convert_container(data, array_type, columns_name)
    mask = _convert_container([False, True, True], indices_type)

    selected = _safe_indexing(container, mask, axis=axis)
    expected = _convert_container(expected_subset, array_type)
    assert_allclose_dense_sparse(selected, expected)
def test_safe_indexing_2d_read_only_axis_1(array_read_only, indices_read_only,
                                           array_type, indices_type, axis,
                                           expected_array):
    # _safe_indexing must give `expected_array` whether or not the data
    # and/or the indices are backed by read-only NumPy arrays.
    data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    if array_read_only:
        data.flags.writeable = False
    container = _convert_container(data, array_type)

    positions = np.array([1, 2])
    if indices_read_only:
        positions.flags.writeable = False
    selector = _convert_container(positions, indices_type)

    selected = _safe_indexing(container, selector, axis=axis)
    expected = _convert_container(expected_array, array_type)
    assert_allclose_dense_sparse(selected, expected)
def test_safe_sparse_dot_dense_output(dense_output):
    # The product of two sparse matrices must be sparse unless
    # `dense_output` is set, and must match the plain `A.dot(B)` result.
    rng = np.random.RandomState(0)
    # Both operands draw from the same generator, in this order.
    left = sparse.random(30, 10, density=0.1, random_state=rng)
    right = sparse.random(10, 20, density=0.1, random_state=rng)

    reference = left.dot(right)
    product = safe_sparse_dot(left, right, dense_output=dense_output)

    assert sparse.issparse(product) == (not dense_output)
    # Densify the reference only when a dense result was requested.
    reference = reference.toarray() if dense_output else reference
    assert_allclose_dense_sparse(product, reference)
def test_20news_normalization():
    # The vectorized 20 newsgroups data fetched with normalize=True must
    # equal normalize() applied to the normalize=False data, and its rows
    # must have unit norm. Skipped when the dataset is not available locally.
    try:
        raw_bunch = datasets.fetch_20newsgroups_vectorized(
            normalize=False, download_if_missing=False)
        normalized_bunch = datasets.fetch_20newsgroups_vectorized(
            normalize=True, download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Only the first 100 samples are compared.
    X_norm = normalized_bunch['data'][:100]
    X_raw = raw_bunch['data'][:100]

    assert_allclose_dense_sparse(X_norm, normalize(X_raw))
    row_norms = np.linalg.norm(X_norm.todense(), axis=1)
    assert np.allclose(row_norms, 1)
def test_incremental_pca_batch_rank():
    # Test sample size in each batch is always larger or equal to n_components
    rng = np.random.RandomState(1999)
    n_samples, n_features = 100, 20
    X = rng.randn(n_samples, n_features)

    # One fitted components_ matrix per batch size.
    components_per_batch_size = [
        IncrementalPCA(n_components=20, batch_size=batch_size)
        .fit(X).components_
        for batch_size in np.arange(20, 90, 3)
    ]

    # Components obtained with consecutive batch sizes must agree.
    consecutive = zip(components_per_batch_size,
                      components_per_batch_size[1:])
    for previous, current in consecutive:
        assert_allclose_dense_sparse(previous, current)
def test_stacking_classifier_sparse_passthrough(fmt):
    # Check passthrough behavior on a sparse X matrix
    X_sparse = sparse.coo_matrix(scale(X_iris)).asformat(fmt)
    X_train, X_test, y_train, _ = train_test_split(
        X_sparse, y_iris, random_state=42)

    base_estimators = [
        ('lr', LogisticRegression()),
        ('svc', LinearSVC()),
    ]
    final_estimator = RandomForestClassifier(n_estimators=10, random_state=42)
    clf = StackingClassifier(
        estimators=base_estimators,
        final_estimator=final_estimator,
        cv=5,
        passthrough=True,
    )
    clf.fit(X_train, y_train)
    X_trans = clf.transform(X_test)

    # The last four columns of the output must be the untouched test data,
    # and the output must stay sparse in the input's format.
    assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
    assert sparse.issparse(X_trans)
    assert X_test.format == X_trans.format
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type,
                                        indices):
    # Select a single column (axis=1) of a 3x3 container with a scalar
    # index. A string index on anything but a dataframe must raise.
    columns_name = ['col_0', 'col_1', 'col_2']
    data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    container = _convert_container(data, array_type, columns_name)

    string_on_non_dataframe = (isinstance(indices, str)
                               and array_type != 'dataframe')
    if string_on_non_dataframe:
        err_msg = ("Specifying the columns using strings is only supported "
                   "for pandas DataFrames")
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(container, indices, axis=1)
        return

    selected = _safe_indexing(container, indices, axis=1)
    if expected_output_type == 'sparse':
        # sparse matrix are keeping the 2D shape
        expected_values = [[3], [6], [9]]
    else:
        expected_values = [3, 6, 9]
    expected = _convert_container(expected_values, expected_output_type)
    assert_allclose_dense_sparse(selected, expected)
def test_check_dataframe_mixed_float_dtypes():
    # A pandas dataframe mixing int, float and bool columns would coerce the
    # boolean into an object; this is a mismatch with np.result_type, which
    # will return a float. check_array needs to explicitly check for bool
    # dtype in a dataframe for this situation.
    # https://github.com/scikit-learn/scikit-learn/issues/15787
    pd = importorskip("pandas")
    df = pd.DataFrame(
        {
            'int': [1, 2, 3],
            'float': [0, 0.1, 2.1],
            'bool': [True, False, True],
        },
        columns=['int', 'float', 'bool'])

    array = check_array(df, dtype=(np.float64, np.float32, np.float16))

    # np.float64 is spelled out explicitly: the `np.float` alias was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    expected_array = np.array(
        [[1.0, 0.0, 1.0],
         [2.0, 0.1, 0.0],
         [3.0, 2.1, 1.0]],
        dtype=np.float64)
    assert_allclose_dense_sparse(array, expected_array)
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
    # Select the last two columns (axis=1) of a 3x3 container with a
    # container of indices. String indices on anything but a dataframe must
    # raise.
    #
    # validation of the indices
    # we make a copy because indices is mutable and shared between tests
    selector = copy(indices)
    if indices_type == 'slice' and isinstance(indices[1], int):
        selector[1] = selector[1] + 1

    columns_name = ['col_0', 'col_1', 'col_2']
    data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    container = _convert_container(data, array_type, columns_name)
    selector = _convert_container(selector, indices_type)

    string_on_non_dataframe = (isinstance(indices[0], str)
                               and array_type != 'dataframe')
    if string_on_non_dataframe:
        err_msg = ("Specifying the columns using strings is only supported "
                   "for pandas DataFrames")
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(container, selector, axis=1)
        return

    selected = _safe_indexing(container, selector, axis=1)
    expected = _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
    assert_allclose_dense_sparse(selected, expected)
def test_check_inverse():
    # With check_inverse=True, fitting must warn when func and inverse_func
    # do not round-trip and must stay silent when they do. This is checked
    # on dense, CSR and CSC input.
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    not_inverse_msg = ("The provided functions are not strictly"
                       " inverse of each other. If you are sure you"
                       " want to proceed regardless, set"
                       " 'check_inverse=False'.")

    inputs = (X_dense,
              sparse.csr_matrix(X_dense),
              sparse.csc_matrix(X_dense))
    for X in inputs:
        shared_params = dict(accept_sparse=sparse.issparse(X),
                             check_inverse=True,
                             validate=True)

        # np.around does not undo np.sqrt: fit is expected to warn.
        mismatched = FunctionTransformer(func=np.sqrt,
                                         inverse_func=np.around,
                                         **shared_params)
        assert_warns_message(UserWarning, not_inverse_msg,
                             mismatched.fit, X)

        # expm1 / log1p round-trip: no warning, and the input is recovered.
        matched = FunctionTransformer(func=np.expm1,
                                      inverse_func=np.log1p,
                                      **shared_params)
        Xt = assert_no_warnings(matched.fit_transform, X)
        assert_allclose_dense_sparse(X, matched.inverse_transform(Xt))

    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    for func, inverse_func in ((np.expm1, None), (None, np.expm1)):
        trans = FunctionTransformer(func=func, inverse_func=inverse_func,
                                    check_inverse=True, validate=True)
        assert_no_warnings(trans.fit, X_dense)
def test_imputers_add_indicator_sparse(imputer, marker):
    # With add_indicator=True on sparse input, the last four output columns
    # must be the missing-value indicator and the remaining columns must
    # equal the output obtained with add_indicator=False.
    m = marker
    X = sparse.csr_matrix([
        [m, 1, 5, m, 1],
        [2, m, 1, m, 2],
        [6, 3, m, m, 3],
        [1, 2, 9, m, 4],
    ])
    X_true_indicator = sparse.csr_matrix([
        [1., 0., 0., 1.],
        [0., 1., 0., 1.],
        [0., 0., 1., 1.],
        [0., 0., 0., 1.],
    ])
    n_indicator_columns = 4

    imputer.set_params(missing_values=marker, add_indicator=True)
    with_indicator = imputer.fit_transform(X)
    assert_allclose_dense_sparse(
        with_indicator[:, -n_indicator_columns:], X_true_indicator)
    assert_array_equal(imputer.indicator_.features_,
                       np.arange(n_indicator_columns))

    imputer.set_params(add_indicator=False)
    without_indicator = imputer.fit_transform(X)
    assert_allclose_dense_sparse(
        with_indicator[:, :-n_indicator_columns], without_indicator)
def test_check_fit_params(indices):
    # _check_fit_params must hand some parameters back as the very same
    # object and return the others indexed like _safe_indexing would.
    X = np.random.randn(4, 2)
    fit_params = {
        'list': [1, 2, 3, 4],
        'array': np.array([1, 2, 3, 4]),
        'sparse-col': sp.csc_matrix([1, 2, 3, 4]).T,
        'sparse-row': sp.csc_matrix([1, 2, 3, 4]),
        'scalar-int': 1,
        'scalar-str': 'xxx',
        'None': None,
    }
    result = _check_fit_params(X, fit_params, indices)

    # Without explicit indices, every row of X is selected.
    if indices is None:
        indices_ = list(range(X.shape[0]))
    else:
        indices_ = indices

    # These entries must come back as the identical object.
    untouched_keys = ('sparse-row', 'scalar-int', 'scalar-str', 'None')
    for key in untouched_keys:
        assert result[key] is fit_params[key]

    # These entries must match what _safe_indexing returns for them.
    expected_list = _safe_indexing(fit_params['list'], indices_)
    assert result['list'] == expected_list

    expected_array = _safe_indexing(fit_params['array'], indices_)
    assert_array_equal(result['array'], expected_array)

    expected_sparse_col = _safe_indexing(fit_params['sparse-col'], indices_)
    assert_allclose_dense_sparse(result['sparse-col'], expected_sparse_col)
def test_imputation_constant_float(array_constructor):
    # Test imputation using the constant strategy on floats
    nan = np.nan
    fill = -1

    X = array_constructor(np.array([
        [nan, 1.1, 0, nan],
        [1.2, nan, 1.3, nan],
        [0, 0, nan, nan],
        [1.4, 1.5, 0, nan],
    ]))
    # Same matrix with every NaN replaced by the fill value.
    X_true = array_constructor(np.array([
        [fill, 1.1, 0, fill],
        [1.2, fill, 1.3, fill],
        [0, 0, fill, fill],
        [1.4, 1.5, 0, fill],
    ]))

    imputer = SimpleImputer(strategy="constant", fill_value=-1)
    assert_allclose_dense_sparse(imputer.fit_transform(X), X_true)
def test_column_transformer_sparse_array():
    # ColumnTransformer on sparse input: the fit_transform output must be
    # sparse, and both fit_transform and fit().transform() must match the
    # expected columns.
    X_sparse = sparse.eye(3, 2).tocsr()

    def check_outputs(ct, expected):
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), expected)
        assert_allclose_dense_sparse(
            ct.fit(X_sparse).transform(X_sparse), expected)

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse
    expected_by_remainder = [('drop', X_res_first),
                             ('passthrough', X_res_both)]

    # First column only, selected in three equivalent ways.
    for col in [0, [0], slice(0, 1)]:
        for remainder, expected in expected_by_remainder:
            check_outputs(
                ColumnTransformer([('trans', Trans(), col)],
                                  remainder=remainder,
                                  sparse_threshold=0.8),
                expected)

    # Both columns selected explicitly.
    for col in [[0, 1], slice(0, 2)]:
        check_outputs(
            ColumnTransformer([('trans', Trans(), col)],
                              sparse_threshold=0.8),
            X_res_both)
def test_as_float_array_nan(X):
    # as_float_array with force_all_finite='allow-nan' must return data
    # equal to an input that contains NaN entries.
    # X is modified in place: NaN is written at (5, 0) and (6, 1).
    for row, col in ((5, 0), (6, 1)):
        X[row, col] = np.nan
    converted = as_float_array(X, force_all_finite='allow-nan')
    assert_allclose_dense_sparse(converted, X)
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
    # Selecting row 2 (axis=0) of a 3x3 container with a scalar index must
    # give the values 7, 8, 9 in the expected output container type.
    data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    container = _convert_container(data, array_type)
    row = 2
    selected = _safe_indexing(container, row, axis=0)
    expected = _convert_container([7, 8, 9], expected_output_type)
    assert_allclose_dense_sparse(selected, expected)
def test_safe_indexing_None_axis_0(array_type):
    # Indexing with None along axis 0 must give back data equal to the
    # input container.
    data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    X = _convert_container(data, array_type)
    assert_allclose_dense_sparse(_safe_indexing(X, None, axis=0), X)