Beispiel #1
0
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
    # validation of the indices
    # we make a copy because indices is mutable and shared between tests
    indices_converted = copy(indices)
    if indices_type == "slice" and isinstance(indices[1], int):
        indices_converted[1] += 1

    columns_name = ["col_0", "col_1", "col_2"]
    array = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
    )
    indices_converted = _convert_container(indices_converted, indices_type)

    if isinstance(indices[0], str) and array_type != "dataframe":
        err_msg = (
            "Specifying the columns using strings is only supported "
            "for pandas DataFrames"
        )
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, indices_converted, axis=1)
    else:
        subset = _safe_indexing(array, indices_converted, axis=1)
        assert_allclose_dense_sparse(
            subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
        )
Beispiel #2
0
def test_safe_indexing_1d_container_mask(array_type, indices_type):
    indices = [False] + [True] * 2 + [False] * 6
    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(subset, _convert_container([2, 3],
                                                            array_type))
Beispiel #3
0
def test_isomap_simple_grid(n_neighbors, radius):
    # Isomap should preserve distances when all neighbors are used
    n_pts = 25
    X = create_sample_data(n_pts=n_pts, add_noise=False)

    # distances from each point to all others
    if n_neighbors is not None:
        G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance")
    else:
        G = neighbors.radius_neighbors_graph(X, radius, mode="distance")

    for eigen_solver in eigen_solvers:
        for path_method in path_methods:
            clf = manifold.Isomap(
                n_neighbors=n_neighbors,
                radius=radius,
                n_components=2,
                eigen_solver=eigen_solver,
                path_method=path_method,
            )
            clf.fit(X)

            if n_neighbors is not None:
                G_iso = neighbors.kneighbors_graph(clf.embedding_,
                                                   n_neighbors,
                                                   mode="distance")
            else:
                G_iso = neighbors.radius_neighbors_graph(clf.embedding_,
                                                         radius,
                                                         mode="distance")
            assert_allclose_dense_sparse(G, G_iso)
def test_check_array_force_all_finite_valid(value, force_all_finite, retype):
    X = retype(np.arange(4).reshape(2, 2).astype(np.float))
    X[0, 0] = value
    X_checked = check_array(X,
                            force_all_finite=force_all_finite,
                            accept_sparse=True)
    assert_allclose_dense_sparse(X, X_checked)
def test_check_fit_params(indices):
    X = np.random.randn(4, 2)
    fit_params = {
        'list': [1, 2, 3, 4],
        'array': np.array([1, 2, 3, 4]),
        'sparse-col': sp.csc_matrix([1, 2, 3, 4]).T,
        'sparse-row': sp.csc_matrix([1, 2, 3, 4]),
        'scalar-int': 1,
        'scalar-str': 'xxx',
        'None': None,
    }
    result = _check_fit_params(X, fit_params, indices)
    indices_ = indices if indices is not None else list(range(X.shape[0]))

    for key in ['sparse-row', 'scalar-int', 'scalar-str', 'None']:
        assert result[key] is fit_params[key]

    assert result['list'] == _safe_indexing(fit_params['list'], indices_)
    assert_array_equal(
        result['array'], _safe_indexing(fit_params['array'], indices_)
    )
    assert_allclose_dense_sparse(
        result['sparse-col'],
        _safe_indexing(fit_params['sparse-col'], indices_)
    )
Beispiel #6
0
def check_pipeline_consistency(name, estimator_orig):
    if estimator_orig._get_tags()['non_deterministic']:
        msg = name + ' is non deterministic'
        raise SkipTest(msg)

    # check that make_pipeline(est) gives same score as est
    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      n_features=2,
                      cluster_std=0.1)
    X -= X.min()
    X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)

    funcs = ["score", "fit_transform"]

    for func_name in funcs:
        func = getattr(estimator, func_name, None)
        if func is not None:
            func_pipeline = getattr(pipeline, func_name)
            result = func(X, y)
            result_pipe = func_pipeline(X, y)
            assert_allclose_dense_sparse(result, result_pipe)
Beispiel #7
0
def test_imputers_add_indicator_sparse(imputer, marker):
    X = sparse.csr_matrix(
        [
            [marker, 1, 5, marker, 1],
            [2, marker, 1, marker, 2],
            [6, 3, marker, marker, 3],
            [1, 2, 9, marker, 4],
        ]
    )
    X_true_indicator = sparse.csr_matrix(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0],
        ]
    )
    imputer.set_params(missing_values=marker, add_indicator=True)

    X_trans = imputer.fit_transform(X)
    assert_allclose_dense_sparse(X_trans[:, -4:], X_true_indicator)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))

    imputer.set_params(add_indicator=False)
    X_trans_no_indicator = imputer.fit_transform(X)
    assert_allclose_dense_sparse(X_trans[:, :-4], X_trans_no_indicator)
Beispiel #8
0
def test_20news_normalization(fetch_20newsgroups_vectorized_fxt):
    X = fetch_20newsgroups_vectorized_fxt(normalize=False)
    X_ = fetch_20newsgroups_vectorized_fxt(normalize=True)
    X_norm = X_['data'][:100]
    X = X['data'][:100]

    assert_allclose_dense_sparse(X_norm, normalize(X))
    assert np.allclose(np.linalg.norm(X_norm.todense(), axis=1), 1)
Beispiel #9
0
def test_safe_indexing_1d_container(array_type, indices_type):
    indices = [1, 2]
    if indices_type == "slice" and isinstance(indices[1], int):
        indices[1] += 1
    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
def test_function_sampler_func(X, y):
    def func(X, y):
        return X[:10], y[:10]

    sampler = FunctionSampler(func=func)
    X_res, y_res = sampler.fit_resample(X, y)
    assert_allclose_dense_sparse(X_res, X[:10])
    assert_array_equal(y_res, y[:10])
Beispiel #11
0
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
    indices = [1, 2]
    if indices_type == 'slice' and isinstance(indices[1], int):
        indices[1] += 1
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(
        subset, _convert_container([[4, 5, 6], [7, 8, 9]], array_type))
Beispiel #12
0
def test_tfidf_transformer_sparse():
    X = sparse.rand(10, 20000, dtype=np.float64, random_state=42)
    X_csc = sparse.csc_matrix(X)
    X_csr = sparse.csr_matrix(X)

    X_trans_csc = TfidfTransformer().fit_transform(X_csc)
    X_trans_csr = TfidfTransformer().fit_transform(X_csr)
    assert_allclose_dense_sparse(X_trans_csc, X_trans_csr)
    assert X_trans_csc.format == X_trans_csr.format
Beispiel #13
0
def test_safe_indexing_2d_mask(array_type, indices_type, axis,
                               expected_subset):
    columns_name = ['col_0', 'col_1', 'col_2']
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type,
                               columns_name)
    indices = [False, True, True]
    indices = _convert_container(indices, indices_type)

    subset = _safe_indexing(array, indices, axis=axis)
    assert_allclose_dense_sparse(
        subset, _convert_container(expected_subset, array_type))
Beispiel #14
0
def test_check_inverse():
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))

    X_list = [X_dense, sparse.csr_matrix(X_dense), sparse.csc_matrix(X_dense)]

    for X in X_list:
        if sparse.issparse(X):
            accept_sparse = True
        else:
            accept_sparse = False
        trans = FunctionTransformer(
            func=np.sqrt,
            inverse_func=np.around,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        warning_message = (
            "The provided functions are not strictly"
            " inverse of each other. If you are sure you"
            " want to proceed regardless, set"
            " 'check_inverse=False'."
        )
        with pytest.warns(UserWarning, match=warning_message):
            trans.fit(X)

        trans = FunctionTransformer(
            func=np.expm1,
            inverse_func=np.log1p,
            accept_sparse=accept_sparse,
            check_inverse=True,
            validate=True,
        )
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            Xt = trans.fit_transform(X)

        assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))

    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    trans = FunctionTransformer(
        func=np.expm1, inverse_func=None, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        trans.fit(X_dense)
    trans = FunctionTransformer(
        func=None, inverse_func=np.expm1, check_inverse=True, validate=True
    )
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        trans.fit(X_dense)
Beispiel #15
0
def test_safe_indexing_2d_read_only_axis_1(
    array_read_only, indices_read_only, array_type, indices_type, axis, expected_array
):
    array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    if array_read_only:
        array.setflags(write=False)
    array = _convert_container(array, array_type)
    indices = np.array([1, 2])
    if indices_read_only:
        indices.setflags(write=False)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=axis)
    assert_allclose_dense_sparse(subset, _convert_container(expected_array, array_type))
Beispiel #16
0
def test_column_transformer_sparse_array():
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse

    for col in [0, [0], slice(0, 1)]:
        for remainder, res in [('drop', X_res_first),
                               ('passthrough', X_res_both)]:
            ct = ColumnTransformer([('trans', Trans(), col)],
                                   remainder=remainder,
                                   sparse_threshold=0.8)
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res)
            assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                         res)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)],
                               sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                     X_res_both)
Beispiel #17
0
def test_20news_normalization():
    try:
        X = datasets.fetch_20newsgroups_vectorized(normalize=False,
                                                   download_if_missing=False)
        X_ = datasets.fetch_20newsgroups_vectorized(normalize=True,
                                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    X_norm = X_['data'][:100]
    X = X['data'][:100]

    assert_allclose_dense_sparse(X_norm, normalize(X))
    assert np.allclose(np.linalg.norm(X_norm.todense(), axis=1), 1)
def test_incremental_pca_batch_rank():
    # Test sample size in each batch is always larger or equal to n_components
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 20
    X = rng.randn(n_samples, n_features)
    all_components = []
    batch_sizes = np.arange(20, 90, 3)
    for batch_size in batch_sizes:
        ipca = IncrementalPCA(n_components=20, batch_size=batch_size).fit(X)
        all_components.append(ipca.components_)

    for components_i, components_j in zip(all_components[:-1], all_components[1:]):
        assert_allclose_dense_sparse(components_i, components_j)
def test_random_projection_numerical_consistency(random_projection_cls):
    # Verify numerical consistency among np.float32 and np.float64
    atol = 1e-5
    rng = np.random.RandomState(42)
    X = rng.rand(25, 3000)
    rp_32 = random_projection_cls(random_state=0)
    rp_64 = random_projection_cls(random_state=0)

    projection_32 = rp_32.fit_transform(X.astype(np.float32))
    projection_64 = rp_64.fit_transform(X.astype(np.float64))

    assert_allclose(projection_64, projection_32, atol=atol)

    assert_allclose_dense_sparse(rp_32.components_, rp_64.components_)
def test_function_sampler_func_kwargs(X, y):
    def func(X, y, sampling_strategy, random_state):
        rus = RandomUnderSampler(
            sampling_strategy=sampling_strategy, random_state=random_state
        )
        return rus.fit_resample(X, y)

    sampler = FunctionSampler(
        func=func, kw_args={"sampling_strategy": "auto", "random_state": 0}
    )
    X_res, y_res = sampler.fit_resample(X, y)
    X_res_2, y_res_2 = RandomUnderSampler(random_state=0).fit_resample(X, y)
    assert_allclose_dense_sparse(X_res, X_res_2)
    assert_array_equal(y_res, y_res_2)
Beispiel #21
0
def test_safe_sparse_dot_dense_output(dense_output):
    rng = np.random.RandomState(0)

    A = sparse.random(30, 10, density=0.1, random_state=rng)
    B = sparse.random(10, 20, density=0.1, random_state=rng)

    expected = A.dot(B)
    actual = safe_sparse_dot(A, B, dense_output=dense_output)

    assert sparse.issparse(actual) == (not dense_output)

    if dense_output:
        expected = expected.toarray()
    assert_allclose_dense_sparse(actual, expected)
Beispiel #22
0
def test_32_equal_64(input_dtype, encode):
    # TODO this check is redundant with common checks and can be removed
    #  once #16290 is merged
    X_input = np.array(X, dtype=input_dtype)

    # 32 bit output
    kbd_32 = KBinsDiscretizer(n_bins=3, encode=encode, dtype=np.float32)
    kbd_32.fit(X_input)
    Xt_32 = kbd_32.transform(X_input)

    # 64 bit output
    kbd_64 = KBinsDiscretizer(n_bins=3, encode=encode, dtype=np.float64)
    kbd_64.fit(X_input)
    Xt_64 = kbd_64.transform(X_input)

    assert_allclose_dense_sparse(Xt_32, Xt_64)
Beispiel #23
0
def test_stacking_classifier_sparse_passthrough(fmt):
    # Check passthrough behavior on a sparse X matrix
    X_train, X_test, y_train, _ = train_test_split(
        sparse.coo_matrix(scale(X_iris)).asformat(fmt),
        y_iris, random_state=42
    )
    estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf = StackingClassifier(
        estimators=estimators, final_estimator=rf, cv=5, passthrough=True
    )
    clf.fit(X_train, y_train)
    X_trans = clf.transform(X_test)
    assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
    assert sparse.issparse(X_trans)
    assert X_test.format == X_trans.format
Beispiel #24
0
def test_imputation_constant_float(array_constructor):
    # Test imputation using the constant strategy on floats
    X = np.array([[np.nan, 1.1, 0, np.nan], [1.2, np.nan, 1.3, np.nan],
                  [0, 0, np.nan, np.nan], [1.4, 1.5, 0, np.nan]])

    X_true = np.array([[-1, 1.1, 0, -1], [1.2, -1, 1.3, -1], [0, 0, -1, -1],
                       [1.4, 1.5, 0, -1]])

    X = array_constructor(X)

    X_true = array_constructor(X_true)

    imputer = SimpleImputer(strategy="constant", fill_value=-1)
    X_trans = imputer.fit_transform(X)

    assert_allclose_dense_sparse(X_trans, X_true)
def test_check_sparse_pandas_sp_format(sp_format):
    # check_array converts pandas dataframe with only sparse arrays into
    # sparse matrix
    pd = pytest.importorskip("pandas")
    sp_mat = _sparse_random_matrix(10, 3)

    sdf = pd.DataFrame.sparse.from_spmatrix(sp_mat)
    result = check_array(sdf, accept_sparse=sp_format)

    if sp_format is True:
        # by default pandas converts to coo when accept_sparse is True
        sp_format = 'coo'

    assert sp.issparse(result)
    assert result.format == sp_format
    assert_allclose_dense_sparse(sp_mat, result)
Beispiel #26
0
def test_stacking_regressor_sparse_passthrough(fmt):
    # Check passthrough behavior on a sparse X matrix
    X_train, X_test, y_train, _ = train_test_split(sparse.coo_matrix(
        scale(X_diabetes)).asformat(fmt),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [("lr", LinearRegression()), ("svr", LinearSVR())]
    rf = RandomForestRegressor(n_estimators=10, random_state=42)
    clf = StackingRegressor(estimators=estimators,
                            final_estimator=rf,
                            cv=5,
                            passthrough=True)
    clf.fit(X_train, y_train)
    X_trans = clf.transform(X_test)
    assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
    assert sparse.issparse(X_trans)
    assert X_test.format == X_trans.format
Beispiel #27
0
def test_assert_allclose_dense_sparse():
    x = np.arange(9).reshape(3, 3)
    msg = "Not equal to tolerance "
    y = sparse.csc_matrix(x)
    for X in [x, y]:
        # basic compare
        assert_raise_message(AssertionError, msg, assert_allclose_dense_sparse,
                             X, X * 2)
        assert_allclose_dense_sparse(X, X)

    assert_raise_message(ValueError, "Can only compare two sparse",
                         assert_allclose_dense_sparse, x, y)

    A = sparse.diags(np.ones(5), offsets=0).tocsr()
    B = sparse.csr_matrix(np.ones((1, 5)))

    assert_raise_message(AssertionError, "Arrays are not equal",
                         assert_allclose_dense_sparse, B, A)
Beispiel #28
0
def test_check_dataframe_mixed_float_dtypes():
    # pandas dataframe will coerce a boolean into a object, this is a mismatch
    # with np.result_type which will return a float
    # check_array needs to explicitly check for bool dtype in a dataframe for
    # this situation
    # https://github.com/scikit-learn/scikit-learn/issues/15787

    pd = importorskip("pandas")
    df = pd.DataFrame(
        {"int": [1, 2, 3], "float": [0, 0.1, 2.1], "bool": [True, False, True]},
        columns=["int", "float", "bool"],
    )

    array = check_array(df, dtype=(np.float64, np.float32, np.float16))
    expected_array = np.array(
        [[1.0, 0.0, 1.0], [2.0, 0.1, 0.0], [3.0, 2.1, 1.0]], dtype=float
    )
    assert_allclose_dense_sparse(array, expected_array)
Beispiel #29
0
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type,
                                        indices):
    columns_name = ['col_0', 'col_1', 'col_2']
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type,
                               columns_name)

    if isinstance(indices, str) and array_type != 'dataframe':
        err_msg = ("Specifying the columns using strings is only supported "
                   "for pandas DataFrames")
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, indices, axis=1)
    else:
        subset = _safe_indexing(array, indices, axis=1)
        expected_output = [3, 6, 9]
        if expected_output_type == 'sparse':
            # sparse matrix are keeping the 2D shape
            expected_output = [[3], [6], [9]]
        expected_array = _convert_container(expected_output,
                                            expected_output_type)
        assert_allclose_dense_sparse(subset, expected_array)
Beispiel #30
0
def test_check_fit_params(indices):
    X = np.random.randn(4, 2)
    fit_params = {
        "list": [1, 2, 3, 4],
        "array": np.array([1, 2, 3, 4]),
        "sparse-col": sp.csc_matrix([1, 2, 3, 4]).T,
        "sparse-row": sp.csc_matrix([1, 2, 3, 4]),
        "scalar-int": 1,
        "scalar-str": "xxx",
        "None": None,
    }
    result = _check_fit_params(X, fit_params, indices)
    indices_ = indices if indices is not None else list(range(X.shape[0]))

    for key in ["sparse-row", "scalar-int", "scalar-str", "None"]:
        assert result[key] is fit_params[key]

    assert result["list"] == _safe_indexing(fit_params["list"], indices_)
    assert_array_equal(result["array"], _safe_indexing(fit_params["array"], indices_))
    assert_allclose_dense_sparse(
        result["sparse-col"], _safe_indexing(fit_params["sparse-col"], indices_)
    )