Ejemplo n.º 1
0
def test_check_array_on_mock_dataframe():
    arr = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]])
    mock_df = MockDataFrame(arr)
    checked_arr = check_array(mock_df)
    assert checked_arr.dtype == arr.dtype
    checked_arr = check_array(mock_df, dtype=np.float32)
    assert checked_arr.dtype == np.dtype(np.float32)
Ejemplo n.º 2
0
def test_check_array_warn_on_dtype_deprecation():
    X = np.asarray([[0.0], [1.0]])
    Y = np.asarray([[2.0], [3.0]])
    with pytest.warns(DeprecationWarning,
                      match="'warn_on_dtype' is deprecated"):
        check_array(X, warn_on_dtype=True)
    with pytest.warns(DeprecationWarning,
                      match="'warn_on_dtype' is deprecated"):
        check_X_y(X, Y, warn_on_dtype=True)
Ejemplo n.º 3
0
def test_check_array_pandas_dtype_object_conversion():
    # test that data-frame like objects with dtype object
    # get converted
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.object)
    X_df = MockDataFrame(X)
    assert check_array(X_df).dtype.kind == "f"
    assert check_array(X_df, ensure_2d=False).dtype.kind == "f"
    # smoke-test against dataframes with column named "dtype"
    X_df.dtype = "Hans"
    assert check_array(X_df, ensure_2d=False).dtype.kind == "f"
Ejemplo n.º 4
0
def test_check_array_series():
    # regression test that check_array works on pandas Series
    pd = importorskip("pandas")
    res = check_array(pd.Series([1, 2, 3]), ensure_2d=False)
    assert_array_equal(res, np.array([1, 2, 3]))

    # with categorical dtype (not a numpy dtype) (GH12699)
    s = pd.Series(['a', 'b', 'c']).astype('category')
    res = check_array(s, dtype=None, ensure_2d=False)
    assert_array_equal(res, np.array(['a', 'b', 'c'], dtype=object))
Ejemplo n.º 5
0
def test_check_dataframe_warns_on_dtype():
    # Check that warn_on_dtype also works for DataFrames.
    # https://github.com/scikit-learn/scikit-learn/issues/10948
    pd = importorskip("pandas")

    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], dtype=object)
    assert_warns_message(DataConversionWarning,
                         "Data with input dtype object were all converted to "
                         "float64.",
                         check_array,
                         df,
                         dtype=np.float64,
                         warn_on_dtype=True)
    assert_warns(DataConversionWarning,
                 check_array,
                 df,
                 dtype='numeric',
                 warn_on_dtype=True)
    with pytest.warns(None) as record:
        warnings.simplefilter("ignore", DeprecationWarning)  # 0.23
        check_array(df, dtype='object', warn_on_dtype=True)
    assert len(record) == 0

    # Also check that it raises a warning for mixed dtypes in a DataFrame.
    df_mixed = pd.DataFrame([['1', 2, 3], ['4', 5, 6]])
    assert_warns(DataConversionWarning,
                 check_array,
                 df_mixed,
                 dtype=np.float64,
                 warn_on_dtype=True)
    assert_warns(DataConversionWarning,
                 check_array,
                 df_mixed,
                 dtype='numeric',
                 warn_on_dtype=True)
    assert_warns(DataConversionWarning,
                 check_array,
                 df_mixed,
                 dtype=object,
                 warn_on_dtype=True)

    # Even with numerical dtypes, a conversion can be made because dtypes are
    # uniformized throughout the array.
    df_mixed_numeric = pd.DataFrame([[1., 2, 3], [4., 5, 6]])
    assert_warns(DataConversionWarning,
                 check_array,
                 df_mixed_numeric,
                 dtype='numeric',
                 warn_on_dtype=True)
    with pytest.warns(None) as record:
        warnings.simplefilter("ignore", DeprecationWarning)  # 0.23
        check_array(df_mixed_numeric.astype(int),
                    dtype='numeric',
                    warn_on_dtype=True)
    assert len(record) == 0
Ejemplo n.º 6
0
def test_check_array_force_all_finite_object():
    X = np.array([['a', 'b', np.nan]], dtype=object).T

    X_checked = check_array(X, dtype=None, force_all_finite='allow-nan')
    assert X is X_checked

    X_checked = check_array(X, dtype=None, force_all_finite=False)
    assert X is X_checked

    with pytest.raises(ValueError, match='Input contains NaN'):
        check_array(X, dtype=None, force_all_finite=True)
Ejemplo n.º 7
0
def test_check_array_accept_sparse_no_exception():
    X = [[1, 2], [3, 4]]
    X_csr = sp.csr_matrix(X)

    check_array(X_csr, accept_sparse=True)
    check_array(X_csr, accept_sparse='csr')
    check_array(X_csr, accept_sparse=['csr'])
    check_array(X_csr, accept_sparse=('csr', ))
Ejemplo n.º 8
0
def test_check_array_force_all_finite_valid(value, force_all_finite, retype):
    X = retype(np.arange(4).reshape(2, 2).astype(np.float))
    X[0, 0] = value
    X_checked = check_array(X,
                            force_all_finite=force_all_finite,
                            accept_sparse=True)
    assert_allclose_dense_sparse(X, X_checked)
Ejemplo n.º 9
0
def load_data(dtype=np.float32, order='C', random_state=13):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True,
                         shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    y = (data['target'] != 1).astype(np.int)

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 522911
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    # Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test
Ejemplo n.º 10
0
def test_check_input_false():
    X, y, _, _ = build_dataset(n_samples=20, n_features=10)
    X = check_array(X, order='F', dtype='float64')
    y = check_array(X, order='F', dtype='float64')
    clf = ElasticNet(selection='cyclic', tol=1e-8)
    # Check that no error is raised if data is provided in the right format
    clf.fit(X, y, check_input=False)
    # With check_input=False, an exhaustive check is not made on y but its
    # dtype is still cast in _preprocess_data to X's dtype. So the test should
    # pass anyway
    X = check_array(X, order='F', dtype='float32')
    clf.fit(X, y, check_input=False)
    # With no input checking, providing X in C order should result in false
    # computation
    X = check_array(X, order='C', dtype='float64')
    assert_raises(ValueError, clf.fit, X, y, check_input=False)
Ejemplo n.º 11
0
def test_ordering():
    # Check that ordering is enforced correctly by validation utilities.
    # We need to check each validation utility, because a 'copy' without
    # 'order=K' will kill the ordering.
    X = np.ones((10, 5))
    for A in X, X.T:
        for copy in (True, False):
            B = check_array(A, order='C', copy=copy)
            assert B.flags['C_CONTIGUOUS']
            B = check_array(A, order='F', copy=copy)
            assert B.flags['F_CONTIGUOUS']
            if copy:
                assert A is not B

    X = sp.csr_matrix(X)
    X.data = X.data[::-1]
    assert not X.data.flags['C_CONTIGUOUS']
Ejemplo n.º 12
0
def test_sparse_encode_input():
    n_components = 100
    rng = np.random.RandomState(0)
    V = rng.randn(n_components, n_features)  # random init
    V /= np.sum(V**2, axis=1)[:, np.newaxis]
    Xf = check_array(X, order='F')
    for algo in ('lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'):
        a = sparse_encode(X, V, algorithm=algo)
        b = sparse_encode(Xf, V, algorithm=algo)
        assert_array_almost_equal(a, b)
Ejemplo n.º 13
0
def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
    """Load the data, then cache and memmap the train/test split"""
    print("Loading dataset...")
    data = fetch_openml('mnist_784')

    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    if shuffle:
        X, y = _shuffle(X, y, random_state=seed)

    # Normalize features
    X /= 255
    return X, y
Ejemplo n.º 14
0
    def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
        X = check_array(X, accept_sparse=('csr', 'csc'))
        check_non_negative(X, "NMF (input X)")

        n_samples, n_features = X.shape
        n_components = self.n_components
        if n_components is None:
            n_components = n_features

        if (not isinstance(n_components, numbers.Integral) or
                n_components <= 0):
            raise ValueError("Number of components must be a positive integer;"
                             " got (n_components=%r)" % n_components)
        if (not isinstance(self.max_iter, numbers.Integral) or
                self.max_iter < 0):
            raise ValueError("Maximum number of iterations must be a positive "
                             "integer; got (max_iter=%r)" % self.max_iter)
        if not isinstance(self.tol, numbers.Number) or self.tol < 0:
            raise ValueError("Tolerance for stopping criteria must be "
                             "positive; got (tol=%r)" % self.tol)

        # check W and H, or initialize them
        if self.init == 'custom' and update_H:
            _check_init(H, (n_components, n_features), "NMF (input H)")
            _check_init(W, (n_samples, n_components), "NMF (input W)")
        elif not update_H:
            _check_init(H, (n_components, n_features), "NMF (input H)")
            W = np.zeros((n_samples, n_components))
        else:
            W, H = _initialize_nmf(X, n_components, init=self.init,
                                   random_state=self.random_state)

        if update_H:  # fit_transform
            W, H, n_iter = _fit_projected_gradient(
                X, W, H, self.tol, self.max_iter, self.nls_max_iter,
                self.alpha, self.l1_ratio)
        else:  # transform
            Wt, _, n_iter = _nls_subproblem(X.T, H.T, W.T, self.tol,
                                            self.nls_max_iter,
                                            alpha=self.alpha,
                                            l1_ratio=self.l1_ratio)
            W = Wt.T

        if n_iter == self.max_iter and self.tol > 0:
            warnings.warn("Maximum number of iteration %d reached. Increase it"
                          " to improve convergence." % self.max_iter,
                          ConvergenceWarning)

        return W, H, n_iter
Ejemplo n.º 15
0
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_openml('mnist_784')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 60000
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test
Ejemplo n.º 16
0
def test_check_array_dtype_stability():
    # test that lists with ints don't get converted to floats
    X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    assert check_array(X).dtype.kind == "i"
    assert check_array(X, ensure_2d=False).dtype.kind == "i"
Ejemplo n.º 17
0
 def fit(self, X, y=None):
     X = check_array(X, 'csc')
     self.n_input_feats = X.shape[1]
     return self
Ejemplo n.º 18
0
def test_check_array_memmap(copy):
    X = np.ones((4, 4))
    with TempMemmap(X, mmap_mode='r') as X_memmap:
        X_checked = check_array(X_memmap, copy=copy)
        assert np.may_share_memory(X_memmap, X_checked) == (not copy)
        assert X_checked.flags['WRITEABLE'] == copy
Ejemplo n.º 19
0
def test_check_array_force_all_finiteinvalid(value, force_all_finite,
                                             match_msg, retype):
    X = retype(np.arange(4).reshape(2, 2).astype(np.float))
    X[0, 0] = value
    with pytest.raises(ValueError, match=match_msg):
        check_array(X, force_all_finite=force_all_finite, accept_sparse=True)
Ejemplo n.º 20
0
def test_check_array_accept_large_sparse_no_exception(X_64bit):
    # When large sparse are allowed
    check_array(X_64bit, accept_large_sparse=True, accept_sparse=True)
Ejemplo n.º 21
0
def test_check_array():
    # accept_sparse == False
    # raise error on sparse inputs
    X = [[1, 2], [3, 4]]
    X_csr = sp.csr_matrix(X)
    assert_raises(TypeError, check_array, X_csr)
    # ensure_2d=False
    X_array = check_array([0, 1, 2], ensure_2d=False)
    assert X_array.ndim == 1
    # ensure_2d=True with 1d array
    assert_raise_message(ValueError,
                         'Expected 2D array, got 1D array instead',
                         check_array, [0, 1, 2],
                         ensure_2d=True)
    # ensure_2d=True with scalar array
    assert_raise_message(ValueError,
                         'Expected 2D array, got scalar array instead',
                         check_array,
                         10,
                         ensure_2d=True)
    # don't allow ndim > 3
    X_ndim = np.arange(8).reshape(2, 2, 2)
    assert_raises(ValueError, check_array, X_ndim)
    check_array(X_ndim, allow_nd=True)  # doesn't raise

    # dtype and order enforcement.
    X_C = np.arange(4).reshape(2, 2).copy("C")
    X_F = X_C.copy("F")
    X_int = X_C.astype(np.int)
    X_float = X_C.astype(np.float)
    Xs = [X_C, X_F, X_int, X_float]
    dtypes = [np.int32, np.int, np.float, np.float32, None, np.bool, object]
    orders = ['C', 'F', None]
    copys = [True, False]

    for X, dtype, order, copy in product(Xs, dtypes, orders, copys):
        X_checked = check_array(X, dtype=dtype, order=order, copy=copy)
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            assert X_checked.dtype == X.dtype
        if order == 'C':
            assert X_checked.flags['C_CONTIGUOUS']
            assert not X_checked.flags['F_CONTIGUOUS']
        elif order == 'F':
            assert X_checked.flags['F_CONTIGUOUS']
            assert not X_checked.flags['C_CONTIGUOUS']
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if (X.dtype == X_checked.dtype and X_checked.flags['C_CONTIGUOUS']
                    == X.flags['C_CONTIGUOUS']
                    and X_checked.flags['F_CONTIGUOUS']
                    == X.flags['F_CONTIGUOUS']):
                assert X is X_checked

    # allowed sparse != None
    X_csc = sp.csc_matrix(X_C)
    X_coo = X_csc.tocoo()
    X_dok = X_csc.todok()
    X_int = X_csc.astype(np.int)
    X_float = X_csc.astype(np.float)

    Xs = [X_csc, X_coo, X_dok, X_int, X_float]
    accept_sparses = [['csr', 'coo'], ['coo', 'dok']]
    for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses,
                                                 copys):
        with warnings.catch_warnings(record=True) as w:
            X_checked = check_array(X,
                                    dtype=dtype,
                                    accept_sparse=accept_sparse,
                                    copy=copy)
        if (dtype is object or sp.isspmatrix_dok(X)) and len(w):
            message = str(w[0].message)
            messages = [
                "object dtype is not supported by sparse matrices",
                "Can't check dok sparse matrix for nan or inf."
            ]
            assert message in messages
        else:
            assert len(w) == 0
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            assert X_checked.dtype == X.dtype
        if X.format in accept_sparse:
            # no change if allowed
            assert X.format == X_checked.format
        else:
            # got converted
            assert X_checked.format == accept_sparse[0]
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if X.dtype == X_checked.dtype and X.format == X_checked.format:
                assert X is X_checked

    # other input formats
    # convert lists to arrays
    X_dense = check_array([[1, 2], [3, 4]])
    assert isinstance(X_dense, np.ndarray)
    # raise on too deep lists
    assert_raises(ValueError, check_array, X_ndim.tolist())
    check_array(X_ndim.tolist(), allow_nd=True)  # doesn't raise
    # convert weird stuff to arrays
    X_no_array = NotAnArray(X_dense)
    result = check_array(X_no_array)
    assert isinstance(result, np.ndarray)

    # deprecation warning if string-like array with dtype="numeric"
    expected_warn_regex = r"converted to decimal numbers if dtype='numeric'"
    X_str = [['11', '12'], ['13', 'xx']]
    for X in [X_str, np.array(X_str, dtype='U'), np.array(X_str, dtype='S')]:
        with pytest.warns(FutureWarning, match=expected_warn_regex):
            check_array(X, dtype="numeric")

    # deprecation warning if byte-like array with dtype="numeric"
    X_bytes = [[b'a', b'b'], [b'c', b'd']]
    for X in [X_bytes, np.array(X_bytes, dtype='V1')]:
        with pytest.warns(FutureWarning, match=expected_warn_regex):
            check_array(X, dtype="numeric")
Ejemplo n.º 22
0
def test_check_array_dtype_warning():
    X_int_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    X_float64 = np.asarray(X_int_list, dtype=np.float64)
    X_float32 = np.asarray(X_int_list, dtype=np.float32)
    X_int64 = np.asarray(X_int_list, dtype=np.int64)
    X_csr_float64 = sp.csr_matrix(X_float64)
    X_csr_float32 = sp.csr_matrix(X_float32)
    X_csc_float32 = sp.csc_matrix(X_float32)
    X_csc_int32 = sp.csc_matrix(X_int64, dtype=np.int32)
    y = [0, 0, 1]
    integer_data = [X_int64, X_csc_int32]
    float64_data = [X_float64, X_csr_float64]
    float32_data = [X_float32, X_csr_float32, X_csc_float32]
    for X in integer_data:
        X_checked = assert_no_warnings(check_array,
                                       X,
                                       dtype=np.float64,
                                       accept_sparse=True)
        assert X_checked.dtype == np.float64

        X_checked = assert_warns(DataConversionWarning,
                                 check_array,
                                 X,
                                 dtype=np.float64,
                                 accept_sparse=True,
                                 warn_on_dtype=True)
        assert X_checked.dtype == np.float64

        # Check that the warning message includes the name of the Estimator
        X_checked = assert_warns_message(DataConversionWarning,
                                         'SomeEstimator',
                                         check_array,
                                         X,
                                         dtype=[np.float64, np.float32],
                                         accept_sparse=True,
                                         warn_on_dtype=True,
                                         estimator='SomeEstimator')
        assert X_checked.dtype == np.float64

        X_checked, y_checked = assert_warns_message(
            DataConversionWarning,
            'KNeighborsClassifier',
            check_X_y,
            X,
            y,
            dtype=np.float64,
            accept_sparse=True,
            warn_on_dtype=True,
            estimator=KNeighborsClassifier())

        assert X_checked.dtype == np.float64

    for X in float64_data:
        with pytest.warns(None) as record:
            warnings.simplefilter("ignore", DeprecationWarning)  # 0.23
            X_checked = check_array(X,
                                    dtype=np.float64,
                                    accept_sparse=True,
                                    warn_on_dtype=True)
            assert X_checked.dtype == np.float64
            X_checked = check_array(X,
                                    dtype=np.float64,
                                    accept_sparse=True,
                                    warn_on_dtype=False)
            assert X_checked.dtype == np.float64
        assert len(record) == 0

    for X in float32_data:
        X_checked = assert_no_warnings(check_array,
                                       X,
                                       dtype=[np.float64, np.float32],
                                       accept_sparse=True)
        assert X_checked.dtype == np.float32
        assert X_checked is X

        X_checked = assert_no_warnings(check_array,
                                       X,
                                       dtype=[np.float64, np.float32],
                                       accept_sparse=['csr', 'dok'],
                                       copy=True)
        assert X_checked.dtype == np.float32
        assert X_checked is not X

    X_checked = assert_no_warnings(check_array,
                                   X_csc_float32,
                                   dtype=[np.float64, np.float32],
                                   accept_sparse=['csr', 'dok'],
                                   copy=False)
    assert X_checked.dtype == np.float32
    assert X_checked is not X_csc_float32
    assert X_checked.format == 'csr'