def test_check_array_warn_on_dtype_deprecation():
    """Passing ``warn_on_dtype`` to either validator emits a FutureWarning."""
    X = np.asarray([[0.0], [1.0]])
    Y = np.asarray([[2.0], [3.0]])
    deprecation_msg = "'warn_on_dtype' is deprecated"

    with pytest.warns(FutureWarning, match=deprecation_msg):
        check_array(X, warn_on_dtype=True)

    with pytest.warns(FutureWarning, match=deprecation_msg):
        check_X_y(X, Y, warn_on_dtype=True)
def test_check_array_on_mock_dataframe():
    """A mock dataframe keeps its dtype unless a dtype is requested."""
    values = np.array([[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]])
    frame = MockDataFrame(values)

    # Without an explicit dtype the underlying array's dtype is preserved.
    untouched = check_array(frame)
    assert untouched.dtype == values.dtype

    # An explicit dtype request is honoured.
    as_float32 = check_array(frame, dtype=np.float32)
    assert as_float32.dtype == np.dtype(np.float32)
def test_check_array_series():
    """Regression test: check_array accepts pandas Series."""
    pd = importorskip("pandas")

    numeric = pd.Series([1, 2, 3])
    converted = check_array(numeric, ensure_2d=False)
    assert_array_equal(converted, np.array([1, 2, 3]))

    # with categorical dtype (not a numpy dtype) (GH12699)
    categorical = pd.Series(['a', 'b', 'c']).astype('category')
    converted = check_array(categorical, dtype=None, ensure_2d=False)
    assert_array_equal(converted, np.array(['a', 'b', 'c'], dtype=object))
def test_check_array_pandas_dtype_object_conversion():
    """Dataframe-like objects with dtype object get converted to floats."""
    # The builtin ``object`` is used instead of the ``np.object`` alias:
    # the alias was only ever a synonym for the builtin and is no longer
    # available in current NumPy releases.
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=object)
    X_df = MockDataFrame(X)
    assert check_array(X_df).dtype.kind == "f"
    assert check_array(X_df, ensure_2d=False).dtype.kind == "f"
    # smoke-test against dataframes with column named "dtype"
    X_df.dtype = "Hans"
    assert check_array(X_df, ensure_2d=False).dtype.kind == "f"
def test_check_dataframe_warns_on_dtype():
    """Check that warn_on_dtype also works for DataFrames.

    https://github.com/scikit-learn/scikit-learn/issues/10948

    The "no warning expected" sections record warnings with
    ``warnings.catch_warnings(record=True)`` rather than
    ``pytest.warns(None)``, which current pytest releases no longer accept.
    """
    pd = importorskip("pandas")

    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], dtype=object)
    assert_warns_message(DataConversionWarning,
                         "Data with input dtype object were all converted to "
                         "float64.",
                         check_array, df, dtype=np.float64, warn_on_dtype=True)
    assert_warns(DataConversionWarning, check_array, df,
                 dtype='numeric', warn_on_dtype=True)
    with warnings.catch_warnings(record=True) as record:
        # Record everything except the FutureWarning about the deprecated
        # ``warn_on_dtype`` argument itself.
        warnings.simplefilter("always")
        warnings.simplefilter("ignore", FutureWarning)  # 0.23
        check_array(df, dtype='object', warn_on_dtype=True)
    assert len(record) == 0

    # Also check that it raises a warning for mixed dtypes in a DataFrame.
    df_mixed = pd.DataFrame([['1', 2, 3], ['4', 5, 6]])
    assert_warns(DataConversionWarning, check_array, df_mixed,
                 dtype=np.float64, warn_on_dtype=True)
    assert_warns(DataConversionWarning, check_array, df_mixed,
                 dtype='numeric', warn_on_dtype=True)
    assert_warns(DataConversionWarning, check_array, df_mixed,
                 dtype=object, warn_on_dtype=True)

    # Even with numerical dtypes, a conversion can be made because dtypes are
    # uniformized throughout the array.
    df_mixed_numeric = pd.DataFrame([[1., 2, 3], [4., 5, 6]])
    assert_warns(DataConversionWarning, check_array, df_mixed_numeric,
                 dtype='numeric', warn_on_dtype=True)
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        warnings.simplefilter("ignore", FutureWarning)  # 0.23
        check_array(df_mixed_numeric.astype(int),
                    dtype='numeric', warn_on_dtype=True)
    assert len(record) == 0
def test_check_array_force_all_finite_object():
    """Object arrays holding NaN are returned as-is unless finiteness is forced."""
    X = np.array([['a', 'b', np.nan]], dtype=object).T

    # With either permissive setting the very same object must come back.
    for permissive in ('allow-nan', False):
        X_checked = check_array(X, dtype=None, force_all_finite=permissive)
        assert X is X_checked

    with pytest.raises(ValueError, match='Input contains NaN'):
        check_array(X, dtype=None, force_all_finite=True)
def test_check_array_accept_sparse_no_exception():
    """A CSR matrix passes for every accepted spelling of ``accept_sparse``."""
    X_csr = sp.csr_matrix([[1, 2], [3, 4]])

    # bool, bare string, list and tuple forms, in that order
    for accepted in (True, 'csr', ['csr'], ('csr', )):
        check_array(X_csr, accept_sparse=accepted)
def test_check_array_force_all_finite_valid(value, force_all_finite, retype):
    """check_array leaves the data unchanged for a permitted special value.

    ``value`` is written into ``X[0, 0]``, ``force_all_finite`` is forwarded
    to ``check_array`` and ``retype`` wraps the dense array before checking.
    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator that is not visible here -- confirm.
    """
    # ``np.float64`` replaces the removed ``np.float`` alias (same dtype).
    X = retype(np.arange(4).reshape(2, 2).astype(np.float64))
    X[0, 0] = value
    X_checked = check_array(X, force_all_finite=force_all_finite,
                            accept_sparse=True)
    assert_allclose_dense_sparse(X, X_checked)
def test_check_input_false():
    """Exercise ElasticNet.fit with ``check_input=False``."""
    X, y, _, _ = build_dataset(n_samples=20, n_features=10)
    X = check_array(X, order='F', dtype='float64')
    # NOTE(review): the target is derived from X, not from the generated y.
    # This looks unintended -- confirm before changing it.
    y = check_array(X, order='F', dtype='float64')
    enet = ElasticNet(selection='cyclic', tol=1e-8)

    # Check that no error is raised if data is provided in the right format
    enet.fit(X, y, check_input=False)

    # With check_input=False, an exhaustive check is not made on y but its
    # dtype is still cast in _preprocess_data to X's dtype. So the test should
    # pass anyway
    X = check_array(X, order='F', dtype='float32')
    enet.fit(X, y, check_input=False)

    # With no input checking, providing X in C order should result in false
    # computation
    X = check_array(X, order='C', dtype='float64')
    assert_raises(ValueError, enet.fit, X, y, check_input=False)
def test_ordering():
    """Check that ordering is enforced correctly by validation utilities.

    We need to check each validation utility, because a 'copy' without
    'order=K' will kill the ordering.
    """
    X = np.ones((10, 5))
    for candidate in (X, X.T):
        for copy in (True, False):
            c_ordered = check_array(candidate, order='C', copy=copy)
            assert c_ordered.flags['C_CONTIGUOUS']

            f_ordered = check_array(candidate, order='F', copy=copy)
            assert f_ordered.flags['F_CONTIGUOUS']

            if copy:
                assert candidate is not f_ordered

    # A reversed view of the data buffer is no longer C-contiguous.
    X = sp.csr_matrix(X)
    X.data = X.data[::-1]
    assert not X.data.flags['C_CONTIGUOUS']
def test_sparse_encode_input():
    """sparse_encode gives the same codes for X and its Fortran-ordered copy."""
    n_components = 100
    rng = np.random.RandomState(0)

    # random init, each row then divided by its sum of squares
    dictionary = rng.randn(n_components, n_features)
    dictionary /= np.sum(dictionary ** 2, axis=1)[:, np.newaxis]

    X_fortran = check_array(X, order='F')
    algorithms = ('lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold')
    for algo in algorithms:
        code_default = sparse_encode(X, dictionary, algorithm=algo)
        code_fortran = sparse_encode(X_fortran, dictionary, algorithm=algo)
        assert_array_almost_equal(code_default, code_fortran)
def test_check_dataframe_mixed_float_dtypes():
    """A dataframe mixing int, float and bool columns is converted to float.

    pandas dataframe will coerce a boolean into a object, this is a mismatch
    with np.result_type which will return a float. check_array needs to
    explicitly check for bool dtype in a dataframe for this situation.
    https://github.com/scikit-learn/scikit-learn/issues/15787
    """
    pd = importorskip("pandas")
    df = pd.DataFrame(
        {
            'int': [1, 2, 3],
            'float': [0, 0.1, 2.1],
            'bool': [True, False, True],
        },
        columns=['int', 'float', 'bool'])

    array = check_array(df, dtype=(np.float64, np.float32, np.float16))
    # ``np.float64`` replaces the removed ``np.float`` alias (same dtype).
    expected_array = np.array(
        [[1.0, 0.0, 1.0],
         [2.0, 0.1, 0.0],
         [3.0, 2.1, 1.0]],
        dtype=np.float64)
    assert_allclose_dense_sparse(array, expected_array)
def test_check_array_memmap(copy):
    """Memory sharing and writeability of a checked memmap follow ``copy``."""
    dense = np.ones((4, 4))
    with TempMemmap(dense, mmap_mode='r') as X_memmap:
        X_checked = check_array(X_memmap, copy=copy)

        shares_memory = np.may_share_memory(X_memmap, X_checked)
        assert shares_memory == (not copy)

        is_writeable = X_checked.flags['WRITEABLE']
        assert is_writeable == copy
def test_check_array_force_all_finiteinvalid(value, force_all_finite,
                                             match_msg, retype):
    """check_array raises ValueError for a forbidden special value.

    ``value`` is written into ``X[0, 0]``, ``force_all_finite`` is forwarded
    to ``check_array``, ``match_msg`` is the expected error pattern and
    ``retype`` wraps the dense array before checking.
    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator that is not visible here -- confirm.
    """
    # ``np.float64`` replaces the removed ``np.float`` alias (same dtype).
    X = retype(np.arange(4).reshape(2, 2).astype(np.float64))
    X[0, 0] = value
    with pytest.raises(ValueError, match=match_msg):
        check_array(X, force_all_finite=force_all_finite,
                    accept_sparse=True)
def test_check_array_force_all_finite_object_unsafe_casting(
        X, err_msg, force_all_finite):
    """Casting to an integer dtype must fail whatever ``force_all_finite`` is.

    casting a float array containing NaN or inf to int dtype should
    raise an error irrespective of the force_all_finite parameter.
    """
    # The builtin ``int`` replaces the removed ``np.int`` alias, which was
    # only ever a synonym for it.
    with pytest.raises(ValueError, match=err_msg):
        check_array(X, dtype=int, force_all_finite=force_all_finite)
def test_check_array_accept_large_sparse_no_exception(X_64bit):
    """No error is raised when large sparse input is explicitly allowed."""
    # When large sparse are allowed
    check_array(
        X_64bit,
        accept_sparse=True,
        accept_large_sparse=True,
    )
def test_check_array():
    """Exercise check_array on dense, sparse and miscellaneous inputs.

    Covers rejection of sparse input by default, ``ensure_2d`` / ``allow_nd``
    handling, dtype and memory-order enforcement, sparse format conversion,
    conversion of list-like inputs, and the FutureWarning raised for string
    or bytes data combined with ``dtype="numeric"``.

    The builtins ``int``, ``float`` and ``bool`` are used where the removed
    ``np.int``, ``np.float`` and ``np.bool`` aliases used to appear; the
    aliases were only ever synonyms for those builtins.
    """
    # accept_sparse == False
    # raise error on sparse inputs
    X = [[1, 2], [3, 4]]
    X_csr = sp.csr_matrix(X)
    assert_raises(TypeError, check_array, X_csr)
    # ensure_2d=False
    X_array = check_array([0, 1, 2], ensure_2d=False)
    assert X_array.ndim == 1
    # ensure_2d=True with 1d array
    assert_raise_message(ValueError, 'Expected 2D array, got 1D array instead',
                         check_array, [0, 1, 2], ensure_2d=True)
    # ensure_2d=True with scalar array
    assert_raise_message(ValueError,
                         'Expected 2D array, got scalar array instead',
                         check_array, 10, ensure_2d=True)
    # don't allow ndim > 2 unless allow_nd=True
    X_ndim = np.arange(8).reshape(2, 2, 2)
    assert_raises(ValueError, check_array, X_ndim)
    check_array(X_ndim, allow_nd=True)  # doesn't raise

    # dtype and order enforcement.
    X_C = np.arange(4).reshape(2, 2).copy("C")
    X_F = X_C.copy("F")
    X_int = X_C.astype(int)
    X_float = X_C.astype(float)
    Xs = [X_C, X_F, X_int, X_float]
    dtypes = [np.int32, int, float, np.float32, None, bool, object]
    orders = ['C', 'F', None]
    copys = [True, False]

    for X, dtype, order, copy in product(Xs, dtypes, orders, copys):
        X_checked = check_array(X, dtype=dtype, order=order, copy=copy)
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            assert X_checked.dtype == X.dtype
        if order == 'C':
            assert X_checked.flags['C_CONTIGUOUS']
            assert not X_checked.flags['F_CONTIGUOUS']
        elif order == 'F':
            assert X_checked.flags['F_CONTIGUOUS']
            assert not X_checked.flags['C_CONTIGUOUS']
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if (X.dtype == X_checked.dtype and
                    X_checked.flags['C_CONTIGUOUS'] == X.flags['C_CONTIGUOUS']
                    and X_checked.flags['F_CONTIGUOUS']
                    == X.flags['F_CONTIGUOUS']):
                assert X is X_checked

    # allowed sparse != None
    X_csc = sp.csc_matrix(X_C)
    X_coo = X_csc.tocoo()
    X_dok = X_csc.todok()
    X_int = X_csc.astype(int)
    X_float = X_csc.astype(float)

    Xs = [X_csc, X_coo, X_dok, X_int, X_float]
    accept_sparses = [['csr', 'coo'], ['coo', 'dok']]
    for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses,
                                                 copys):
        with warnings.catch_warnings(record=True) as w:
            X_checked = check_array(X, dtype=dtype,
                                    accept_sparse=accept_sparse, copy=copy)
        if (dtype is object or sp.isspmatrix_dok(X)) and len(w):
            message = str(w[0].message)
            messages = [
                "object dtype is not supported by sparse matrices",
                "Can't check dok sparse matrix for nan or inf."
            ]
            assert message in messages
        else:
            assert len(w) == 0
        if dtype is not None:
            assert X_checked.dtype == dtype
        else:
            assert X_checked.dtype == X.dtype
        if X.format in accept_sparse:
            # no change if allowed
            assert X.format == X_checked.format
        else:
            # got converted
            assert X_checked.format == accept_sparse[0]
        if copy:
            assert X is not X_checked
        else:
            # doesn't copy if it was already good
            if X.dtype == X_checked.dtype and X.format == X_checked.format:
                assert X is X_checked

    # other input formats
    # convert lists to arrays
    X_dense = check_array([[1, 2], [3, 4]])
    assert isinstance(X_dense, np.ndarray)
    # raise on too deep lists
    assert_raises(ValueError, check_array, X_ndim.tolist())
    check_array(X_ndim.tolist(), allow_nd=True)  # doesn't raise
    # convert weird stuff to arrays
    X_no_array = _NotAnArray(X_dense)
    result = check_array(X_no_array)
    assert isinstance(result, np.ndarray)

    # deprecation warning if string-like array with dtype="numeric"
    expected_warn_regex = r"converted to decimal numbers if dtype='numeric'"
    X_str = [['11', '12'], ['13', 'xx']]
    for X in [X_str, np.array(X_str, dtype='U'), np.array(X_str, dtype='S')]:
        with pytest.warns(FutureWarning, match=expected_warn_regex):
            check_array(X, dtype="numeric")

    # deprecation warning if byte-like array with dtype="numeric"
    X_bytes = [[b'a', b'b'], [b'c', b'd']]
    for X in [X_bytes, np.array(X_bytes, dtype='V1')]:
        with pytest.warns(FutureWarning, match=expected_warn_regex):
            check_array(X, dtype="numeric")
def test_check_array_dtype_warning():
    """Check when ``warn_on_dtype`` produces a DataConversionWarning.

    Integer inputs converted to float64 must warn (and mention the estimator
    when one is given); float64 and float32 inputs that already match an
    accepted dtype must not.

    The "no warning expected" section records warnings with
    ``warnings.catch_warnings(record=True)`` rather than
    ``pytest.warns(None)``, which current pytest releases no longer accept.
    """
    X_int_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    X_float64 = np.asarray(X_int_list, dtype=np.float64)
    X_float32 = np.asarray(X_int_list, dtype=np.float32)
    X_int64 = np.asarray(X_int_list, dtype=np.int64)
    X_csr_float64 = sp.csr_matrix(X_float64)
    X_csr_float32 = sp.csr_matrix(X_float32)
    X_csc_float32 = sp.csc_matrix(X_float32)
    X_csc_int32 = sp.csc_matrix(X_int64, dtype=np.int32)
    y = [0, 0, 1]
    integer_data = [X_int64, X_csc_int32]
    float64_data = [X_float64, X_csr_float64]
    float32_data = [X_float32, X_csr_float32, X_csc_float32]

    for X in integer_data:
        X_checked = assert_no_warnings(check_array, X, dtype=np.float64,
                                       accept_sparse=True)
        assert X_checked.dtype == np.float64

        X_checked = assert_warns(DataConversionWarning, check_array, X,
                                 dtype=np.float64,
                                 accept_sparse=True, warn_on_dtype=True)
        assert X_checked.dtype == np.float64

        # Check that the warning message includes the name of the Estimator
        X_checked = assert_warns_message(DataConversionWarning,
                                         'SomeEstimator',
                                         check_array, X,
                                         dtype=[np.float64, np.float32],
                                         accept_sparse=True,
                                         warn_on_dtype=True,
                                         estimator='SomeEstimator')
        assert X_checked.dtype == np.float64

        X_checked, _ = assert_warns_message(
            DataConversionWarning, 'KNeighborsClassifier',
            check_X_y, X, y, dtype=np.float64, accept_sparse=True,
            warn_on_dtype=True, estimator=KNeighborsClassifier())

        assert X_checked.dtype == np.float64

    for X in float64_data:
        with warnings.catch_warnings(record=True) as record:
            # Record everything except the FutureWarning about the
            # deprecated ``warn_on_dtype`` argument itself.
            warnings.simplefilter("always")
            warnings.simplefilter("ignore", FutureWarning)  # 0.23
            X_checked = check_array(X, dtype=np.float64,
                                    accept_sparse=True, warn_on_dtype=True)
            assert X_checked.dtype == np.float64
            X_checked = check_array(X, dtype=np.float64,
                                    accept_sparse=True, warn_on_dtype=False)
            assert X_checked.dtype == np.float64
        assert len(record) == 0

    for X in float32_data:
        X_checked = assert_no_warnings(check_array, X,
                                       dtype=[np.float64, np.float32],
                                       accept_sparse=True)
        assert X_checked.dtype == np.float32
        assert X_checked is X

        X_checked = assert_no_warnings(check_array, X,
                                       dtype=[np.float64, np.float32],
                                       accept_sparse=['csr', 'dok'],
                                       copy=True)
        assert X_checked.dtype == np.float32
        assert X_checked is not X

    # A CSC input must be converted to the first accepted format even when
    # no copy is requested.
    X_checked = assert_no_warnings(check_array, X_csc_float32,
                                   dtype=[np.float64, np.float32],
                                   accept_sparse=['csr', 'dok'],
                                   copy=False)
    assert X_checked.dtype == np.float32
    assert X_checked is not X_csc_float32
    assert X_checked.format == 'csr'
def test_check_array_dtype_stability():
    """Lists of ints must keep an integer dtype (no silent float upcast)."""
    int_rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

    default_kind = check_array(int_rows).dtype.kind
    assert default_kind == "i"

    relaxed_kind = check_array(int_rows, ensure_2d=False).dtype.kind
    assert relaxed_kind == "i"
def fit(self, X, y=None):
    """Validate ``X`` and record its number of columns.

    Parameters
    ----------
    X : array-like or sparse matrix
        Input data; validated with ``check_array`` using
        ``accept_sparse='csc'``.
    y : ignored
        Not used by this method.

    Returns
    -------
    self
        The instance, with ``n_input_feats`` set to ``X.shape[1]``.
    """
    # Pass ``accept_sparse`` by keyword: same value as the former positional
    # argument, but it no longer relies on the parameter's position.
    X = check_array(X, accept_sparse='csc')
    self.n_input_feats = X.shape[1]
    return self
def test_check_array_pandas_dtype_casting():
    """Data-frames with homogeneous dtype are not upcast by check_array."""
    pd = pytest.importorskip('pandas')

    def expect_dtypes(frame, plain, as_float):
        # Check the dtype with the default and with FLOAT_DTYPES requested.
        assert check_array(frame).dtype == plain
        assert check_array(frame, dtype=FLOAT_DTYPES).dtype == as_float

    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
    X_df = pd.DataFrame(X)
    expect_dtypes(X_df, np.float32, np.float32)

    X_df.iloc[:, 0] = X_df.iloc[:, 0].astype(np.float16)
    assert_array_equal(X_df.dtypes,
                       (np.float16, np.float32, np.float32))
    expect_dtypes(X_df, np.float32, np.float32)

    # float16, int16, float32 casts to float32
    X_df.iloc[:, 1] = X_df.iloc[:, 1].astype(np.int16)
    expect_dtypes(X_df, np.float32, np.float32)

    # float16, int16, float16 casts to float32
    X_df.iloc[:, 2] = X_df.iloc[:, 2].astype(np.float16)
    expect_dtypes(X_df, np.float32, np.float32)

    # we're not using upcasting rules for determining
    # the target type yet, so we cast to the default of float64
    X_df = X_df.astype(np.int16)
    expect_dtypes(X_df, np.int16, np.float64)

    # check that we handle pandas dtypes in a semi-reasonable way
    # this is actually tricky because we can't really know that this
    # should be integer ahead of converting it.
    cat_df = pd.DataFrame([pd.Categorical([1, 2, 3])])
    expect_dtypes(cat_df, np.int64, np.float64)