def _dispatch_arg_type(self, function, args):
    comm_class = _DenseNCCLCommunicator
    if ((isinstance(args[0], (list, tuple)) and sparse.issparse(args[0][0]))
            or sparse.issparse(args[0])):
        comm_class = _SparseNCCLCommunicator
    getattr(comm_class, function)(self, *args)
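
# Illustrative sketch (not part of the original module): the dispatch above
# resolves a method by name on one of two implementation classes and calls it
# unbound, passing `self` explicitly. The toy classes below are hypothetical
# stand-ins for the NCCL communicator classes.
def _demo_dispatch_by_arg_type():
    import scipy.sparse as demo_sparse

    class _DenseImpl:
        def reduce(self, x):
            return 'dense path'

    class _SparseImpl:
        def reduce(self, x):
            return 'sparse path'

    class _Comm:
        def _dispatch(self, function, args):
            cls = _SparseImpl if demo_sparse.issparse(args[0]) else _DenseImpl
            # Unbound lookup on the class, `self` passed explicitly
            return getattr(cls, function)(self, *args)

    comm = _Comm()
    assert comm._dispatch('reduce', (demo_sparse.eye(3, format='csr'),)) \
        == 'sparse path'
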
def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9):
    """Check allclose for sparse and dense data.

    Both x and y need to be either sparse or dense, they
    can't be mixed.

    Parameters
    ----------
    x : array-like or sparse matrix
        First array to compare.
    y : array-like or sparse matrix
        Second array to compare.
    rtol : float, optional
        Relative tolerance; see numpy.allclose.
    atol : float, optional
        Absolute tolerance; see numpy.allclose.
        Note that the default here is more tolerant than the default for
        numpy.testing.assert_allclose, where atol=0.
    """
    if sp.issparse(x) and sp.issparse(y):
        x = x.tocsr()
        y = y.tocsr()
        x.sum_duplicates()
        y.sum_duplicates()
        return (cp.array_equal(x.indices, y.indices) and
                cp.array_equal(x.indptr, y.indptr) and
                cp.allclose(x.data, y.data, rtol=rtol, atol=atol))
    elif not sp.issparse(x) and not sp.issparse(y):
        return cp.allclose(x, y, rtol=rtol, atol=atol)
    raise ValueError("Can only compare two sparse matrices, not a sparse "
                     "matrix and an array")
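
# Usage sketch (illustrative only; assumes a CUDA device and that the
# module-level `cp`/`sp` are CuPy and cupyx.scipy.sparse):
def _demo_allclose_dense_sparse():
    import cupy as cp
    import cupyx.scipy.sparse as sp

    a = sp.random(10, 10, density=0.2, format='csr', dtype=cp.float32)
    b = a.copy()
    assert _allclose_dense_sparse(a, b)                      # both sparse
    assert _allclose_dense_sparse(a.todense(), b.todense())  # both dense
    try:
        _allclose_dense_sparse(a, b.todense())               # mixed inputs
    except ValueError:
        pass                                                 # as documented
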
def sparse_matmul2(a, b):
    """Matmul on two sparse or non-sparse matrices.

    This function calculates the gradient only on the non-sparse
    matrices.
    """
    if issparse(a) and issparse(b):
        return a.dot(b)
    elif issparse(a):
        return SparseMatmul2(a, None)(b)
    elif issparse(b):
        return SparseMatmul2(None, b)(a)
    else:
        return SparseMatmul2(None, None)(a, b)
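
# Usage sketch (illustrative only): when both operands are sparse the product
# is computed directly with ``a.dot(b)``; the other branches wrap the
# operation in ``SparseMatmul2`` (defined elsewhere) so autograd tracks
# gradients only for dense operands. This demo exercises the sparse-sparse
# branch and assumes ``issparse`` above recognizes SciPy matrices.
def _demo_sparse_matmul2():
    import scipy.sparse as demo_sparse

    a = demo_sparse.random(4, 3, density=0.5, format='csr')
    b = demo_sparse.random(3, 2, density=0.5, format='csr')
    c = sparse_matmul2(a, b)   # sparse @ sparse -> plain a.dot(b), no grad
    assert c.shape == (4, 2)
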
def _validate_input(self, X, in_fit):
    if not is_scalar_nan(self.missing_values):
        force_all_finite = True
    else:
        force_all_finite = "allow-nan"

    X = self._validate_data(X, reset=in_fit,
                            accept_sparse=('csc', 'csr'), dtype=None,
                            force_all_finite=force_all_finite)
    _check_inputs_dtype(X, self.missing_values)
    if X.dtype.kind not in ("i", "u", "f", "O"):
        raise ValueError("MissingIndicator does not support data with "
                         "dtype {0}. Please provide either a numeric array"
                         " (with a floating point or integer dtype) or "
                         "categorical data represented either as an array "
                         "with integer dtype or an array of string values "
                         "with an object dtype.".format(X.dtype))

    if sparse.issparse(X) and self.missing_values == 0:
        # missing_values = 0 not allowed with sparse data as it would
        # force densification
        raise ValueError("Sparse input with missing_values=0 is "
                         "not supported. Provide a dense "
                         "array instead.")

    return X
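
# The sparse + missing_values=0 rejection, shown via scikit-learn's
# MissingIndicator (the CPU original this validation is ported from);
# illustrative only.
def _demo_validate_input_sparse_zero():
    import scipy.sparse as demo_sparse
    from sklearn.impute import MissingIndicator as SkMissingIndicator

    X = demo_sparse.csr_matrix([[0.0, 1.0], [2.0, 0.0]])
    try:
        SkMissingIndicator(missing_values=0).fit(X)
    except ValueError:
        pass   # marking zeros as missing would force densification
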
def transform(self, X) -> SparseCumlArray:
    """Impute all missing values in X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data to complete.
    """
    check_is_fitted(self)

    X = self._validate_input(X, in_fit=False)
    X_indicator = super()._transform_indicator(X)

    statistics = self.statistics_

    if X.shape[1] != statistics.shape[0]:
        raise ValueError("X has %d features per sample, expected %d"
                         % (X.shape[1], self.statistics_.shape[0]))

    # Delete the invalid columns if strategy is not constant
    if self.strategy == "constant":
        valid_statistics = statistics
    else:
        # same as np.isnan but also works for object dtypes
        invalid_mask = _get_mask(statistics, np.nan)
        valid_mask = np.logical_not(invalid_mask)
        valid_statistics = statistics[valid_mask]
        valid_statistics_indexes = np.flatnonzero(valid_mask)

        if invalid_mask.any():
            missing = np.arange(X.shape[1])[invalid_mask]
            if self.verbose:
                warnings.warn("Deleting features without "
                              "observed values: %s" % missing)
            X = X[:, valid_statistics_indexes]

    # Do actual imputation
    if sparse.issparse(X):
        if self.missing_values == 0:
            raise ValueError("Imputation not possible when missing_values "
                             "== 0 and input is sparse. Provide a dense "
                             "array instead.")
        else:
            mask = _get_mask(X.data, self.missing_values)
            indexes = np.repeat(
                np.arange(len(X.indptr) - 1, dtype=int),
                np.diff(X.indptr).tolist())[mask]

            X.data[mask] = valid_statistics[indexes].astype(X.dtype,
                                                            copy=False)
    else:
        mask = _get_mask(X, self.missing_values)

        if self.strategy == "constant":
            X[mask] = valid_statistics[0]
        else:
            for i, vi in enumerate(valid_statistics_indexes):
                feature_idxs = np.flatnonzero(mask[:, vi])
                X[feature_idxs, vi] = valid_statistics[i]

    X = super()._concatenate_indicator(X, X_indicator)

    return X
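
# End-to-end fit/transform sketch using scikit-learn's SimpleImputer, whose
# behaviour this method ports to the GPU; illustrative only.
def _demo_impute_transform():
    import numpy as demo_np
    from sklearn.impute import SimpleImputer as SkSimpleImputer

    X = demo_np.array([[1.0, demo_np.nan], [3.0, 4.0], [5.0, 6.0]])
    imp = SkSimpleImputer(strategy='mean')
    Xt = imp.fit(X).transform(X)
    assert demo_np.allclose(imp.statistics_, [3.0, 5.0])  # per-column means
    assert demo_np.allclose(Xt[0], [1.0, 5.0])            # NaN -> column mean
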
def row_norms(X, squared=False):
    """Row-wise (squared) Euclidean norm of X.

    Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
    matrices. Performs no input validation.

    Parameters
    ----------
    X : array_like
        The input array.
    squared : bool, optional (default = False)
        If True, return squared norms.

    Returns
    -------
    array_like
        The row-wise (squared) Euclidean norm of X.
    """
    if sparse.issparse(X):
        if isinstance(X, (sparse.csr_matrix, sparse.csc_matrix,
                          sparse.coo_matrix)):
            X_copy = X.copy()
            X_copy.data = np.square(X_copy.data)
            norms = X_copy.sum(axis=1).squeeze()
        else:
            raise ValueError('Sparse matrix not compatible')
    else:
        norms = np.einsum('ij,ij->i', X, X)

    if not squared:
        np.sqrt(norms, norms)
    return norms
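
# Usage sketch (illustrative only; assumes the module-level `np`/`sparse`
# are CuPy and cupyx.scipy.sparse, as in cuML's GPU code paths):
def _demo_row_norms():
    import cupy as cp
    import cupyx.scipy.sparse as cu_sp

    X = cp.array([[3.0, 4.0], [0.0, 1.0]])
    assert cp.allclose(row_norms(X), cp.array([5.0, 1.0]))
    assert cp.allclose(row_norms(X, squared=True), cp.array([25.0, 1.0]))

    # The sparse path squares the stored values and sums per row.
    Xs = cu_sp.csr_matrix(X)
    assert cp.allclose(row_norms(Xs), cp.array([5.0, 1.0]))
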
def _get_missing_features_info(self, X):
    """Compute the imputer mask and the indices of the features
    containing missing values.

    Parameters
    ----------
    X : {ndarray or sparse matrix}, shape (n_samples, n_features)
        The input data with missing values. Note that ``X`` has been
        checked in ``fit`` and ``transform`` before calling this function.

    Returns
    -------
    imputer_mask : {ndarray or sparse matrix}, shape \
(n_samples, n_features)
        The imputer mask of the original data.

    features_with_missing : ndarray, shape (n_features_with_missing)
        The features containing missing values.
    """
    if sparse.issparse(X):
        mask = _get_mask(X.data, self.missing_values)

        # The imputer mask will be constructed with the same sparse format
        # as X.
        sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
                              else sparse.csc_matrix)
        # Temporarily use float32 instead of bool, since CuPy cannot
        # operate on bool sparse data as of now.
        imputer_mask = sparse_constructor(
            (mask, X.indices.copy(), X.indptr.copy()),
            shape=X.shape, dtype=np.float32)

        if self.features == 'missing-only':
            n_missing = imputer_mask.sum(axis=0)

        if self.sparse is False:
            imputer_mask = imputer_mask.toarray()
        elif imputer_mask.format == 'csr':
            imputer_mask = imputer_mask.tocsc()
    else:
        imputer_mask = _get_mask(X, self.missing_values)

        if self.features == 'missing-only':
            n_missing = imputer_mask.sum(axis=0)

        if self.sparse is True:
            imputer_mask = sparse.csc_matrix(imputer_mask)

    if self.features == 'all':
        features_indices = np.arange(X.shape[1])
    else:
        features_indices = np.flatnonzero(n_missing)

    return imputer_mask, features_indices
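
# The CPU analogue below (scikit-learn's MissingIndicator) exercises the same
# mask/feature-index logic that this GPU port implements; illustrative only.
def _demo_missing_features_info():
    import numpy as demo_np
    from sklearn.impute import MissingIndicator as SkMissingIndicator

    X = demo_np.array([[demo_np.nan, 1.0, 7.0],
                       [2.0, demo_np.nan, 8.0],
                       [3.0, 4.0, 9.0]])
    ind = SkMissingIndicator(features='missing-only')
    mask = ind.fit_transform(X)
    assert mask.shape == (3, 2)            # only columns 0 and 1 have NaNs
    assert list(ind.features_) == [0, 1]   # indices of features with missing
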
def fit(self, X, y=None) -> "SimpleImputer":
    """Fit the imputer on X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Input data, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.

    Returns
    -------
    self : SimpleImputer
    """
    if type(X) is list:
        X = np.asarray(X)

    X = self._validate_input(X, in_fit=True)
    super()._fit_indicator(X)

    # default fill_value is 0 for numerical input and "missing_value"
    # otherwise
    if self.fill_value is None:
        if X.dtype.kind in ("i", "u", "f"):
            fill_value = 0
        else:
            fill_value = "missing_value"
    else:
        fill_value = self.fill_value

    # fill_value should be numerical in case of numerical input
    if (self.strategy == "constant" and
            X.dtype.kind in ("i", "u", "f") and
            not isinstance(fill_value, numbers.Real)):
        raise ValueError("'fill_value'={0} is invalid. Expected a "
                         "numerical value when imputing numerical "
                         "data".format(fill_value))

    if sparse.issparse(X):
        # missing_values = 0 not allowed with sparse data as it would
        # force densification
        if self.missing_values == 0:
            raise ValueError("Imputation not possible when missing_values "
                             "== 0 and input is sparse. Provide a dense "
                             "array instead.")
        else:
            self.statistics_ = self._sparse_fit(X,
                                                self.strategy,
                                                self.missing_values,
                                                fill_value)
    else:
        self.statistics_ = self._dense_fit(X,
                                           self.strategy,
                                           self.missing_values,
                                           fill_value)

    return self
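
# Sketch of the "constant" strategy with the default fill_value derived in
# fit() above (0 for numeric data); scikit-learn analogue, illustrative only.
def _demo_impute_constant_fill():
    import numpy as demo_np
    from sklearn.impute import SimpleImputer as SkSimpleImputer

    X = demo_np.array([[1.0, demo_np.nan], [demo_np.nan, 4.0]])
    imp = SkSimpleImputer(strategy='constant')   # numeric input -> fill 0
    assert demo_np.allclose(imp.fit_transform(X),
                            [[1.0, 0.0], [0.0, 4.0]])
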
def _concatenate_indicator(self, X_imputed, X_indicator):
    """Concatenate indicator mask with the imputed data."""
    if not self.add_indicator:
        return X_imputed

    hstack = sparse.hstack if sparse.issparse(X_imputed) else np.hstack
    if X_indicator is None:
        raise ValueError(
            "Data from the missing indicator are not provided. Call "
            "_fit_indicator and _transform_indicator in the imputer "
            "implementation.")

    return hstack((X_imputed, X_indicator))
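
# Shape sketch: with add_indicator enabled, the boolean mask columns are
# appended after the imputed columns; scikit-learn analogue, illustrative only.
def _demo_concatenate_indicator():
    import numpy as demo_np
    from sklearn.impute import SimpleImputer as SkSimpleImputer

    X = demo_np.array([[1.0, demo_np.nan], [3.0, 4.0]])
    imp = SkSimpleImputer(strategy='mean', add_indicator=True)
    Xt = imp.fit_transform(X)
    assert Xt.shape == (2, 3)   # 2 imputed columns + 1 indicator column
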
def check_sparse(array, accept_sparse=False, accept_large_sparse=True):
    """Checks that the sparse array is valid.

    Parameters
    ----------
    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the
        input to be any format. False means that a sparse matrix input
        will raise an error.

    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be
        accepted only if its indices are stored with a 32-bit dtype.

    Returns
    -------
    None or raise error
    """
    if accept_sparse is True:
        return

    err_msg = ("This algorithm does not support sparse input in the "
               "current configuration.")

    is_sparse = cpu_sparse.issparse(array) or gpu_sparse.issparse(array)
    if is_sparse:
        if accept_sparse is False:
            raise ValueError(err_msg)

        if not accept_large_sparse:
            if array.indices.dtype != cp.int32 or \
                    array.indptr.dtype != cp.int32:
                raise ValueError(err_msg)

        if isinstance(accept_sparse, (tuple, list)):
            if array.format not in accept_sparse:
                raise ValueError(err_msg)
        elif array.format != accept_sparse:
            raise ValueError(err_msg)
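
# Usage sketch (illustrative only; assumes a CuPy environment where the
# module-level `gpu_sparse` is cupyx.scipy.sparse):
def _demo_check_sparse():
    import cupyx.scipy.sparse as cu_sp

    X = cu_sp.random(5, 5, density=0.2, format='csr')
    check_sparse(X, accept_sparse=['csr', 'csc'])   # OK: format in the list
    check_sparse(X, accept_sparse=True)             # OK: any format allowed
    try:
        check_sparse(X, accept_sparse=False)        # sparse input rejected
    except ValueError:
        pass
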
def to_output_type(array, output_type, order='F'):
    """Convert arrays while creating datasets for testing.

    Parameters
    ----------
    array : array
        Input array to convert
    output_type : string
        Type to convert to

    Returns
    -------
    Converted array
    """
    if output_type == 'scipy_csr':
        return cpu_sparse.csr_matrix(array.get())
    if output_type == 'scipy_csc':
        return cpu_sparse.csc_matrix(array.get())
    if output_type == 'scipy_coo':
        return cpu_sparse.coo_matrix(array.get())
    if output_type == 'cupy_csr':
        if array.format in ['csc', 'coo']:
            return array.tocsr()
        else:
            return array
    if output_type == 'cupy_csc':
        if array.format in ['csr', 'coo']:
            return array.tocsc()
        else:
            return array
    if output_type == 'cupy_coo':
        if array.format in ['csr', 'csc']:
            return array.tocoo()
        else:
            return array

    if cpu_sparse.issparse(array):
        if output_type == 'numpy':
            return array.todense()
        elif output_type == 'cupy':
            return cp.array(array.todense())
        else:
            array = array.todense()
    elif gpu_sparse.issparse(array):
        if output_type == 'numpy':
            return array.get().todense()
        elif output_type == 'cupy':
            return array.todense()
        else:
            array = array.todense()

    cuml_array = input_to_cuml_array(array, order=order)[0]
    if output_type == 'series' and len(array.shape) > 1:
        output_type = 'cudf'

    output = cuml_array.to_output(output_type)
    if output_type in ['dataframe', 'cudf']:
        renaming = {i: 'c' + str(i) for i in range(output.shape[1])}
        output = output.rename(columns=renaming)
    return output
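
# Round-trip sketch (illustrative only; assumes a CuPy/cuML environment and
# a GPU sparse matrix as input):
def _demo_to_output_type():
    import cupy as cp
    import cupyx.scipy.sparse as cu_sp
    import scipy.sparse as demo_cpu_sp

    X = cu_sp.random(4, 4, density=0.5, format='csr', dtype=cp.float32)
    Y = to_output_type(X, 'scipy_csr')   # device CSR -> host CSR
    assert demo_cpu_sp.isspmatrix_csr(Y)
    Z = to_output_type(X, 'cupy_coo')    # device CSR -> device COO
    assert Z.format == 'coo'
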
def check_array(array, accept_sparse=False, accept_large_sparse=True,
                dtype='numeric', order=None, copy=False,
                force_all_finite=True, ensure_2d=True, allow_nd=False,
                ensure_min_samples=1, ensure_min_features=1,
                warn_on_dtype=None, estimator=None):
    """Input validation on an array, list, sparse matrix or similar.

    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the array is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the
        input to be any format. False means that a sparse matrix input
        will raise an error.

    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be
        accepted only if its indices are stored with a 32-bit dtype.

    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is
        ensured about the memory layout of the output array; otherwise
        (copy=True) the memory layout of the returned array is kept as
        close as possible to the original array.

    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy
        might be triggered by a conversion.

    force_all_finite : boolean or 'allow-nan' (default=True)
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array.
          Values cannot be infinite.

    ensure_2d : boolean (default=True)
        Whether to raise a value error if array is not 2D.

    allow_nd : boolean (default=False)
        Whether to allow array.ndim > 2.

    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its
        first axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets. This
        check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting
        to 0 disables this check.

    estimator : unused parameter

    Returns
    -------
    array_converted : object
        The converted and validated array.
    """
    if dtype == 'numeric':
        dtype = numeric_types

    correct_dtype = check_dtype(array, dtype)

    if (not isinstance(array, (pdDataFrame, cuDataFrame)) and
            copy and not order and hasattr(array, 'flags')):
        if array.flags['F_CONTIGUOUS']:
            order = 'F'
        elif array.flags['C_CONTIGUOUS']:
            order = 'C'

    if not order:
        order = 'F'

    hasshape = hasattr(array, 'shape')
    if ensure_2d and hasshape:
        if len(array.shape) != 2:
            raise ValueError("Not 2D")

    if not allow_nd and hasshape:
        if len(array.shape) > 2:
            raise ValueError("More than 2 dimensions detected")

    if ensure_min_samples > 0 and hasshape:
        if array.shape[0] < ensure_min_samples:
            raise ValueError("Not enough samples")

    if ensure_min_features > 0 and hasshape and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) "
                             "while a minimum of %d is required."
                             % (n_features, array.shape,
                                ensure_min_features))

    is_sparse = cpu_sparse.issparse(array) or gpu_sparse.issparse(array)
    if is_sparse:
        check_sparse(array, accept_sparse, accept_large_sparse)
        if array.format == 'csr':
            new_array = gpu_csr_matrix(array, copy=copy)
        elif array.format == 'csc':
            new_array = gpu_csc_matrix(array, copy=copy)
        elif array.format == 'coo':
            new_array = gpu_coo_matrix(array, copy=copy)
        else:
            raise ValueError('Sparse matrix format not supported')
        check_finite(new_array.data, force_all_finite)
        if correct_dtype != new_array.dtype:
            new_array = new_array.astype(correct_dtype)
        return new_array
    else:
        X, n_rows, n_cols, dtype = input_to_cupy_array(array,
                                                       order=order,
                                                       deepcopy=copy,
                                                       fail_on_null=False)
        if correct_dtype != dtype:
            X = X.astype(correct_dtype)
        check_finite(X, force_all_finite)
        return X
def issparse(X):
    """Return True if X is a host (SciPy) or device (CuPy) sparse matrix."""
    return sp_sparse.issparse(X) or cu_sparse.issparse(X)
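
# Usage sketch (illustrative only; assumes the module-level
# `sp_sparse`/`cu_sparse` are scipy.sparse and cupyx.scipy.sparse):
def _demo_issparse():
    import numpy as demo_np
    import scipy.sparse as demo_sp

    assert issparse(demo_sp.eye(3, format='csr'))   # host sparse matrix
    assert not issparse(demo_np.eye(3))             # dense ndarray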