Example 1
    def _validate_input(self, X, in_fit):
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"
        X = self._validate_data(X,
                                reset=in_fit,
                                accept_sparse=('csc', 'csr'),
                                dtype=None,
                                force_all_finite=force_all_finite)
        _check_inputs_dtype(X, self.missing_values)
        if X.dtype.kind not in ("i", "u", "f", "O"):
            raise ValueError("MissingIndicator does not support data with "
                             "dtype {0}. Please provide either a numeric array"
                             " (with a floating point or integer dtype) or "
                             "categorical data represented either as an array "
                             "with integer dtype or an array of string values "
                             "with an object dtype.".format(X.dtype))

        if sparse.issparse(X) and self.missing_values == 0:
            # missing_values = 0 not allowed with sparse data as it would
            # force densification
            raise ValueError("Sparse input with missing_values=0 is "
                             "not supported. Provide a dense "
                             "array instead.")

        return X
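The dtype gate above only lets integer, unsigned, float, or object arrays through, so string data must be supplied with an object dtype. A minimal standalone sketch of that check (the helper name is hypothetical and not part of the class):

import numpy as np

def _is_supported_dtype(X):
    # same kind test as above: signed/unsigned int, float, or object
    return X.dtype.kind in ("i", "u", "f", "O")

print(_is_supported_dtype(np.array([[1.0, np.nan]])))             # True (float)
print(_is_supported_dtype(np.array([["a", "b"]], dtype=object)))  # True (object)
print(_is_supported_dtype(np.array([["a", "b"]])))                # False (unicode dtype)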
Example 2
    def transform(self, X) -> SparseCumlArray:
        """Impute all missing values in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.
        """
        check_is_fitted(self)

        X = self._validate_input(X, in_fit=False)
        X_indicator = super()._transform_indicator(X)

        statistics = self.statistics_

        if X.shape[1] != statistics.shape[0]:
            raise ValueError("X has %d features per sample, expected %d" %
                             (X.shape[1], self.statistics_.shape[0]))

        # Delete the invalid columns if strategy is not constant
        if self.strategy == "constant":
            valid_statistics = statistics
        else:
            # same as np.isnan but also works for object dtypes
            invalid_mask = _get_mask(statistics, np.nan)
            valid_mask = np.logical_not(invalid_mask)
            valid_statistics = statistics[valid_mask]
            valid_statistics_indexes = np.flatnonzero(valid_mask)

            if invalid_mask.any():
                missing = np.arange(X.shape[1])[invalid_mask]
                if self.verbose:
                    warnings.warn("Deleting features without "
                                  "observed values: %s" % missing)
                X = X[:, valid_statistics_indexes]

        # Do actual imputation
        if sparse.issparse(X):
            if self.missing_values == 0:
                raise ValueError("Imputation not possible when missing_values "
                                 "== 0 and input is sparse. Provide a dense "
                                 "array instead.")
            else:
                mask = _get_mask(X.data, self.missing_values)
                indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=int),
                                    np.diff(X.indptr).tolist())[mask]

                X.data[mask] = valid_statistics[indexes].astype(X.dtype,
                                                                copy=False)
        else:
            mask = _get_mask(X, self.missing_values)
            if self.strategy == "constant":
                X[mask] = valid_statistics[0]
            else:
                for i, vi in enumerate(valid_statistics_indexes):
                    feature_idxs = np.flatnonzero(mask[:, vi])
                    X[feature_idxs, vi] = valid_statistics[i]

        X = super()._concatenate_indicator(X, X_indicator)
        return X
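The transform above follows scikit-learn's SimpleImputer semantics: the per-column statistics learned in fit replace the missing entries. A usage sketch against the upstream scikit-learn estimator, assuming equivalent behaviour here:

import numpy as np
from sklearn.impute import SimpleImputer

X_fit = np.array([[1.0, 2.0], [np.nan, 4.0], [3.0, 6.0]])
imp = SimpleImputer(strategy="mean").fit(X_fit)
print(imp.statistics_)                    # per-column means ignoring NaN: [2. 4.]
print(imp.transform([[np.nan, np.nan]]))  # missing entries replaced: [[2. 4.]]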
Example 3
def row_norms(X, squared=False):
    """Row-wise (squared) Euclidean norm of X.

    Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
    matrices.

    Performs no input validation.

    Parameters
    ----------
    X : array_like
        The input array
    squared : bool, optional (default = False)
        If True, return squared norms.

    Returns
    -------
    array_like
        The row-wise (squared) Euclidean norm of X.
    """
    if sparse.issparse(X):
        if isinstance(
                X, (sparse.csr_matrix, sparse.csc_matrix, sparse.coo_matrix)):
            X_copy = X.copy()
            X_copy.data = np.square(X_copy.data)
            norms = X_copy.sum(axis=1).squeeze()
        else:
            raise ValueError('Sparse matrix not compatible')
    else:
        norms = np.einsum('ij,ij->i', X, X)

    if not squared:
        np.sqrt(norms, norms)
    return norms
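A short usage sketch, assuming row_norms as defined above lives in a module that imports numpy as np and scipy.sparse as sparse:

import numpy as np
from scipy import sparse

X = np.array([[3.0, 4.0], [1.0, 2.0]])
print(row_norms(X))                     # dense einsum path: [5.  2.236...]
print(row_norms(sparse.csr_matrix(X)))  # sparse path squares X.data and sums per row
print(row_norms(X, squared=True))       # [25.  5.]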
Example 4
def row_norms(X, squared=False):
    """Row-wise (squared) Euclidean norm of X.

    Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
    matrices and does not create an X.shape-sized temporary.

    Performs no input validation.

    Parameters
    ----------
    X : array_like
        The input array
    squared : bool, optional (default = False)
        If True, return squared norms.

    Returns
    -------
    array_like
        The row-wise (squared) Euclidean norm of X.
    """
    if sparse.issparse(X):
        if not isinstance(X, sparse.csr_matrix):
            X = sparse.csr_matrix(X)
        # squared row norms of the CSR matrix: square the stored values and
        # sum them per row (equivalent to the csr_row_norms helper that was
        # commented out here)
        norms = np.asarray(X.multiply(X).sum(axis=1)).ravel()
    else:
        norms = np.einsum('ij,ij->i', X, X)

    if not squared:
        np.sqrt(norms, norms)
    return norms
Example 5
def to_output_type(array, output_type, order='F'):
    """Used to convert arrays while creating datasets
    for testing.

    Parameters
    ----------
    array : array
        Input array to convert
    output_type : string
        Type to convert to
    order : string (default='F')
        Memory order passed to input_to_cuml_array when building dense outputs

    Returns
    -------
    Converted array
    """
    if output_type == 'scipy_csr':
        return cpu_sparse.csr_matrix(array.get())
    if output_type == 'scipy_csc':
        return cpu_sparse.csc_matrix(array.get())
    if output_type == 'scipy_coo':
        return cpu_sparse.coo_matrix(array.get())
    if output_type == 'cupy_csr':
        if array.format in ['csc', 'coo']:
            return array.tocsr()
        else:
            return array
    if output_type == 'cupy_csc':
        if array.format in ['csr', 'coo']:
            return array.tocsc()
        else:
            return array
    if output_type == 'cupy_coo':
        if array.format in ['csr', 'csc']:
            return array.tocoo()
        else:
            return array

    if cpu_sparse.issparse(array):
        if output_type == 'numpy':
            return array.todense()
        elif output_type == 'cupy':
            return cp.array(array.todense())
        else:
            array = array.todense()
    elif gpu_sparse.issparse(array):
        if output_type == 'numpy':
            return cp.asnumpy(array.todense())
        elif output_type == 'cupy':
            return array.todense()
        else:
            array = array.todense()

    cuml_array = input_to_cuml_array(array, order=order)[0]
    if output_type == 'series' and len(array.shape) > 1:
        output_type = 'cudf'

    return cuml_array.to_output(output_type)
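A usage sketch, assuming a CUDA-capable environment with cupy and cuml installed, and that the module-level aliases used above resolve to cpu_sparse = scipy.sparse, gpu_sparse = cupyx.scipy.sparse and cp = cupy:

import cupy as cp
import cupyx.scipy.sparse as gpu_sparse

X_gpu = gpu_sparse.random(5, 3, density=0.4, format='csr', dtype=cp.float32)
X_scipy = to_output_type(X_gpu, 'scipy_csr')   # copied to host via array.get()
X_numpy = to_output_type(X_gpu, 'numpy')       # densified, then moved to host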
Example 6
    def _get_missing_features_info(self, X):
        """Compute the imputer mask and the indices of the features
        containing missing values.

        Parameters
        ----------
        X : {ndarray or sparse matrix}, shape (n_samples, n_features)
            The input data with missing values. Note that ``X`` has been
            checked in ``fit`` and ``transform`` before calling this function.

        Returns
        -------
        imputer_mask : {ndarray or sparse matrix}, shape \
        (n_samples, n_features)
            The imputer mask of the original data.

        features_with_missing : ndarray, shape (n_features_with_missing)
            The features containing missing values.

        """
        if sparse.issparse(X):
            mask = _get_mask(X.data, self.missing_values)

            # The imputer mask will be constructed with the same sparse format
            # as X.
            sparse_constructor = (sparse.csr_matrix
                                  if X.format == 'csr' else sparse.csc_matrix)
            imputer_mask = sparse_constructor(
                (mask, X.indices.copy(), X.indptr.copy()),
                shape=X.shape,
                dtype=np.float32)
            # temporarily use float32 here, as CuPy sparse matrices cannot
            # operate on bool data for now

            if self.features == 'missing-only':
                n_missing = imputer_mask.sum(axis=0)

            if self.sparse is False:
                imputer_mask = imputer_mask.toarray()
            elif imputer_mask.format == 'csr':
                imputer_mask = imputer_mask.tocsc()
        else:
            imputer_mask = _get_mask(X, self.missing_values)

            if self.features == 'missing-only':
                n_missing = imputer_mask.sum(axis=0)

            if self.sparse is True:
                imputer_mask = sparse.csc_matrix(imputer_mask)

        if self.features == 'all':
            features_indices = np.arange(X.shape[1])
        else:
            features_indices = np.flatnonzero(n_missing)

        return imputer_mask, features_indices
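The helper computes the boolean imputer mask together with the column indices that actually contain missing values. A usage sketch through the public MissingIndicator estimator in scikit-learn, which this code mirrors, assuming equivalent behaviour:

import numpy as np
from sklearn.impute import MissingIndicator

X = np.array([[np.nan, 1.0, 3.0],
              [4.0,    0.0, np.nan],
              [8.0,    1.0, 0.0]])
ind = MissingIndicator(features='missing-only')
mask = ind.fit_transform(X)
print(ind.features_)   # columns with at least one missing value: [0 2]
print(mask)            # (3, 2) boolean mask restricted to those columns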
Example 7
    def fit(self, X, y=None) -> "SimpleImputer":
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : SimpleImputer
        """

        if type(X) is list:
            X = np.asarray(X)

        X = self._validate_input(X, in_fit=True)
        super()._fit_indicator(X)

        # default fill_value is 0 for numerical input and "missing_value"
        # otherwise
        if self.fill_value is None:
            if X.dtype.kind in ("i", "u", "f"):
                fill_value = 0
            else:
                fill_value = "missing_value"
        else:
            fill_value = self.fill_value

        # fill_value should be numerical in case of numerical input
        if (self.strategy == "constant" and X.dtype.kind in ("i", "u", "f")
                and not isinstance(fill_value, numbers.Real)):
            raise ValueError("'fill_value'={0} is invalid. Expected a "
                             "numerical value when imputing numerical "
                             "data".format(fill_value))

        if sparse.issparse(X):
            # missing_values = 0 not allowed with sparse data as it would
            # force densification
            if self.missing_values == 0:
                raise ValueError("Imputation not possible when missing_values "
                                 "== 0 and input is sparse. Provide a dense "
                                 "array instead.")
            else:
                self.statistics_ = self._sparse_fit(X, self.strategy,
                                                    self.missing_values,
                                                    fill_value)
        else:
            self.statistics_ = self._dense_fit(X, self.strategy,
                                               self.missing_values, fill_value)
        return self
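The fill_value defaulting above means that with strategy='constant' and no explicit fill_value, numeric columns are filled with 0 and non-numeric ones with the string "missing_value". A sketch of the numeric case against the upstream scikit-learn estimator, assuming the same defaulting rule:

import numpy as np
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy="constant")   # no fill_value given
print(imp.fit_transform(np.array([[np.nan, 2.0],
                                  [3.0, np.nan]])))
# numeric input -> missing entries filled with the default constant 0:
# [[0. 2.]
#  [3. 0.]]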
Example 8
    def _concatenate_indicator(self, X_imputed, X_indicator):
        """Concatenate indicator mask with the imputed data."""
        if not self.add_indicator:
            return X_imputed

        hstack = sparse.hstack if sparse.issparse(X_imputed) else np.hstack
        if X_indicator is None:
            raise ValueError(
                "Data from the missing indicator are not provided. Call "
                "_fit_indicator and _transform_indicator in the imputer "
                "implementation.")

        return hstack((X_imputed, X_indicator))
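This helper is what appends the missing-value indicator columns when add_indicator=True. A usage sketch via scikit-learn's public SimpleImputer, assuming equivalent behaviour:

import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, np.nan], [np.nan, 3.0], [2.0, 5.0]])
imp = SimpleImputer(strategy="mean", add_indicator=True)
Xt = imp.fit_transform(X)
print(Xt.shape)   # (3, 4): two imputed columns followed by two indicator columns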
Example 9
def issparse(x):
    if cps and cps.issparse(x):
        # is cupy.sparse
        return True
    if sps and sps.issparse(x):
        # is scipy.sparse
        return True
    if np and isinstance(x, np.ndarray):
        return False
    if cp and isinstance(x, cp.ndarray):
        return False

    from .array import SparseNDArray
    return isinstance(x, SparseNDArray)
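A short usage sketch, assuming the module-level aliases used above are set (sps = scipy.sparse, np = numpy, with cps/cp possibly None when CuPy is absent):

import numpy as np
import scipy.sparse as sps

print(issparse(sps.eye(3, format='csr')))   # True: scipy sparse matrix
print(issparse(np.eye(3)))                  # False: dense ndarray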
Example 10
def to_output_type(array, output_type, order='F'):
    if output_type == 'scipy_csr':
        return cpu_sparse.csr_matrix(array.get())
    if output_type == 'scipy_csc':
        return cpu_sparse.csc_matrix(array.get())
    if output_type == 'scipy_coo':
        return cpu_sparse.coo_matrix(array.get())
    if output_type == 'cupy_csr':
        if array.format in ['csc', 'coo']:
            return array.tocsr()
        else:
            return array
    if output_type == 'cupy_csc':
        if array.format in ['csr', 'coo']:
            return array.tocsc()
        else:
            return array
    if output_type == 'cupy_coo':
        if array.format in ['csr', 'csc']:
            return array.tocoo()
        else:
            return array

    if cpu_sparse.issparse(array):
        if output_type == 'numpy':
            return array.todense()
        elif output_type == 'cupy':
            return cp.array(array.todense())
        else:
            array = array.todense()
    elif gpu_sparse.issparse(array):
        if output_type == 'numpy':
            return cp.asnumpy(array.todense())
        elif output_type == 'cupy':
            return array.todense()
        else:
            array = array.todense()

    cuml_array = input_to_cuml_array(array, order=order)[0]
    if output_type == 'series' and len(array.shape) > 1:
        output_type = 'cudf'

    return cuml_array.to_output(output_type)
Example 11
def check_sparse(array, accept_sparse=False, accept_large_sparse=True):
    """Checks that the sparse array is valid

    Parameters
    ----------
    array : object
        Input sparse array to check
    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.
    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

    Returns
    -------
    None, or raises a ValueError if the sparse input is not accepted
    """
    if accept_sparse is True:
        return

    err_msg = "This algorithm does not support the sparse " + \
              "input in the current configuration."

    is_sparse = cpu_sparse.issparse(array) or gpu_sparse.issparse(array)
    if is_sparse:
        if accept_sparse is False:
            raise ValueError(err_msg)

        if not accept_large_sparse:
            if array.indices.dtype != cp.int32 or \
               array.indptr.dtype != cp.int32:
                raise ValueError(err_msg)

        if isinstance(accept_sparse, (tuple, list)):
            if array.format not in accept_sparse:
                raise ValueError(err_msg)
        elif array.format != accept_sparse:
            raise ValueError(err_msg)
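A usage sketch, assuming scipy is importable as the cpu_sparse alias used above and that the GPU-side checks (gpu_sparse, cp) are also available:

import scipy.sparse as cpu_sparse

X = cpu_sparse.random(10, 4, density=0.3, format='csr')
check_sparse(X, accept_sparse=['csr', 'csc'])   # accepted: format is in the list
try:
    check_sparse(X, accept_sparse=False)        # sparse input rejected
except ValueError as err:
    print(err)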
Example 12
def check_array(array,
                accept_sparse=False,
                accept_large_sparse=True,
                dtype='numeric',
                order=None,
                copy=False,
                force_all_finite=True,
                ensure_2d=True,
                allow_nd=False,
                ensure_min_samples=1,
                ensure_min_features=1,
                warn_on_dtype=None,
                estimator=None):
    """Input validation on an array, list, sparse matrix or similar.
    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the array is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.
    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.
    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.
    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.
    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output array; otherwise (copy=True)
        the memory layout of the returned array is kept as close as possible
        to the original array.
    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.
    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:
        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.
    ensure_2d : boolean (default=True)
        Whether to raise a value error if array is not 2D.
    allow_nd : boolean (default=False)
        Whether to allow array.ndim > 2.
    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.
    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.
    warn_on_dtype : unused parameter
    estimator : unused parameter

    Returns
    -------
    array_converted : object
        The converted and validated array.
    """

    if dtype == 'numeric':
        dtype = numeric_types

    correct_dtype = check_dtype(array, dtype)

    if copy and not order and hasattr(array, 'flags'):
        if array.flags['F_CONTIGUOUS']:
            order = 'F'
        elif array.flags['C_CONTIGUOUS']:
            order = 'C'

    if not order:
        order = 'F'

    hasshape = hasattr(array, 'shape')
    if ensure_2d and hasshape:
        if len(array.shape) != 2:
            raise ValueError("Not 2D")

    if not allow_nd and hasshape:
        if len(array.shape) > 2:
            raise ValueError("More than 2 dimensions detected")

    if ensure_min_samples > 0 and hasshape:
        if array.shape[0] < ensure_min_samples:
            raise ValueError("Not enough samples")

    if ensure_min_features > 0 and hasshape and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) while"
                             " a minimum of %d is required." %
                             (n_features, array.shape, ensure_min_features))

    is_sparse = cpu_sparse.issparse(array) or gpu_sparse.issparse(array)
    if is_sparse:
        check_sparse(array, accept_sparse, accept_large_sparse)
        if array.format == 'csr':
            new_array = gpu_csr_matrix(array, copy=copy)
        elif array.format == 'csc':
            new_array = gpu_csc_matrix(array, copy=copy)
        elif array.format == 'coo':
            new_array = gpu_coo_matrix(array, copy=copy)
        else:
            raise ValueError('Sparse matrix format not supported')
        check_finite(new_array.data, force_all_finite)
        if correct_dtype != new_array.dtype:
            new_array = new_array.astype(correct_dtype)
        return new_array
    else:
        X, n_rows, n_cols, dtype = input_to_cupy_array(array,
                                                       order=order,
                                                       deepcopy=copy,
                                                       fail_on_null=False)
        if correct_dtype != dtype:
            X = X.astype(correct_dtype)
        check_finite(X, force_all_finite)
        return X
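A usage sketch, assuming a CUDA-capable environment where cuml and cupy are installed so that the GPU helpers referenced above (check_dtype, input_to_cupy_array, gpu_csr_matrix, check_finite, ...) resolve:

import cupy as cp

X = cp.array([[1.0, 2.0], [3.0, 4.0]])
X_checked = check_array(X, dtype='numeric', order='F')   # returned as a cupy array

try:
    check_array(cp.arange(5.0))   # 1D input fails the ensure_2d check
except ValueError as err:
    print(err)                    # "Not 2D"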