Esempio n. 1
0
    def fit(self, y):
        """
        Fit label binarizer

        Parameters
        ----------
        y : array of shape [n_samples,] or [n_samples, n_classes]
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If `y` has more than 2 dimensions, or if `y` is 2-d and
            contains a value other than 0 or 1.
        """

        self._set_output_type(y)

        if y.ndim > 2:
            raise ValueError("labels cannot be greater than 2 dimensions")

        if y.ndim == 2:

            unique_classes = cp.unique(y)

            # Reduce the element-wise membership test to a single Python
            # bool. Comparing the array against the list [0, 1] directly is
            # not a scalar condition and would also reject matrices that
            # happen to contain only zeros or only ones.
            only_zero_or_one = (unique_classes == 0) | (unique_classes == 1)
            if not bool(cp.all(only_zero_or_one)):
                raise ValueError("2-d array must be binary")

            # In the multilabel case the classes are the column indices.
            self._classes_ = CumlArray(cp.arange(0, y.shape[1]))
        else:
            self._classes_ = CumlArray(cp.unique(y).astype(y.dtype))

        cp.cuda.Stream.null.synchronize()

        return self
Esempio n. 2
0
    def _count(self, X, Y):
        """
        Accumulate feature counts and class counts for one batch and add
        them to the totals already stored on the model.

        Parameters
        ----------
        X : cupy.ndarray or cupyx.scipy.sparse matrix of size
                  (n_rows, n_features)
        Y : cupy.array of monotonic class labels

        Raises
        ------
        ValueError
            If `X` is not 2-dimensional.
        """

        if X.ndim != 2:
            raise ValueError("Input samples should be a 2D array")

        labels_dtype = self.classes_.dtype

        # NOTE(review): the message announces a conversion of Y, but no
        # conversion happens inside this method - confirm against callers.
        if Y.dtype != labels_dtype:
            warnings.warn("Y dtype does not match classes_ dtype. Y will be "
                          "converted, which will increase memory consumption")

        n_rows, n_cols = X.shape[0], X.shape[1]
        n_classes = self._n_classes_
        tpb = 32  # threads per block along each launched dimension

        # Per-batch accumulators filled in by the kernels below.
        batch_feature_counts = cp.zeros((n_classes, self._n_features_),
                                        order="F",
                                        dtype=X.dtype)
        batch_class_counts = cp.zeros(n_classes, order="F", dtype=X.dtype)

        if cupyx.scipy.sparse.isspmatrix(X):
            X = X.tocoo()

            coo_kernel = count_features_coo_kernel(X.dtype, labels_dtype)
            coo_kernel((math.ceil(X.nnz / tpb), ), (tpb, ),
                       (batch_feature_counts, X.row, X.col, X.data, X.nnz,
                        n_rows, n_cols, Y, n_classes, False))

        else:

            dense_kernel = count_features_dense_kernel(X.dtype, labels_dtype)
            grid = (math.ceil(n_rows / tpb), math.ceil(n_cols / tpb), 1)
            block = (tpb, tpb, 1)
            dense_kernel(grid, block,
                         (batch_feature_counts, X, n_rows, n_cols, Y,
                          n_classes, False, X.flags["C_CONTIGUOUS"]))

        class_kernel = count_classes_kernel(X.dtype, labels_dtype)
        class_kernel((math.ceil(n_rows / tpb), ), (tpb, ),
                     (batch_class_counts, n_rows, Y))

        # Fold the batch results into the running totals.
        self._feature_count_ = CumlArray(
            self._feature_count_ + batch_feature_counts)
        self._class_count_ = CumlArray(
            self._class_count_ + batch_class_counts)
Esempio n. 3
0
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `to_dtype`.

    Overflow checking is only done for NumPy inputs converted to a
    floating-point (or complex) dtype.
    Todo: support other dtypes if needed.

    Parameters
    ----------
    X : numpy.ndarray, cudf/pandas Series or DataFrame, or any object
        accepted by `cuda.is_cuda_array`
    to_dtype : dtype to convert to (default: np.float32)
    legacy : bool (default: True)
        Only used for CUDA array inputs: if True the result is returned
        through `cuda.as_cuda_array`, otherwise wrapped in a CumlArray.

    Returns
    -------
    The converted data. A NumPy array that already has dtype `to_dtype` is
    returned as-is.

    Raises
    ------
    TypeError
        If a NumPy conversion turned a finite value into +/-inf, or if the
        type of X is not supported.
    """

    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            # Overflow is detected explicitly right below, so the NumPy
            # overflow warning would only be noise.
            with np.errstate(over="ignore"):
                X_m = X.astype(to_dtype)

            # Data loss means the *converted* array gained infinities that
            # the input did not have. Infinities already present in X are
            # legitimate values and must not be reported.
            if X_m.dtype.kind in "fc":
                overflowed = np.isinf(X_m)
                if X.dtype.kind in "fc":
                    overflowed = overflowed & ~np.isinf(X)
                if overflowed.any():
                    raise TypeError("Data type conversion resulted "
                                    "in data loss.")
            return X_m

    elif isinstance(X, (cudf.Series, cudf.DataFrame, pd.Series, pd.DataFrame)):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        X_m = rmm_cupy_ary(cp.asarray, X)
        X_m = X_m.astype(to_dtype)
        if legacy:
            return cuda.as_cuda_array(X_m)
        else:
            return CumlArray(data=X_m)

    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    # NumPy input that already had the requested dtype.
    return X
Esempio n. 4
0
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Cast X to `to_dtype`.

    The cast is refused with a TypeError when
    `_typecast_will_lose_information` reports that it would lose
    information, and also when the type of X is not one of the supported
    containers (NumPy array, cudf/pandas Series or DataFrame, CUDA array).

    For CUDA array inputs, `legacy` selects the return container: a numba
    CUDA array when True, a CumlArray when False.
    """
    if _typecast_will_lose_information(X, to_dtype):
        raise TypeError("Data type conversion would lose information.")

    # Host NumPy data: only cast when the dtype actually differs.
    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            return X.astype(to_dtype)
        return X

    dataframe_like = (cudf.Series, cudf.DataFrame, pd.Series, pd.DataFrame)
    if isinstance(X, dataframe_like):
        return X.astype(to_dtype, copy=False)

    if not cuda.is_cuda_array(X):
        raise TypeError("Received unsupported input type: %s" % type(X))

    # Generic device array: go through CuPy for the cast.
    device_ary = cp.asarray(X).astype(to_dtype, copy=False)
    if legacy:
        return cuda.as_cuda_array(device_ary)
    return CumlArray(data=device_ary)
Esempio n. 5
0
def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
    """
    Build a random (nrows, ncols) matrix in the requested container type,
    together with a NumPy copy to compare against.

    Parameters
    ----------
    type : str
        One of 'numpy', 'cupy', 'numba', 'cudf', 'pandas' or 'cuml'.
    nrows, ncols : int
        Shape of the generated matrix.
    dtype : dtype of the generated matrix
    order : memory order passed to the array constructors (default: 'C')
    out_dtype : dtype or False (default: False)
        When set, the NumPy copy is cast to this dtype.

    Returns
    -------
    (result, expected) : the matrix in the requested container and the
        NumPy copy of the same data.

    Raises
    ------
    ValueError
        If `type` is not one of the supported container names.
    """
    supported = ('numpy', 'cupy', 'numba', 'cudf', 'pandas', 'cuml')
    if type not in supported:
        # Fail early with a clear message instead of an UnboundLocalError
        # at the return statement.
        raise ValueError("Unsupported input type %r, expected one of %s"
                         % (type, ", ".join(supported)))

    rand_mat = (cp.random.rand(nrows, ncols) * 10)
    rand_mat = cp.array(rand_mat, dtype=dtype, order=order)

    # Single host copy, reused for the containers that need it and for the
    # expected output.
    host_mat = cp.asnumpy(rand_mat)

    if type == 'numpy':
        result = np.array(host_mat, order=order)
    elif type == 'cupy':
        result = rand_mat
    elif type == 'numba':
        result = nbcuda.as_cuda_array(rand_mat)
    elif type == 'cudf':
        result = cudf.DataFrame(rand_mat)
    elif type == 'pandas':
        result = pdDF(host_mat)
    else:  # 'cuml'
        result = CumlArray(data=rand_mat)

    if out_dtype:
        return result, np.array(host_mat.astype(out_dtype), order=order)
    return result, np.array(host_mat, order=order)
Esempio n. 6
0
    def predict(self, X):
        """
        Perform classification on an array of test vectors X.

        Each row is assigned the class whose joint log likelihood is the
        largest. The result is returned in the output type resolved by
        `_get_output_type` for X.
        """
        out_type = self._get_output_type(X)

        # Fall back to a stub that always answers False when SciPy is not
        # installed.
        if has_scipy():
            from scipy.sparse import isspmatrix as is_scipy_sparse
        else:
            from cuml.common.import_utils import \
                dummy_function_always_false as is_scipy_sparse

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if is_scipy_sparse(X) or cupyx.scipy.sparse.isspmatrix(X):
            # Rebuild the matrix as a cupyx COO matrix with its pieces
            # passed through cp.asarray.
            coo = X.tocoo()
            row_idx = cp.asarray(coo.row, dtype=coo.row.dtype)
            col_idx = cp.asarray(coo.col, dtype=coo.col.dtype)
            values = cp.asarray(coo.data, dtype=coo.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                              shape=coo.shape)
        else:
            X = input_to_cuml_array(X, order='K').array.to_output('cupy')

        best_class = cp.argmax(self._joint_log_likelihood(X), axis=1)
        best_class = best_class.astype(self.classes_.dtype)

        # Map positional indices back to the original label values.
        labels = invert_labels(best_class, classes=self.classes_)
        return CumlArray(data=labels).to_output(out_type)
Esempio n. 7
0
def get_input(type, nrows, ncols, dtype, order='C', out_dtype=False):
    """
    Create a random (nrows, ncols) matrix as the container named by `type`
    ('numpy', 'cupy', 'numba', 'cudf', 'pandas' or 'cuml') and return it
    together with a NumPy copy of the same data.

    When `out_dtype` is set, the NumPy copy is cast to that dtype.
    """
    device_mat = cp.array(cp.random.rand(nrows, ncols) * 10, order=order)
    rand_mat = device_mat.astype(dtype)

    if type == 'numpy':
        result = np.array(cp.asnumpy(rand_mat), order=order)

    elif type == 'cupy':
        result = rand_mat

    elif type == 'numba':
        result = nbcuda.as_cuda_array(rand_mat)

    elif type == 'cudf' or type == 'pandas':
        # Both go through a cuDF DataFrame built from the device matrix;
        # pandas is one extra conversion on top of that.
        frame = cudf.DataFrame()
        result = frame.from_gpu_matrix(nbcuda.as_cuda_array(rand_mat))
        if type == 'pandas':
            result = result.to_pandas()

    elif type == 'cuml':
        cuml_order = None if order == 'K' else order
        result = CumlArray(data=rand_mat,
                           dtype=dtype,
                           shape=rand_mat.shape,
                           order=cuml_order)

    expected = cp.asnumpy(rand_mat)
    if out_dtype:
        expected = expected.astype(out_dtype)
    return result, np.array(expected, order=order)
Esempio n. 8
0
    def predict_proba(self, X):
        """
        Return probability estimates for the test vector X.

        The probabilities are the exponential of the values returned by
        `predict_log_proba`, delivered in the output type resolved for X.
        """
        out_type = self._get_output_type(X)
        log_proba = self.predict_log_proba(X)
        proba = CumlArray(cp.exp(log_proba))
        return proba.to_output(out_type)
Esempio n. 9
0
    def _update_class_log_prior(self, class_prior=None):
        """
        Recompute `_class_log_prior_`.

        Three cases, in priority order:

        * `class_prior` is given: its element-wise log is used.
        * `self.fit_prior` is truthy: the log of the empirical class
          frequencies, computed from `_class_count_`.
        * otherwise: a uniform prior, -log(n_classes) for every class.

        In every case the result is stored as a CumlArray.

        Parameters
        ----------
        class_prior : array with one entry per class, or None

        Raises
        ------
        ValueError
            If `class_prior` does not have `_n_classes_` entries.
        """

        if class_prior is not None:

            if class_prior.shape[0] != self._n_classes_:
                raise ValueError("Number of classes must match "
                                 "number of priors")

            # Wrapped like the other branches so the attribute has the
            # same type no matter which branch assigned it.
            self._class_log_prior_ = CumlArray(cp.log(class_prior))

        elif self.fit_prior:
            # Convert once and reuse for both the per-class and total logs.
            class_count = cp.asarray(self._class_count_)
            self._class_log_prior_ = CumlArray(
                cp.log(class_count) - cp.log(class_count.sum()))
        else:
            self._class_log_prior_ = CumlArray(
                cp.full(self._n_classes_, -1 * math.log(self._n_classes_)))
Esempio n. 10
0
    def _update_feature_log_prob(self, alpha):
        """
        Smooth the raw feature counts by adding `alpha` and recompute the
        log probabilities stored in `_feature_log_prob_`.

        Parameters
        ----------

        alpha : float amount of smoothing to apply (0. means no smoothing)
        """
        counts = cp.asarray(self._feature_count_) + alpha

        # Row totals kept as a column vector so they broadcast per row.
        row_totals = counts.sum(axis=1).reshape(-1, 1)

        log_prob = cp.log(counts) - cp.log(row_totals)
        self._feature_log_prob_ = CumlArray(log_prob)
Esempio n. 11
0
    def _partial_fit(self, X, y, sample_weight=None, _classes=None):
        """
        Update the model with one batch of samples.

        X is brought to a cupyx COO matrix (sparse input) or a CuPy array
        (anything else), y is made monotonic, and the counts and log
        probabilities are updated. On the first call the classes and
        counters are initialised, from `_classes` when given and otherwise
        from the labels found in y.

        `sample_weight` is accepted but not read by this method.

        Returns
        -------
        self
        """
        self._set_output_type(X)

        # Fall back to a stub that always answers False when SciPy is not
        # installed.
        if has_scipy():
            from scipy.sparse import isspmatrix as is_scipy_sparse
        else:
            from cuml.common.import_utils import \
                dummy_function_always_false as is_scipy_sparse

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if is_scipy_sparse(X) or cupyx.scipy.sparse.isspmatrix(X):
            coo = X.tocoo()
            row_idx = cp.asarray(coo.row, dtype=coo.row.dtype)
            col_idx = cp.asarray(coo.col, dtype=coo.col.dtype)
            values = cp.asarray(coo.data, dtype=coo.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                              shape=coo.shape)
        else:
            X = input_to_cuml_array(X, order='K').array.to_output('cupy')

        y = input_to_cuml_array(y).array.to_output('cupy')

        Y, label_classes = make_monotonic(y, copy=True)

        if self.fit_called_:
            # Later batches are only validated against the known classes.
            check_labels(Y, self._classes_)
        else:
            self.fit_called_ = True

            if _classes is None:
                self._classes_ = CumlArray(data=label_classes)
            else:
                _classes = input_to_cuml_array(_classes, order='K').array
                check_labels(Y, _classes.to_output('cupy'))
                self._classes_ = _classes

            self._n_classes_ = self.classes_.shape[0]
            self._n_features_ = X.shape[1]
            self._init_counters(self._n_classes_, self._n_features_, X.dtype)

        self._count(X, Y)

        self._update_feature_log_prob(self.alpha)
        self._update_class_log_prior(class_prior=self._class_prior_)

        return self
Esempio n. 12
0
    def predict_log_proba(self, X):
        """
        Return log-probability estimates for the test vector X.

        The joint log likelihood of every row is normalised by
        log P(X), which is obtained with a log-sum-exp over the classes.
        """
        out_type = self._get_output_type(X)

        # Fall back to a stub that always answers False when SciPy is not
        # installed.
        if has_scipy():
            from scipy.sparse import isspmatrix as is_scipy_sparse
        else:
            from cuml.common.import_utils import \
                dummy_function_always_false as is_scipy_sparse

        # todo: use a sparse CumlArray style approach when ready
        # https://github.com/rapidsai/cuml/issues/2216
        if is_scipy_sparse(X) or cupyx.scipy.sparse.isspmatrix(X):
            coo = X.tocoo()
            row_idx = cp.asarray(coo.row, dtype=coo.row.dtype)
            col_idx = cp.asarray(coo.col, dtype=coo.col.dtype)
            values = cp.asarray(coo.data, dtype=coo.data.dtype)
            X = cupyx.scipy.sparse.coo_matrix((values, (row_idx, col_idx)),
                                              shape=coo.shape)
        else:
            X = input_to_cuml_array(X, order='K').array.to_output('cupy')

        jll = self._joint_log_likelihood(X)

        # log P(X) = log(sum(exp(jll))) per row. The row maximum is
        # subtracted inside exp so the exponentials cannot blow up to inf,
        # and added back afterwards.
        row_max = cp.amax(jll, axis=1, keepdims=True)
        shifted_sum = cp.sum(cp.exp(jll - row_max), axis=1)
        log_prob_x = cp.squeeze(row_max, axis=1) + cp.log(shifted_sum)

        # Make it a row vector so its transpose broadcasts against jll.
        if log_prob_x.ndim < 2:
            log_prob_x = log_prob_x.reshape((1, log_prob_x.shape[0]))

        normalized = jll - log_prob_x.T
        return CumlArray(normalized).to_output(out_type)
Esempio n. 13
0
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `to_dtype`.

    Overflow checking is only done for NumPy inputs converted to a
    floating-point (or complex) dtype.
    Todo: support other dtypes if needed.

    Parameters
    ----------
    X : numpy.ndarray, cudf Series or DataFrame, or any object accepted by
        `cuda.is_cuda_array`
    to_dtype : dtype to convert to (default: np.float32)
    legacy : bool (default: True)
        Only used for CUDA array inputs: if True the result is returned
        through `cuda.as_cuda_array`, otherwise wrapped in a CumlArray.

    Returns
    -------
    The converted data. A NumPy array that already has dtype `to_dtype` is
    returned as-is.

    Raises
    ------
    TypeError
        If a NumPy conversion turned a finite value into +/-inf, or if the
        type of X is not supported.
    """

    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            # Overflow is detected explicitly right below, so the NumPy
            # overflow warning would only be noise.
            with np.errstate(over="ignore"):
                X_m = X.astype(to_dtype)

            # Data loss means the *converted* array gained infinities that
            # the input did not have. Infinities already present in X are
            # legitimate values and must not be reported.
            if X_m.dtype.kind in "fc":
                overflowed = np.isinf(X_m)
                if X.dtype.kind in "fc":
                    overflowed = overflowed & ~np.isinf(X)
                if overflowed.any():
                    raise TypeError("Data type conversion resulted "
                                    "in data loss.")
            return X_m

    elif isinstance(X, (cudf.Series, cudf.DataFrame)):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        X_m = rmm_cupy_ary(cp.asarray, X)
        X_m = X_m.astype(to_dtype)
        if legacy:
            return cuda.as_cuda_array(X_m)
        else:
            # temporarily importing here, until github issue #1681
            # reorganizing utils is dealt with. Otherwise circular import
            # causes issues. Kept local to the only branch that needs it.
            from cuml.common import CumlArray
            return CumlArray(data=X_m)

    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    # NumPy input that already had the requested dtype.
    return X
Esempio n. 14
0
def input_to_cuml_array(X,
                        order='F',
                        deepcopy=False,
                        check_dtype=False,
                        convert_to_dtype=False,
                        check_cols=False,
                        check_rows=False,
                        fail_on_order=False):
    """
    Convert input X to CumlArray.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True
    * CumlArray - returns a reference unless deepcopy=True
    * pandas DataFrame or Series - converted through NumPy and CuPy first

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: 'F', 'C' or 'K' (default: 'F')
        Whether to return a F-major ('F'),  C-major ('C') array or Keep ('K')
        the order of X. Used to check the order of the input. If
        fail_on_order=True, the method will raise ValueError,
        otherwise it will convert X to be of order `order` if needed.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype (default: False)
        Set to a np.dtype (or a list of them) to throw an error if X is
        not of dtype `check_dtype`. Ignored when `convert_to_dtype` is set.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    Returns
    -------
    `cuml_array`: namedtuple('cuml_array', 'array n_rows n_cols dtype')

        A new CumlArray and associated data.

    Raises
    ------
    TypeError
        If the format of X is not supported, or its dtype is not in
        `check_dtype`.
    ValueError
        If a cuDF Series contains nulls, if the row/column count checks
        fail, or if the order differs and `fail_on_order` is True.

    """

    # dtype conversion

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        # The dtype is now known, so a dtype check would be redundant.
        check_dtype = False

    # format conversion

    if (isinstance(X, cudf.Series)):
        if X.null_count != 0:
            raise ValueError("Error: cuDF Series has missing/null values, " +
                             " which are not supported by cuML.")

    # converting pandas to numpy before sending it to CumlArray
    if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
        # pandas doesn't support custom order in to_numpy
        X = cp.asarray(X.to_numpy(copy=False), order=order)

    if isinstance(X, cudf.DataFrame):
        # 'K' has no meaning for a DataFrame, so fall back to F-major.
        if order == 'K':
            X_m = CumlArray(data=X.as_gpu_matrix(order='F'))
        else:
            X_m = CumlArray(data=X.as_gpu_matrix(order=order))

    elif isinstance(X, CumlArray):
        # Honour `deepcopy` here as well, so that the caller never ends up
        # sharing memory with X after asking for a copy.
        X_m = copy.deepcopy(X) if deepcopy else X

    elif hasattr(X, "__array_interface__") or \
            hasattr(X, "__cuda_array_interface__"):
        X_m = CumlArray(data=X)

        if deepcopy:
            X_m = copy.deepcopy(X_m)

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    if check_dtype:
        if not isinstance(check_dtype, list):
            check_dtype = [check_dtype]

        check_dtype = [np.dtype(dtype) for dtype in check_dtype]

        if X_m.dtype not in check_dtype:
            type_str = X_m.dtype
            # Drop the reference to the converted array before raising.
            del X_m
            raise TypeError("Expected input to be of type in " +
                            str(check_dtype) + " but got " + str(type_str))

    # Checks based on parameters

    n_rows = X_m.shape[0]

    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    # Order is irrelevant for a single row or a single column.
    if n_cols == 1 or n_rows == 1:
        order = 'K'

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) + " rows but got " +
                             str(n_rows) + " rows.")

    if order != 'K' and X_m.order != order:
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            X_m = rmm_cupy_ary(cp.array, X_m, copy=False, order=order)
            X_m = CumlArray(data=X_m)

    return cuml_array(array=X_m, n_rows=n_rows, n_cols=n_cols, dtype=X_m.dtype)
Esempio n. 15
0
 def _init_counters(self, n_effective_classes, n_features, dtype):
     """
     Allocate the zero-filled class and feature count accumulators.

     `_class_count_` gets one entry per class, `_feature_count_` one entry
     per (class, feature) pair; both use F order and the given dtype.
     """
     zeros_kwargs = dict(order="F", dtype=dtype)
     feature_shape = (n_effective_classes, n_features)

     self._class_count_ = CumlArray.zeros(n_effective_classes,
                                          **zeros_kwargs)
     self._feature_count_ = CumlArray.zeros(feature_shape, **zeros_kwargs)