Exemple #1
0
def test_data_generator_types(input_type):
    """Check that ``datagen.gen_data`` returns X in the requested container.

    ``input_type`` selects the container; any value that is not one of the
    known labels below fails the test.
    """
    X, *_ = datagen.gen_data('blobs', input_type, n_samples=100, n_features=10)

    # Both device-array flavours are only required to be CUDA arrays.
    if input_type in ('gpuarray', 'gpuarray-c'):
        assert cuda.is_cuda_array(X)
        return

    # The remaining flavours each map to one concrete class.
    expected_classes = (
        ('numpy', np.ndarray),
        ('cudf', cudf.DataFrame),
        ('pandas', pd.DataFrame),
    )
    for label, expected_cls in expected_classes:
        if input_type == label:
            assert isinstance(X, expected_cls)
            return

    # Unknown input type.
    assert False
 def test_as_cuda_array(self):
     """Round-trip a device array through ``ForeignArray`` and
     ``cuda.as_cuda_array`` and check contents and buffer identity."""
     host_data = np.arange(10)
     # A plain host array must not be reported as a CUDA array.
     self.assertFalse(cuda.is_cuda_array(host_data))

     device_data = cuda.to_device(host_data)
     self.assertTrue(cuda.is_cuda_array(device_data))

     foreign = ForeignArray(device_data)
     self.assertTrue(cuda.is_cuda_array(foreign))

     rewrapped = cuda.as_cuda_array(foreign)
     self.assertTrue(cuda.is_cuda_array(rewrapped))

     # Both device views must hold the same values as the host original.
     np.testing.assert_array_equal(rewrapped.copy_to_host(), host_data)
     np.testing.assert_array_equal(device_data.copy_to_host(), host_data)

     # The re-wrapped array must alias the original device buffer.
     self.assertPointersEqual(rewrapped, device_data)
 def test_as_cuda_array(self):
     """Round-trip a device array through ``MyArray`` and
     ``cuda.as_cuda_array`` and check contents and buffer identity."""
     host_data = np.arange(10)
     # A plain host array must not be reported as a CUDA array.
     self.assertFalse(cuda.is_cuda_array(host_data))

     device_data = cuda.to_device(host_data)
     self.assertTrue(cuda.is_cuda_array(device_data))

     foreign = MyArray(device_data)
     self.assertTrue(cuda.is_cuda_array(foreign))

     rewrapped = cuda.as_cuda_array(foreign)
     self.assertTrue(cuda.is_cuda_array(rewrapped))

     # Both device views must hold the same values as the host original.
     np.testing.assert_array_equal(rewrapped.copy_to_host(), host_data)
     np.testing.assert_array_equal(device_data.copy_to_host(), host_data)

     # The re-wrapped array must alias the original device buffer, so the
     # raw device pointers have to match.
     rewrapped_ptr = rewrapped.device_ctypes_pointer.value
     device_ptr = device_data.device_ctypes_pointer.value
     self.assertEqual(rewrapped_ptr, device_ptr)
Exemple #4
0
def test_output_type_context_mgr(global_output_type, context_type):
    """Check that ``cuml.using_output_type`` overrides the global output type
    inside the ``with`` block and that the global type applies again after."""
    dataset = get_small_dataset('numba')

    # Pick a global type; 'numpy' is used only when the parameter is 'cupy'.
    if global_output_type != 'cupy':
        test_type = 'cupy'
    else:
        test_type = 'numpy'
    cuml.set_global_output_type(test_type)

    # Inside the context manager the context type decides the output.
    with cuml.using_output_type(context_type):
        inner_model = cuml.DBSCAN(eps=1.0, min_samples=1)
        inner_model.fit(dataset)
        inner_labels = inner_model.labels_

        if context_type != 'numba':
            assert isinstance(inner_labels, test_output_types[context_type])
        else:
            assert is_cuda_array(inner_labels)

    # Outside the context manager the global type is in effect again.
    outer_model = cuml.DBSCAN(eps=1.0, min_samples=1)
    outer_model.fit(dataset)
    outer_labels = outer_model.labels_

    assert isinstance(outer_labels, test_output_types[test_type])
Exemple #5
0
def _typecast_will_lose_information(X, target_dtype):
    """
    Returns True if typecast will cause information loss, else False.
    Handles float/float, float/int, and int/int typecasts.

    The check is a range check only: a value counts as lost when it lies
    outside the representable ``[min, max]`` range of `target_dtype`.

    Parameters
    ----------
    X : np.ndarray, cp.ndarray, pd.Series, cudf.Series, pd.DataFrame,
        cudf.DataFrame or any object accepted by ``cuda.is_cuda_array``
        Data whose values are compared against the target range.
    target_dtype : dtype-like
        Anything accepted by ``np.dtype``. Integer targets (signed or
        unsigned) use ``np.iinfo``; every other target uses ``np.finfo``.

    Raises
    ------
    TypeError
        If `X` is none of the supported container types.
    """
    target_dtype = np.dtype(target_dtype).type

    # Use the integer limits for every integer kind, not just the four
    # signed widths, so unsigned targets do not fall through to np.finfo
    # (which rejects integer types).
    if np.issubdtype(target_dtype, np.integer):
        target_dtype_range = np.iinfo(target_dtype)
    else:
        target_dtype_range = np.finfo(target_dtype)

    if isinstance(X, (np.ndarray, cp.ndarray, pd.Series, cudf.Series)):
        # Casting to the dtype the data already has cannot lose anything.
        if X.dtype.type == target_dtype:
            return False

        return (
            (X < target_dtype_range.min) |
            (X > target_dtype_range.max)
        ).any()

    elif isinstance(X, (pd.DataFrame, cudf.DataFrame)):
        # Re-run the check on the frame's underlying array.
        X_m = X.values
        return _typecast_will_lose_information(X_m, target_dtype)

    elif cuda.is_cuda_array(X):
        # Wrap generic CUDA arrays as CuPy arrays so the comparisons work.
        X_m = cp.asarray(X)
        return _typecast_will_lose_information(X_m, target_dtype)

    else:
        raise TypeError("Received unsupported input type: %s" % type(X))
Exemple #6
0
def convert_dtype(X, to_dtype=np.float32):
    """
    Convert X to be of dtype `to_dtype`.

    Overflow checking is only performed for NumPy inputs.
    Todo: support other dtypes if needed.

    Parameters
    ----------
    X : np.ndarray, cudf.Series, cudf.DataFrame or CUDA array
        Data to convert.
    to_dtype : dtype-like (default: np.float32)
        Target dtype.

    Returns
    -------
    The converted data. A NumPy input that already has dtype `to_dtype` is
    returned unchanged (same object).

    Raises
    ------
    TypeError
        If a NumPy input contains +inf, if the cast turned a finite value
        into +/-inf, or if the type of `X` is not supported.
    """

    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            X_m = X.astype(to_dtype)
            # +inf in the input is rejected as before. In addition, inspect
            # the *converted* array so that values which only became
            # infinite because of the cast are detected as well.
            data_lost = (
                np.any(X == np.inf)
                or np.any(X_m == np.inf)
                or np.any((X_m == -np.inf) & (X != -np.inf))
            )
            if data_lost:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m

    elif isinstance(X, (cudf.Series, cudf.DataFrame)):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        # Convert through CuPy, then hand back a Numba device array.
        X_m = cp.asarray(X)
        X_m = X_m.astype(to_dtype)
        return cuda.as_cuda_array(X_m)

    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    # NumPy input that already had the requested dtype.
    return X
Exemple #7
0
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Cast X to dtype `to_dtype`.

    The cast is refused with a TypeError when
    `_typecast_will_lose_information` reports that it would lose
    information. For CUDA array inputs, `legacy` selects the return
    container: a Numba device array when True, a CumlArray otherwise.
    """
    if _typecast_will_lose_information(X, to_dtype):
        raise TypeError("Data type conversion would lose information.")

    if isinstance(X, np.ndarray):
        # Only cast when the dtype differs; otherwise return the input.
        if X.dtype != to_dtype:
            return X.astype(to_dtype)
        return X

    if isinstance(X, (cudf.Series, cudf.DataFrame, pd.Series, pd.DataFrame)):
        return X.astype(to_dtype, copy=False)

    if not cuda.is_cuda_array(X):
        raise TypeError("Received unsupported input type: %s" % type(X))

    # Generic CUDA array: convert through CuPy.
    converted = cp.asarray(X).astype(to_dtype, copy=False)
    if not legacy:
        return CumlArray(data=converted)
    return cuda.as_cuda_array(converted)
Exemple #8
0
def convert_dtype(X, to_dtype=np.float32):
    """
    Convert X to be of dtype `to_dtype`.

    Overflow checking is performed for NumPy arrays and cuDF DataFrames.
    Todo: support other dtypes if needed.

    Parameters
    ----------
    X : np.ndarray, cudf.Series, cudf.DataFrame or CUDA array
        Data to convert.
    to_dtype : dtype-like (default: np.float32)
        Target dtype.

    Returns
    -------
    The converted data. NumPy and cuDF DataFrame inputs that already have
    dtype `to_dtype` are returned unchanged.

    Raises
    ------
    TypeError
        If the conversion lost data (see the checks in the NumPy and
        DataFrame branches) or if the type of `X` is not supported.
    """

    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            X_m = X.astype(to_dtype)
            # +inf in the input is rejected as before. In addition, inspect
            # the *converted* array so that values which only became
            # infinite because of the cast are detected as well.
            data_lost = (
                np.any(X == np.inf)
                or np.any(X_m == np.inf)
                or np.any((X_m == -np.inf) & (X != -np.inf))
            )
            if data_lost:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m

    elif isinstance(X, cudf.Series):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        # Using cuDF for converting numba and device array interface inputs
        # if CuPy not installed, temporary while CuPy conda package
        # causes nccl conflicts
        if has_cupy():
            import cupy as cp
            X_m = cp.asarray(X)
            X_m = X_m.astype(to_dtype)
            return cuda.as_cuda_array(X_m)
        else:
            warnings.warn("Using cuDF for dtype conversion, install "
                          "CuPy for faster data conversion.")
            if len(X.shape) == 1:
                return cudf.Series(X).astype(to_dtype).to_gpu_array()
            else:
                # 2-D input: go through a DataFrame and recurse so the
                # DataFrame branch below performs the conversion.
                X_df = cudf.DataFrame()
                X = X_df.from_gpu_matrix(X)
                X = convert_dtype(X, to_dtype=to_dtype)
                return X.as_gpu_matrix()

    elif isinstance(X, cudf.DataFrame):
        # The dtype of the first column stands in for the whole frame.
        dtype = np.dtype(X[X.columns[0]]._column.dtype)
        if dtype != to_dtype:
            new_cols = [(col, X._cols[col].astype(to_dtype))
                        for col in X._cols]
            # Count converted values that are +inf across all columns.
            overflowed = sum(len(colval[colval >= np.inf])
                             for _, colval in new_cols)

            if overflowed > 0:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")

            return cudf.DataFrame(new_cols)

    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    # Input already had the requested dtype.
    return X
Exemple #9
0
def test_default_global_output_type(input_type):
    """Check that, with the default global output type, the fitted labels
    come back in the container that corresponds to `input_type`."""
    dataset = get_small_dataset(input_type)

    model = cuml.DBSCAN(eps=1.0, min_samples=1)
    model.fit(dataset)
    labels = model.labels_

    # 'numba' has no entry in test_output_types; it is checked through the
    # CUDA array predicate instead.
    if input_type != 'numba':
        assert isinstance(labels, test_output_types[input_type])
    else:
        assert is_cuda_array(labels)
Exemple #10
0
 def typeof_pyval(self, val):
     """Return the Numba type of `val` as a kernel argument.

     Based on _DispatcherBase.typeof_pyval, but differs from it to support
     the CUDA Array Interface: when regular typing fails with ValueError
     and `val` is a CUDA array, the value is typed through
     ``cuda.as_cuda_array`` instead.
     """
     try:
         return typeof(val, Purpose.argument)
     except ValueError:
         # Anything that is not a CUDA array keeps the original error.
         if not cuda.is_cuda_array(val):
             raise
         # When typing, we don't need to synchronize on the array's
         # stream - this is done when the kernel is launched.
         as_device_array = cuda.as_cuda_array(val, sync=False)
         return typeof(as_device_array, Purpose.argument)
Exemple #11
0
def get_dtype(X):
    """
    Return the dtype of X as a NumPy-style dtype (like np.float32).

    For a cuDF DataFrame the dtype of the first column is reported.
    Raises TypeError for inputs that are not recognised.
    """
    # cuDF containers: read the dtype from the underlying column.
    if isinstance(X, cudf.DataFrame):
        return np.dtype(X[X.columns[0]]._column.dtype)
    if isinstance(X, cudf.Series):
        return np.dtype(X._column.dtype)

    # NumPy arrays, CUDA-array-interface objects and Numba device arrays
    # all expose `.dtype` directly.
    if (isinstance(X, np.ndarray)
            or cuda.is_cuda_array(X)
            or cuda.devicearray.is_cuda_ndarray(X)):
        return X.dtype

    raise TypeError("Input object not understood for dtype detection.")
Exemple #12
0
def convert_dtype(X, to_dtype=np.float32, legacy=True):
    """
    Convert X to be of dtype `to_dtype`.

    Overflow checking is only performed for NumPy inputs.
    Todo: support other dtypes if needed.

    Parameters
    ----------
    X : np.ndarray, cudf.Series, cudf.DataFrame or CUDA array
        Data to convert.
    to_dtype : dtype-like (default: np.float32)
        Target dtype.
    legacy : bool (default: True)
        Only used for CUDA array inputs: when True a Numba device array is
        returned, otherwise a CumlArray.

    Returns
    -------
    The converted data. A NumPy input that already has dtype `to_dtype` is
    returned unchanged (same object).

    Raises
    ------
    TypeError
        If a NumPy input contains +inf, if the cast turned a finite value
        into +/-inf, or if the type of `X` is not supported.
    """

    if isinstance(X, np.ndarray):
        if X.dtype != to_dtype:
            X_m = X.astype(to_dtype)
            # +inf in the input is rejected as before. In addition, inspect
            # the *converted* array so that values which only became
            # infinite because of the cast are detected as well.
            data_lost = (
                np.any(X == np.inf)
                or np.any(X_m == np.inf)
                or np.any((X_m == -np.inf) & (X != -np.inf))
            )
            if data_lost:
                raise TypeError("Data type conversion resulted "
                                "in data loss.")
            return X_m

    elif isinstance(X, (cudf.Series, cudf.DataFrame)):
        return X.astype(to_dtype)

    elif cuda.is_cuda_array(X):
        X_m = rmm_cupy_ary(cp.asarray, X)
        X_m = X_m.astype(to_dtype)
        if legacy:
            return cuda.as_cuda_array(X_m)

        # temporarily importing here, until github issue #1681 reorganizing
        # utils is dealt with. Otherwise circular import causes issues.
        # The import lives in the only branch that needs it.
        from cuml.common import CumlArray
        return CumlArray(data=X_m)

    else:
        raise TypeError("Received unsupported input type: %s" % type(X))

    # NumPy input that already had the requested dtype.
    return X
Exemple #13
0
def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical

    The branches below additionally handle nvstrings objects, pyarrow
    chunked arrays, pandas Series/Timestamp, scalars, memoryviews and, as a
    last resort, anything ``memoryview``, ``pa.array`` or ``np.array``
    accepts.

    Parameters
    ----------
    arbitrary : object
        The value to convert. The conversion is chosen by its type.
    nan_as_null : bool (default: True)
        For float device arrays, whether a mask derived from the data via
        ``cudautils.mask_from_devary`` is attached. It is also forwarded to
        pyarrow as ``from_pandas`` in several branches.
    dtype : dtype-like, optional
        Target dtype hint. Only consulted for pyarrow null arrays, pyarrow
        chunked arrays, memoryviews and the generic fallback path.

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(arbitrary.data,
                            arbitrary.dtype,
                            mask=arbitrary.mask,
                            categories=categories)

    elif isinstance(arbitrary, Series):
        data = arbitrary._column

    elif isinstance(arbitrary, Index):
        data = arbitrary._values

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        # Only non-empty floating-point data can get a mask here.
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif cuda.is_cuda_array(arbitrary):
        # Use cuda array interface to do create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate new output array using rmm and copy the numba device array
        # to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype='int8')
            else:
                sbuf = np.empty(0, dtype='int8')
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype='int32')
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype='int8')

            data = as_column(
                nvstrings.from_offsets(sbuf,
                                       obuf,
                                       count,
                                       nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = dtype
            # No usable dtype hint: fall back to the arrow type's pandas
            # dtype.
            if dtype is None or (isinstance(dtype, str) and dtype == 'empty'):
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            # np.bool_ is used because the `np.bool` alias was deprecated
            # and later removed from NumPy.
            dtype = np.dtype(np.bool_)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, pa.ChunkedArray):
        # Convert every chunk on its own, then concatenate the columns.
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool_:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary),
                         dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        # Fallback chain: buffer protocol first, then pyarrow, then numpy.
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        # Jump to the handler below, which builds the
                        # categorical through pandas.
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = _gdf.np_to_pa_dtype(np.dtype(dtype).type)
                data = as_column(pa.array(arbitrary,
                                          type=pa_type,
                                          from_pandas=nan_as_null),
                                 nan_as_null=nan_as_null)
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                np_type = None
                if pd.api.types.is_categorical_dtype(dtype):
                    data = as_column(pd.Series(arbitrary, dtype='category'),
                                     nan_as_null=nan_as_null)
                else:
                    if dtype is None:
                        np_type = None
                    else:
                        np_type = np.dtype(dtype)
                    data = as_column(np.array(arbitrary, dtype=np_type),
                                     nan_as_null=nan_as_null)

    return data
Exemple #14
0
def input_to_dev_array(X,
                       order='F',
                       deepcopy=False,
                       check_dtype=False,
                       convert_to_dtype=False,
                       check_cols=False,
                       check_rows=False,
                       fail_on_order=False):
    """
    Convert input X to device array suitable for C++ methods.

    Acceptable input formats:

    * cuDF Dataframe - returns a deep copy always.
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`.
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless `deepcopy`=True.
    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: string (default: 'F')
        Whether to return a F-major or C-major array. Used to check the order
        of the input. If fail_on_order=True method will raise ValueError,
        otherwise it will convert X to be of order `order`.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of X.

    check_dtype: np.dtype or collection of dtypes (default: False)
        Set to a type / np.dtype, or to a collection such as
        [np.float32, np.float64], to throw an error if the dtype of X is
        not (one of) `check_dtype`. Ignored when `convert_to_dtype` is set.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.


    Returns
    -------
    `inp_array`: namedtuple('inp_array', 'array pointer n_rows n_cols dtype')

        A new device array if the input was not a numba device
        array. It is a reference to the input X if it was a numba device array
        or cuda array interface compliant (like cupy)

    Raises
    ------
    TypeError
        If the type of X is not supported or the dtype check fails.
    ValueError
        If a cuDF Series contains nulls, if `check_dtype` is not a type or
        collection of types, if the row/column/order checks fail, or if a
        cuDF DataFrame is requested in an order other than 'F' or 'C'.
    """

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        # X now has the requested dtype, so a dtype check is redundant.
        check_dtype = False

    if isinstance(X, cudf.DataFrame):
        # NOTE(review): this value is overwritten below from X_m.dtype; the
        # lookup is kept as-is in case callers rely on it failing for a
        # frame without columns - confirm before removing.
        dtype = np.dtype(X[X.columns[0]]._column.dtype)
        if order == 'F':
            X_m = X.as_gpu_matrix(order='F')
        elif order == 'C':
            X_m = cuml.utils.numba_utils.row_matrix(X)
        else:
            # Without this branch X_m would be left unbound.
            raise ValueError("Unsupported order " + repr(order) +
                             " for cuDF DataFrame input, expected 'F' or "
                             "'C'.")

    elif isinstance(X, cudf.Series):
        if deepcopy:
            X_m = X.to_gpu_array()
        else:
            if X.null_count == 0:
                # using __cuda_array_interface__ support of cudf.Series for
                # this temporarily while switching from rmm device_array to
                # rmm deviceBuffer https://github.com/rapidsai/cuml/issues/1379
                X_m = cuda.as_cuda_array(X._column)
            else:
                raise ValueError("Error: cuDF Series has missing/null values")

    elif isinstance(X, np.ndarray):
        # np.asarray copies only when the requested order requires it.
        # np.array(..., copy=False) raises under NumPy 2 whenever a copy
        # is unavoidable.
        X_m = rmm.to_device(np.asarray(X, order=order))

    elif cuda.is_cuda_array(X):
        # Use cuda array interface to create a device array by reference
        X_m = cuda.as_cuda_array(X)

        if deepcopy:
            out_dev_array = rmm.device_array_like(X_m)
            out_dev_array.copy_to_device(X_m)
            X_m = out_dev_array

    elif cuda.devicearray.is_cuda_ndarray(X):
        if deepcopy:
            out_dev_array = rmm.device_array_like(X)
            out_dev_array.copy_to_device(X)
            X_m = out_dev_array
        else:
            X_m = X

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    dtype = X_m.dtype

    if check_dtype:
        if isinstance(check_dtype, (type, np.dtype)):
            if dtype != check_dtype:
                del X_m
                raise TypeError("Expected " + str(check_dtype) +
                                " input but got " + str(dtype) + " instead.")
        elif isinstance(check_dtype, Collection) and \
                not isinstance(check_dtype, str):
            # The 'not isinstance(check_dtype, string)' condition is needed,
            # because the 'float32' string is a Collection, but in this
            # branch we only want to process collections like
            # [np.float32, np.float64].
            if dtype not in check_dtype:
                del X_m
                raise TypeError("Expected input to be of type in " +
                                str(check_dtype) + " but got " + str(dtype))
        else:
            raise ValueError("Expected a type as check_dtype arg, but got " +
                             str(check_dtype))

    # 1-D inputs are reported as having a single column.
    n_rows = X_m.shape[0]
    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) + " rows but got " +
                             str(n_rows) + " rows.")

    if not check_numba_order(X_m, order):
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            X_m = rmm_cupy_ary(cp.array, X_m, copy=False, order=order)
            X_m = cuda.as_cuda_array(X_m)

    X_ptr = get_dev_array_ptr(X_m)

    return inp_array(array=X_m,
                     pointer=X_ptr,
                     n_rows=n_rows,
                     n_cols=n_cols,
                     dtype=dtype)
Exemple #15
0
def input_to_host_array(X,
                        order='F',
                        deepcopy=False,
                        check_dtype=False,
                        convert_to_dtype=False,
                        check_cols=False,
                        check_rows=False,
                        fail_on_order=False):
    """
    Convert input X to host array (NumPy) suitable for C++ methods that accept
    host arrays.

    Acceptable input formats:

    * Numpy array - returned by reference when it already has order
        `order` and `deepcopy` is False, otherwise a copy

    * cuDF Dataframe - copied to the host

    * cuDF Series - converted with `to_array()`; must not contain nulls

    * cuda array interface compliant array (like Cupy) - copied to the host

    Parameters
    ----------

    X:
        cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: string (default: 'F')
        Whether to return a F-major or C-major array. NumPy and cuda array
        inputs are converted to order `order`.

    deepcopy: boolean (default: False)
        Set to True to always return a deep copy of a NumPy input. Not
        consulted for the other input types.

    check_dtype: np.dtype or collection of dtypes (default: False)
        Set to a type / np.dtype, or to a collection such as
        [np.float32, np.float64], to throw an error if the dtype of X is
        not (one of) `check_dtype`. Ignored when `convert_to_dtype` is set.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype if you want X to be converted to that dtype if it is
        not that dtype already.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Accepted for signature parity with `input_to_dev_array`; not used
        by this function.


    Returns
    -------
    `inp_array`: namedtuple('inp_array', 'array pointer n_rows n_cols dtype')

    `array` is a host array and `pointer` is its `ctypes.data` address.

    Raises
    ------
    TypeError
        If the type of X is not supported or the dtype check fails.
    ValueError
        If a cuDF Series contains nulls, if `check_dtype` is not a type or
        collection of types, or if the row/column checks fail.
    """

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        # X now has the requested dtype, so a dtype check is redundant.
        check_dtype = False

    if isinstance(X, cudf.DataFrame):
        # NOTE(review): this value is overwritten below from X_m.dtype; the
        # lookup is kept as-is in case callers rely on it failing for a
        # frame without columns - confirm before removing.
        dtype = np.dtype(X[X.columns[0]]._column.dtype)
        if order == 'F':
            X_m = X.as_gpu_matrix(order='F')
        elif order == 'C':
            X_m = cuml.utils.numba_utils.row_matrix(X)
        X_m = X_m.copy_to_host()

    elif isinstance(X, cudf.Series):
        if X.null_count == 0:
            X_m = X.to_array()
        else:
            raise ValueError('cuDF Series has missing (null) values.')

    elif isinstance(X, np.ndarray):
        if deepcopy:
            X_m = np.array(X, order=order, copy=True)
        else:
            # np.asarray copies only when the requested order requires it.
            # np.array(..., copy=False) raises under NumPy 2 whenever a
            # copy is unavoidable.
            X_m = np.asarray(X, order=order)

    elif cuda.is_cuda_array(X):
        # Use cuda array interface to create a device array by reference
        X_m = cuda.as_cuda_array(X)
        X_m = np.array(X_m.copy_to_host(), order=order)

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    dtype = X_m.dtype

    if check_dtype:
        # Same rules as input_to_dev_array: a single type / np.dtype, or a
        # non-string collection of them.
        if isinstance(check_dtype, (type, np.dtype)):
            if dtype != check_dtype:
                del X_m
                raise TypeError("Expected " + str(check_dtype) +
                                " input but got " + str(dtype) + " instead.")
        elif isinstance(check_dtype, Collection) and \
                not isinstance(check_dtype, str):
            if dtype not in check_dtype:
                del X_m
                raise TypeError("Expected input to be of type in " +
                                str(check_dtype) + " but got " + str(dtype))
        else:
            raise ValueError("Expected a type as check_dtype arg, but got " +
                             str(check_dtype))

    # 1-D inputs are reported as having a single column.
    n_rows = X_m.shape[0]
    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) + " rows but got " +
                             str(n_rows) + " rows.")

    X_ptr = X_m.ctypes.data

    return inp_array(array=X_m,
                     pointer=X_ptr,
                     n_rows=n_rows,
                     n_cols=n_cols,
                     dtype=dtype)
Exemple #16
0
def train_test_split(X,
                     y,
                     test_size: Union[float, int] = None,
                     train_size: Union[float, int] = None,
                     shuffle: bool = True,
                     random_state: Union[int, cp.random.RandomState,
                                         np.random.RandomState] = None,
                     seed: Union[int, cp.random.RandomState,
                                 np.random.RandomState] = None):
    """
    Partitions device data into four collated objects, mimicking
    Scikit-learn's `train_test_split`

    The training split is taken from the first `train_size` rows and the test
    split from the last `test_size` rows (after the optional shuffle).

    Parameters
    ----------
    X : cudf.DataFrame or cuda_array_interface compliant device array
        Data to split, has shape (n_samples, n_features)
    y : str, cudf.Series or cuda_array_interface compliant device array
        Set of labels for the data, either a series of shape (n_samples) or
        the string label of a column in X (if it is a cuDF DataFrame)
        containing the labels
    test_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the test set. If an int, represents the number
        of instances to be assigned to the test set. When omitted it is the
        complement of `train_size`.
    train_size : float or int, optional
        If float, represents the proportion [0, 1] of the data
        to be assigned to the training set. If an int, represents the number
        of instances to be assigned to the training set. When omitted it is
        the complement of `test_size`; when both are omitted it defaults
        to 0.75
    shuffle : bool, optional
        Whether or not to shuffle inputs before splitting (default: True)
    random_state : int, CuPy RandomState or NumPy RandomState optional
        If shuffle is true, seeds the generator. Unseeded by default
    seed: random_state : int, CuPy RandomState or NumPy RandomState optional
        Deprecated in favor of `random_state`.
        If shuffle is true, seeds the generator. Unseeded by default

    Examples
    --------
    .. code-block:: python

        import cudf
        from cuml.preprocessing.model_selection import train_test_split

        # Generate some sample data
        df = cudf.DataFrame({'x': range(10),
                             'y': [0, 1] * 5})
        print(f'Original data: {df.shape[0]} elements')

        # Suppose we want an 80/20 split
        X_train, X_test, y_train, y_test = train_test_split(df, 'y',
                                                            train_size=0.8)
        print(f'X_train: {X_train.shape[0]} elements')
        print(f'X_test: {X_test.shape[0]} elements')
        print(f'y_train: {y_train.shape[0]} elements')
        print(f'y_test: {y_test.shape[0]} elements')

        # Alternatively, if our labels are stored separately
        labels = df['y']
        df = df.drop(['y'])

        # we can also do
        X_train, X_test, y_train, y_test = train_test_split(df, labels,
                                                            train_size=0.8)

    Output:

    .. code-block:: python

        Original data: 10 elements
        X_train: 8 elements
        X_test: 2 elements
        y_train: 8 elements
        y_test: 2 elements

    Returns
    -------
    X_train, X_test, y_train, y_test : cudf.DataFrame
        Partitioned dataframes. If `y` was provided as a column name, the
        column was dropped from the `X`s

    Raises
    ------
    TypeError
        If `y` is a column name but `X` is not a cuDF DataFrame, if the
        input type validation fails, or if `random_state` is of an
        unsupported type.
    ValueError
        If `X` and `y` differ in their first dimension, or if `train_size` /
        `test_size` are out of range.
    """
    if isinstance(y, str):
        # Use the column with name `str` as y
        if isinstance(X, cudf.DataFrame):
            name = y
            y = X[name]
            X = X.drop(name)
        else:
            raise TypeError("X needs to be a cuDF Dataframe when y is a "
                            "string")

    # todo: this check will be replaced with upcoming improvements
    # to input_utils with PR #1379
    # NOTE(review): both conditions below end in `isinstance(y, cudf.Series)`,
    # which does not match what the error messages describe (the X check
    # depends on y, and the y check can only fire for a Series). The logic is
    # deliberately left as-is: tightening it would reject the
    # `cp.sparse.csr_matrix` inputs that the slicing code further down still
    # handles. Confirm the intended contract before changing it.
    if not cuda.is_cuda_array(X) and not isinstance(X, cudf.DataFrame) \
            and isinstance(y, cudf.Series):
        raise TypeError("X needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    if not cuda.is_cuda_array(y) and not isinstance(y, cudf.DataFrame) \
            and isinstance(y, cudf.Series):
        raise TypeError("y needs to be either a cuDF DataFrame, Series or "
                        "a cuda_array_interface compliant array.")

    n_samples = X.shape[0]

    if n_samples != y.shape[0]:
        raise ValueError("X and y must have the same first dimension "
                         "(found {} and {})".format(n_samples, y.shape[0]))

    # Range-check both sizes with the same code so that each message always
    # reports the parameter that actually failed.
    for label, size in (("train_size", train_size), ("test_size", test_size)):
        if isinstance(size, float) and not 0 <= size <= 1:
            raise ValueError("proportion {} should be between 0 and 1 "
                             "(found {})".format(label, size))

        if isinstance(size, int) and not 0 <= size <= n_samples:
            raise ValueError(
                "Number of instances {} should be between 0 and the "
                "first dimension of X (found {})".format(label, size))

    # Remember whether the inputs were numba device arrays so the outputs can
    # be handed back through `cuda.as_cuda_array` after CuPy indexing.
    x_numba = False
    y_numba = False

    if seed is not None:
        if random_state is None:
            warnings.warn("Parameter 'seed' is deprecated, please use "
                          "'random_state' instead.")
            random_state = seed
        else:
            warnings.warn("Both 'seed' and 'random_state' parameters were "
                          "set, using 'random_state' since 'seed' is "
                          "deprecated.")

    if shuffle:
        if random_state is None or isinstance(random_state, int):
            idxs = rmm_cupy_ary(cp.arange, n_samples)
            random_state = cp.random.RandomState(seed=random_state)

        elif isinstance(random_state, cp.random.RandomState):
            idxs = rmm_cupy_ary(cp.arange, n_samples)

        elif isinstance(random_state, np.random.RandomState):
            idxs = np.arange(n_samples)

        else:
            raise TypeError("`random_state` must be an int, NumPy RandomState "
                            "or CuPy RandomState.")

        random_state.shuffle(idxs)

        if isinstance(X, cudf.DataFrame) or isinstance(X, cudf.Series):
            X = X.iloc[idxs].reset_index(drop=True)

        elif cuda.is_cuda_array(X):
            # numba (and therefore rmm device_array) does not support
            # fancy indexing
            if cuda.devicearray.is_cuda_ndarray(X):
                x_numba = True
            X = cp.asarray(X)[idxs]

        if isinstance(y, cudf.DataFrame) or isinstance(y, cudf.Series):
            y = y.iloc[idxs]

        elif cuda.is_cuda_array(y):
            if cuda.devicearray.is_cuda_ndarray(y):
                y_numba = True
            y = cp.asarray(y)[idxs]

    # Determining sizes of splits
    if isinstance(train_size, float):
        train_size = int(n_samples * train_size)

    if test_size is None:
        if train_size is None:
            train_size = int(n_samples * 0.75)

        test_size = n_samples - train_size

    if isinstance(test_size, float):
        test_size = int(n_samples * test_size)

    if train_size is None:
        train_size = n_samples - test_size

    # Slice the test split from an explicit start offset. A negative-offset
    # slice (`[-test_size:]`) degenerates to `[0:]` when test_size == 0 and
    # would hand back the entire dataset as the test set.
    test_start = n_samples - test_size

    if cuda.is_cuda_array(X) or isinstance(X, cp.sparse.csr_matrix):
        X_train = X[0:train_size]
        y_train = y[0:train_size]
    elif isinstance(X, cudf.DataFrame):
        X_train = X.iloc[0:train_size]
        y_train = y.iloc[0:train_size]

    if cuda.is_cuda_array(y) or isinstance(X, cp.sparse.csr_matrix):
        X_test = X[test_start:]
        y_test = y[test_start:]
    elif isinstance(y, cudf.DataFrame):
        X_test = X.iloc[test_start:]
        y_test = y.iloc[test_start:]

    if x_numba:
        X_train = cuda.as_cuda_array(X_train)
        X_test = cuda.as_cuda_array(X_test)

    if y_numba:
        y_train = cuda.as_cuda_array(y_train)
        y_test = cuda.as_cuda_array(y_test)

    return X_train, X_test, y_train, y_test
Exemple #17
0
 def is_device_array(self, obj):
     """Return the result of ``cuda.is_cuda_array`` for ``obj``."""
     is_on_device = cuda.is_cuda_array(obj)
     return is_on_device
Exemple #18
0
def input_to_cuml_array(X,
                        order='F',
                        deepcopy=False,
                        check_dtype=False,
                        convert_to_dtype=False,
                        check_cols=False,
                        check_rows=False,
                        fail_on_order=False):
    """
    Wrap input X in a CumlArray and report its dimensions and dtype.

    Handling per input format:

    * cuDF DataFrame - converted with `as_gpu_matrix` (order 'F') or
        `row_matrix` (order 'C'); `deepcopy` is not consulted.
    * cuDF Series - rejected if it contains nulls, otherwise handled by the
        cuda array branch when `cuda.is_cuda_array` accepts it.
    * NumPy array or cuda array interface compliant array (like CuPy or a
        numba device array) - wrapped directly, then deep-copied when
        `deepcopy`=True.

    Parameters
    ----------

    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array like CuPy or pytorch.

    order: string (default: 'F')
        Requested layout, 'F' (column major) or 'C' (row major). When the
        wrapped array has a different order the function either raises
        (fail_on_order=True) or warns and converts.

    deepcopy: boolean (default: False)
        Set to True to deep-copy the wrapped array (non-DataFrame inputs).

    check_dtype: np.dtype or list of np.dtype (default: False)
        Raise a TypeError unless the dtype of X is (one of) the given
        dtype(s). Ignored when `convert_to_dtype` is set.

    convert_to_dtype: np.dtype (default: False)
        Set to a dtype to have X passed through `convert_dtype` first.

    check_cols: int (default: False)
        Set to an int `i` to check that input X has `i` columns. Set to False
        (default) to not check at all.

    check_rows: int (default: False)
        Set to an int `i` to check that input X has `i` rows. Set to False
        (default) to not check at all.

    fail_on_order: boolean (default: False)
        Set to True if you want the method to raise a ValueError if X is not
        of order `order`.

    Returns
    -------
    `cuml_array`: namedtuple('cuml_array', 'array n_rows n_cols dtype')

        The CumlArray and associated data.

    Raises
    ------
    ValueError
        For a cuDF Series with nulls, a row/column count mismatch, or an
        order mismatch with fail_on_order=True.
    TypeError
        For an unsupported input format or a dtype that fails `check_dtype`.
    """

    # Imported lazily: a module-level import is circular until the utils
    # reorganisation tracked in github issue #1681 is dealt with.
    from cuml.common import CumlArray

    # Once the input has been converted the dtype check is switched off.
    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        check_dtype = False

    if isinstance(X, cudf.Series) and X.null_count != 0:
        raise ValueError("Error: cuDF Series has missing/null values, " +
                         " which are not supported by cuML.")

    # Wrap the input according to its container type.
    if isinstance(X, cudf.DataFrame):
        if order == 'F':
            cu_arr = CumlArray(data=X.as_gpu_matrix(order='F'))
        elif order == 'C':
            cu_arr = CumlArray(data=cuml.utils.numba_utils.row_matrix(X))
    elif cuda.is_cuda_array(X) or isinstance(X, np.ndarray):
        cu_arr = CumlArray(data=X)
        if deepcopy:
            cu_arr = copy.deepcopy(cu_arr)
    else:
        raise TypeError("X matrix format " + str(X.__class__) +
                        " not supported")

    if check_dtype:
        # Normalise a single dtype to a one-element list of np.dtype objects.
        candidates = check_dtype if isinstance(check_dtype, list) \
            else [check_dtype]
        allowed = [np.dtype(candidate) for candidate in candidates]

        if cu_arr.dtype not in allowed:
            found = cu_arr.dtype
            del cu_arr
            raise TypeError("Expected input to be of type in " +
                            str(allowed) + " but got " + str(found))

    # One-dimensional inputs are reported as having a single column.
    dims = cu_arr.shape
    n_rows = dims[0]
    n_cols = dims[1] if len(dims) > 1 else 1

    if check_cols and n_cols != check_cols:
        raise ValueError("Expected " + str(check_cols) +
                         " columns but got " + str(n_cols) + " columns.")

    if check_rows and n_rows != check_rows:
        raise ValueError("Expected " + str(check_rows) + " rows but got " +
                         str(n_rows) + " rows.")

    if cu_arr.order != order:
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")

        warnings.warn("Expected " + order_to_str(order) + " major order, "
                      "but got the opposite. Converting data, this will "
                      "result in additional memory utilization.")
        reordered = rmm_cupy_ary(cp.array, cu_arr, copy=False, order=order)
        cu_arr = CumlArray(data=reordered)

    return cuml_array(array=cu_arr,
                      n_rows=n_rows,
                      n_cols=n_cols,
                      dtype=cu_arr.dtype)
Exemple #19
0
 def is_device_array(self, obj):
     """Return the result of ``cuda.is_cuda_array`` for ``obj``."""
     is_on_device = cuda.is_cuda_array(obj)
     return is_on_device
Exemple #20
0
def input_to_dev_array(X,
                       order='F',
                       deepcopy=False,
                       check_dtype=False,
                       convert_to_dtype=False,
                       check_cols=False,
                       check_rows=False,
                       fail_on_order=False):
    """
    Convert input X to device array suitable for C++ methods
    Acceptable input formats:
    * cuDF Dataframe - converted with `as_gpu_matrix` (order 'F') or
        `row_matrix` (order 'C')
    * cuDF Series - `to_gpu_array()` when `deepcopy` is set, otherwise the
        underlying column buffer (nulls are rejected on this path)
    * Numpy array - transferred to the device with `rmm.to_device`
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless deepcopy=True
    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------
    X : cuDF.DataFrame, cuDF.Series, numba array, NumPy array or any
        cuda_array_interface compliant array.
    order : string (default: 'F')
        Requested layout, 'F' or 'C'.
    deepcopy : boolean (default: False)
        Set to True to copy Series and device-array inputs.
    check_dtype : np.dtype (default: False)
        Raise a TypeError unless X has this dtype. Ignored when
        `convert_to_dtype` is set.
    convert_to_dtype : np.dtype (default: False)
        Set to a dtype to have X passed through `convert_dtype` first.
    check_cols : int (default: False)
        Expected number of columns; False disables the check.
    check_rows : int (default: False)
        Expected number of rows; False disables the check.
    fail_on_order : boolean (default: False)
        Raise a ValueError instead of converting when the array is not of
        order `order`.

    Returns: namedtuple('dev_array', 'array pointer n_rows n_cols dtype')

    `dev_array` is a new device array if the input was not a numba device
        array. It is a reference to the input X if it was a numba device array
        or cuda array interface compliant (like cupy)

    Raises
    ------
    ValueError
        For a cuDF Series with nulls (deepcopy=False), a row/column count
        mismatch, or an order mismatch with fail_on_order=True.
    TypeError
        For an unsupported input format or a dtype that fails `check_dtype`.
    """

    # Once the input has been converted the dtype check is switched off.
    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        check_dtype = False

    if isinstance(X, cudf.DataFrame):
        if order == 'F':
            X_m = X.as_gpu_matrix(order='F')
        elif order == 'C':
            X_m = cuml.utils.numba_utils.row_matrix(X)

    elif (isinstance(X, cudf.Series)):
        if deepcopy:
            X_m = X.to_gpu_array()
        else:
            if X.null_count == 0:
                X_m = X._column._data.mem
            else:
                raise ValueError("Error: cuDF Series has missing/null values")

    elif isinstance(X, np.ndarray):
        X_m = rmm.to_device(np.array(X, order=order, copy=False))

    elif cuda.is_cuda_array(X):
        # Use cuda array interface to create a device array by reference
        X_m = cuda.as_cuda_array(X)

        if deepcopy:
            out_dev_array = rmm.device_array_like(X_m)
            out_dev_array.copy_to_device(X_m)
            X_m = out_dev_array

    # NOTE(review): numba device arrays presumably already satisfy
    # `cuda.is_cuda_array` above, which would make this branch unreachable;
    # kept for safety - confirm before removing.
    elif cuda.devicearray.is_cuda_ndarray(X):
        if deepcopy:
            out_dev_array = rmm.device_array_like(X)
            out_dev_array.copy_to_device(X)
            X_m = out_dev_array
        else:
            X_m = X

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    # The dtype is always taken from the converted device array.
    dtype = X_m.dtype

    if check_dtype:
        if dtype != check_dtype:
            del X_m
            raise TypeError("Expected " + str(check_dtype) +
                            " input but got " + str(dtype) + " instead.")

    # One-dimensional inputs are reported as having a single column.
    n_rows = X_m.shape[0]
    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) + " rows but got " +
                             str(n_rows) + " rows.")

    if not check_numba_order(X_m, order):
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            # Keep the converter's result: without the assignment the
            # unconverted array (and its pointer) would be returned even
            # though the caller was just told the data had been converted.
            X_m = cuml.utils.numba_utils.gpu_major_converter(X_m,
                                                             n_rows,
                                                             n_cols,
                                                             dtype,
                                                             to_order=order)

    # Taken after any order conversion so it refers to the returned array.
    X_ptr = get_dev_array_ptr(X_m)

    result = namedtuple('dev_array', 'array pointer n_rows n_cols dtype')

    return result(array=X_m,
                  pointer=X_ptr,
                  n_rows=n_rows,
                  n_cols=n_cols,
                  dtype=dtype)