コード例 #1
0
def test_validity_add(dtype, nelem):
    """Check ``gdf_add_generic`` on two columns that carry validity masks.

    Random data and random validity bytes are generated on the host and
    copied to the device.  The result validity is produced with
    ``gdf_validity_and`` and must equal the bitwise AND of the two input
    masks; the summed data is then compared against ``np.add`` only at the
    positions selected by that combined mask.

    Parameters
    ----------
    dtype : numpy dtype
        Element type of both operands and of the result.
    nelem : int
        Number of elements in every column.
    """
    # One validity bit per element, rounded up to whole bytes.
    mask_nbytes = (nelem + 8 - 1) // 8

    # Operand data on host and device, plus an output buffer.
    lhs_host = gen_rand(dtype, nelem)
    rhs_host = gen_rand(dtype, nelem)
    lhs_dev = rmm.to_device(lhs_host)
    rhs_dev = rmm.to_device(rhs_host)
    out_dev = rmm.device_array_like(lhs_dev)

    # Validity bytes on host and device, plus an output mask buffer.
    lhs_mask_host = gen_rand(np.int8, mask_nbytes)
    rhs_mask_host = gen_rand(np.int8, mask_nbytes)
    lhs_mask_dev = rmm.to_device(lhs_mask_host)
    rhs_mask_dev = rmm.to_device(rhs_mask_host)
    out_mask_dev = rmm.device_array_like(lhs_mask_dev)

    # Wrap each (data, mask) pair in a gdf column.
    col_lhs, col_rhs, col_result = new_column(), new_column(), new_column()
    gdf_dtype = get_dtype(dtype)
    buffers = (
        (col_lhs, lhs_dev, lhs_mask_dev),
        (col_rhs, rhs_dev, rhs_mask_dev),
        (col_result, out_dev, out_mask_dev),
    )
    for column, data_dev, mask_dev in buffers:
        libgdf.gdf_column_view(column, unwrap_devary(data_dev),
                               unwrap_devary(mask_dev), nelem, gdf_dtype)

    # Combine the masks first, then run the addition under test.
    libgdf.gdf_validity_and(col_lhs, col_rhs, col_result)

    expect = np.add(lhs_host, rhs_host)
    libgdf.gdf_add_generic(col_lhs, col_rhs, col_result)
    got = out_dev.copy_to_host()

    # The device-side mask must match the host-side AND of the inputs.
    expect_valids = lhs_mask_host & rhs_mask_host
    np.testing.assert_array_equal(expect_valids,
                                  out_mask_dev.copy_to_host())

    # Compare data only where the combined mask selects an element.
    mask = buffer_as_bits(expect_valids.data)[:expect.size]
    expect_masked = expect[mask]
    got_masked = got[mask]

    print('expect')
    print(expect_masked)
    print('got')
    print(got_masked)

    np.testing.assert_array_equal(expect_masked, got_masked)
コード例 #2
0
ファイル: test_binaryops.py プロジェクト: Quansight/pygdf
def test_lhs_rhs_dtype_mismatch():
    """Binary ops on columns of different dtypes must raise ``GDFError``.

    An int32 column and a float32 column are passed to the add, eq and
    bitwise-and generic operations; each call is expected to fail with a
    message matching ``GDF_UNSUPPORTED_DTYPE``.
    """
    nelem = 5
    lhs_dtype, rhs_dtype = np.int32, np.float32

    lhs_dev = rmm.to_device(np.arange(nelem, dtype=lhs_dtype))
    rhs_dev = rmm.to_device(np.arange(nelem, dtype=rhs_dtype))
    out_dev = rmm.device_array_like(lhs_dev)

    col_lhs = new_column()
    col_rhs = new_column()
    col_result = new_column()

    # No validity masks: the mask pointer is NULL for every column.
    libgdf.gdf_column_view(col_lhs, unwrap_devary(lhs_dev), ffi.NULL, nelem,
                           get_dtype(lhs_dtype))
    libgdf.gdf_column_view(col_rhs, unwrap_devary(rhs_dev), ffi.NULL, nelem,
                           get_dtype(rhs_dtype))
    libgdf.gdf_column_view(col_result, unwrap_devary(out_dev), ffi.NULL,
                           nelem, get_dtype(lhs_dtype))

    mismatched_ops = (
        libgdf.gdf_add_generic,
        libgdf.gdf_eq_generic,
        libgdf.gdf_bitwise_and_generic,
    )
    for binary_op in mismatched_ops:
        with pytest.raises(GDFError) as raises:
            binary_op(col_lhs, col_rhs, col_result)
        raises.match("GDF_UNSUPPORTED_DTYPE")
コード例 #3
0
    def deserialize(cls, deserialize, header, frames):
        """Rebuild a Buffer from data serialized by dask.distributed.

        This hook is invoked by dask.distributed; do not call it directly.

        Parameters
        ----------
        deserialize : callable
            Used to deserialize data that needs further deserialization.
        header, frames : dict
            See custom serialization documentation in dask.distributed.

        Returns
        -------
        obj : Buffer
            Returns an instance of Buffer.
        """
        if header['kind'] != 'ipc':
            # Plain path: the payload deserializes straight to a numpy array.
            mem = deserialize(header['mem'], frames)
            mem.flags['WRITEABLE'] = True  # XXX: hack for numba to work
            return Buffer(mem)

        # IPC path: the payload is an IPC handle to memory owned elsewhere.
        ipc_handle = deserialize(header['mem'], frames)
        with ipc_handle as remote:
            # Copy into freshly allocated memory while the handle is open.
            mem = rmm.device_array_like(remote)
            mem.copy_to_device(remote)
        return Buffer(mem)
コード例 #4
0
ファイル: cudautils.py プロジェクト: zeichuan/cudf
def find_last(arr, val, compare="eq"):
    """
    Returns the index of the last occurrence of *val* in *arr*.
    Or the last occurrence of *arr* *compare* *val*, if *compare* is not eq
    Otherwise, returns -1.

    Parameters
    ----------
    arr : device array
    val : scalar
    compare: str ('gt', 'lt', or 'eq' (default))
        Any value other than 'gt' or 'lt' selects the equality kernels.
    """
    from cudf.dataframe.columnops import as_column

    found = rmm.device_array_like(arr)
    if found.size > 0:
        # Pick the marking kernel, then launch it once.
        if compare == "gt":
            kernel = gpu_mark_gt
        elif compare == "lt":
            kernel = gpu_mark_lt
        elif arr.dtype in ("float32", "float64"):
            kernel = gpu_mark_found_float
        else:
            kernel = gpu_mark_found_int
        kernel.forall(found.size)(arr, val, found, -1)

    # The answer is the maximum of the per-element marks.
    return as_column(found).max()
コード例 #5
0
ファイル: cudautils.py プロジェクト: xincui-math/cudf
def fillna(data, mask, value):
    """Return a copy of *data* processed by ``gpu_fill_masked``.

    A new device array is allocated and *data* is copied into it.  For
    non-empty input the ``gpu_fill_masked`` kernel is then run over the copy
    with *value* and *mask* (presumably writing *value* where *mask* marks a
    null -- confirm against the kernel).  *data* itself is not modified here.
    """
    filled = rmm.device_array_like(data)
    filled.copy_to_device(data)
    if data.size > 0:
        gpu_fill_masked.forall(data.size)(value, mask, filled)
    return filled
コード例 #6
0
ファイル: test_binaryops.py プロジェクト: Quansight/pygdf
def bitwise_op_test(dtype, expect_fn, test_fn, nelem=128):
    """Compare a libgdf bitwise binary op with its host reference.

    Parameters
    ----------
    dtype : numpy dtype
        Element type of both operands and of the result.
    expect_fn : callable
        Host reference, called as ``expect_fn(lhs, rhs)``.
    test_fn : callable
        Function under test, called as ``test_fn(col_lhs, col_rhs, col_out)``.
    nelem : int
        Number of elements per column.
    """
    lhs_host = gen_rand(dtype, nelem)
    rhs_host = gen_rand(dtype, nelem)

    lhs_dev = rmm.to_device(lhs_host)
    rhs_dev = rmm.to_device(rhs_host)
    out_dev = rmm.device_array_like(lhs_dev)

    col_lhs, col_rhs, col_result = new_column(), new_column(), new_column()
    gdf_dtype = get_dtype(dtype)

    # Mask-less column views over the three device buffers.
    for column, dev_buf in ((col_lhs, lhs_dev),
                            (col_rhs, rhs_dev),
                            (col_result, out_dev)):
        libgdf.gdf_column_view(column, unwrap_devary(dev_buf), ffi.NULL,
                               nelem, gdf_dtype)

    expect = expect_fn(lhs_host, rhs_host)
    test_fn(col_lhs, col_rhs, col_result)
    got = out_dev.copy_to_host()

    print('got')
    print(got)
    print('expect')
    print(expect)
    np.testing.assert_array_equal(expect, got)
コード例 #7
0
def math_op_test(dtype,
                 ulp,
                 expect_fn,
                 test_fn,
                 nelem=128,
                 scale=1,
                 positive_only=False):
    """Compare a libgdf unary math op with its host reference.

    Parameters
    ----------
    dtype : numpy dtype
        Element type of the input and result columns.
    ulp : int
        Maximum allowed difference, in units in the last place.
    expect_fn : callable
        Host reference, called as ``expect_fn(data)``.
    test_fn : callable
        Function under test, called as ``test_fn(col_data, col_result)``.
    nelem : int
        Number of elements.
    scale : scalar
        Factor applied to the random input before casting to *dtype*.
    positive_only : bool
        Forwarded to ``gen_rand``.
    """
    raw = gen_rand(dtype, nelem, positive_only=positive_only)
    data_host = (raw * scale).astype(dtype)
    data_dev = rmm.to_device(data_host)
    out_dev = rmm.device_array_like(data_dev)

    col_data, col_result = new_column(), new_column()
    gdf_dtype = get_dtype(dtype)

    # Input column followed by result column, both without a validity mask.
    for column, dev_buf in ((col_data, data_dev), (col_result, out_dev)):
        libgdf.gdf_column_view(column, unwrap_devary(dev_buf), ffi.NULL,
                               nelem, gdf_dtype)

    expect = expect_fn(data_host)
    test_fn(col_data, col_result)
    got = out_dev.copy_to_host()

    print('got')
    print(got)
    print('expect')
    print(expect)
    np.testing.assert_array_max_ulp(expect, got, maxulp=ulp)
コード例 #8
0
ファイル: test_binaryops.py プロジェクト: Quansight/pygdf
def arith_op_test(dtype,
                  ulp,
                  expect_fn,
                  test_fn,
                  nelem=128,
                  non_zero_rhs=False):
    """Compare a libgdf arithmetic binary op with its host reference.

    Parameters
    ----------
    dtype : numpy dtype
        Element type of both operands and of the result.
    ulp : int
        Maximum allowed difference, in units in the last place.
    expect_fn : callable
        Host reference, called as ``expect_fn(lhs, rhs)``.
    test_fn : callable
        Function under test, called as ``test_fn(col_lhs, col_rhs, col_out)``.
    nelem : int
        Number of elements per column.
    non_zero_rhs : bool
        When true, ``fix_zeros`` is applied to the right-hand operand before
        it is copied to the device.
    """
    lhs_host = gen_rand(dtype, nelem)
    rhs_host = gen_rand(dtype, nelem)
    if non_zero_rhs:
        fix_zeros(rhs_host)

    lhs_dev = rmm.to_device(lhs_host)
    rhs_dev = rmm.to_device(rhs_host)
    out_dev = rmm.device_array_like(lhs_dev)

    col_lhs, col_rhs, col_result = new_column(), new_column(), new_column()
    gdf_dtype = get_dtype(dtype)

    # Mask-less column views over the three device buffers.
    for column, dev_buf in ((col_lhs, lhs_dev),
                            (col_rhs, rhs_dev),
                            (col_result, out_dev)):
        libgdf.gdf_column_view(column, unwrap_devary(dev_buf), ffi.NULL,
                               nelem, gdf_dtype)

    expect = expect_fn(lhs_host, rhs_host)
    test_fn(col_lhs, col_rhs, col_result)
    got = out_dev.copy_to_host()

    print('got')
    print(got)
    print('expect')
    print(expect)
    np.testing.assert_array_max_ulp(expect, got, maxulp=ulp)
コード例 #9
0
def _request_transfer(key, remoteinfo):
    """Fetch the data stored under *key* from a remote peer.

    A REQ socket is connected to ``tcp://<remoteinfo[0]>:<remoteinfo[1]>``.
    If the peer address equals this process's own address
    (``_global_addr[0]``) an "IPC" request is sent and the reply is opened
    as an IPC handle whose contents are copied into a new device array.
    Otherwise a "NET" request is sent and the unpickled reply is uploaded
    with ``rmm.to_device``.  In both cases ``_request_drop`` is called
    afterwards so the peer can release its copy.

    Parameters
    ----------
    key : object
        Identifier of the data to fetch; it is pickled into the request.
    remoteinfo : sequence
        ``(address, port, ...)`` of the peer holding the data.

    Returns
    -------
    device array
        A locally owned device copy of the requested data.
    """
    logger.info("rebuild from: %s for %r", remoteinfo, key)

    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    try:
        socket.connect("tcp://{0}:{1}".format(*remoteinfo))

        myaddr = _global_addr[0]
        theiraddr = remoteinfo[0]
        # SECURITY NOTE: both branches unpickle bytes received over the
        # network.  pickle.loads executes arbitrary code from a malicious
        # peer, so this must only be used between trusted hosts.
        if myaddr == theiraddr:
            # Same machine go by IPC
            logger.info("request by IPC")
            socket.send(pickle.dumps(("IPC", key)))
            ipch = pickle.loads(socket.recv())
            # Open IPC and copy to local context
            with ipch as data:
                output = rmm.device_array_like(data)
                output.copy_to_device(data)
        else:
            # Different machine go by NET
            logger.info("request by NET: %s->%s", theiraddr, myaddr)
            socket.send(pickle.dumps(("NET", key)))
            output = rmm.to_device(pickle.loads(socket.recv()))

        # Release
        _request_drop(socket, key)
        return output
    finally:
        # Close the socket on every path, including errors, so it is not
        # leaked.  The context is deliberately not terminated here:
        # terminating can block while messages are still pending.
        socket.close()
コード例 #10
0
ファイル: cudautils.py プロジェクト: Quansight/pygdf
def copy_array(arr, out=None):
    """Copy device array *arr* into *out* and return *out*.

    Parameters
    ----------
    arr : device array
        Source array.
    out : device array, optional
        Destination; allocated with ``rmm.device_array_like(arr)`` when
        omitted.  Must have the same ``size`` as *arr*.

    Returns
    -------
    device array
        The destination array.
    """
    if out is None:
        out = rmm.device_array_like(arr)
    assert out.size == arr.size

    both_contiguous = arr.is_c_contiguous() and out.is_c_contiguous()
    if not both_contiguous:
        # At least one side is not C-contiguous: copy with the kernel.
        gpu_copy.forall(out.size)(arr, out)
        return out

    out.copy_to_device(arr)
    return out
コード例 #11
0
ファイル: cudautils.py プロジェクト: Quansight/pygdf
def recode(data, recode_table, na_value):
    """Recode *data* through *recode_table* into a new device array.

    The ``gpu_recode`` kernel receives *na_value* to use for out-of-range
    values.  The launch uses 128 threads per block and between 1 and 16
    blocks, depending on ``data.size``.
    """
    threads_per_block = 32 * 4
    n_blocks = min(16, max(1, data.size // threads_per_block))

    recoded = rmm.device_array_like(data)
    table_dev = to_device(recode_table)
    gpu_recode[n_blocks, threads_per_block](recoded, data, table_dev,
                                            na_value)
    return recoded
コード例 #12
0
ファイル: cudautils.py プロジェクト: Quansight/pygdf
 def run(self, arr, k):
     """Run the unique-values kernel on *arr* and return the values found.

     Raises ``NotImplementedError`` when *k* is at or above
     ``MAX_FAST_UNIQUE_K`` and ``ValueError`` when the kernel reports a
     negative count.  Returns a host array holding the first ``count``
     entries of the kernel output.
     """
     if k >= MAX_FAST_UNIQUE_K:
         raise NotImplementedError('k >= {}'.format(MAX_FAST_UNIQUE_K))

     # Device buffers: a single-element counter and the output values.
     count_dev = rmm.device_array(shape=1, dtype=np.intp)
     values_dev = rmm.device_array_like(arr)

     # Single block of 64 threads.
     self._kernel[1, 64](arr, k, values_dev, count_dev)

     n_unique = count_dev.copy_to_host()[0]
     if n_unique < 0:
         raise ValueError('too many unique value (hint: increase k)')
     return values_dev.copy_to_host()[:n_unique]
コード例 #13
0
ファイル: test_rmm.py プロジェクト: shwina/cudf
def test_rmm_alloc(dtype, nelem):
    """Round-trip random data through rmm-allocated device memory.

    Host data is uploaded, copied device-to-device into a buffer obtained
    from ``rmm.device_array_like``, downloaded again and compared with the
    original.
    """
    # data
    h_in = gen_rand(dtype, nelem)

    d_in = rmm.to_device(h_in)
    d_result = rmm.device_array_like(d_in)

    # The host result comes only from the device copy; the original's
    # pre-filled random host buffer was overwritten before use.
    d_result.copy_to_device(d_in)
    h_result = d_result.copy_to_host()

    print('expect')
    print(h_in)
    print('got')
    print(h_result)

    np.testing.assert_array_equal(h_result, h_in)
コード例 #14
0
ファイル: test_rmm.py プロジェクト: lucafuji/rmm
def array_tester(dtype, nelem):
    """Round-trip a constant array through rmm-allocated device memory.

    An array of *nelem* elements filled with 3.2 (cast to *dtype*) is
    uploaded, copied device-to-device into a buffer obtained from
    ``rmm.device_array_like``, downloaded again and compared with the
    original.
    """
    # data
    h_in = np.full(nelem, 3.2, dtype)

    d_in = rmm.to_device(h_in)
    d_result = rmm.device_array_like(d_in)

    # The host result comes only from the device copy; the original's
    # np.empty placeholder was overwritten before use.
    d_result.copy_to_device(d_in)
    h_result = d_result.copy_to_host()

    print('expect')
    print(h_in)
    print('got')
    print(h_result)

    np.testing.assert_array_equal(h_result, h_in)
コード例 #15
0
def array_tester(dtype, nelem):
    """Check that data survives a trip through rmm device memory.

    Builds a host array of *nelem* copies of 3.2 converted to *dtype*,
    sends it to the device, duplicates it into an rmm-allocated buffer of
    the same shape, and asserts the downloaded duplicate equals the input.
    """
    # data
    h_in = np.full(nelem, 3.2, dtype)

    d_in = rmm.to_device(h_in)
    d_result = rmm.device_array_like(d_in)

    # No host placeholder is needed: copy_to_host() returns a new array.
    d_result.copy_to_device(d_in)
    h_result = d_result.copy_to_host()

    print('expect')
    print(h_in)
    print('got')
    print(h_result)

    np.testing.assert_array_equal(h_result, h_in)
コード例 #16
0
ファイル: test_rmm.py プロジェクト: shwina/cudf
def test_rmm_csv_log():
    """Check that ``rmm.csv_log()`` contains the expected CSV header.

    Some device allocations and copies are performed first, then the log
    text is fetched, its first 1000 characters are printed, and the header
    line is searched for.
    """
    dtype = np.int32
    nelem = 1024

    # data
    h_in = gen_rand(dtype, nelem)

    d_in = rmm.to_device(h_in)
    d_result = rmm.device_array_like(d_in)

    d_result.copy_to_device(d_in)
    # Result unused.  NOTE(review): presumably kept to exercise the
    # device-to-host path before the log is read -- confirm.
    h_result = d_result.copy_to_host()  # noqa: F841

    csv = rmm.csv_log()

    print(csv[:1000])

    assert csv.find("Event Type,Device ID,Address,Stream,Size (bytes),"
                    "Free Memory,Total Memory,Current Allocs,Start,End,"
                    "Elapsed") >= 0
コード例 #17
0
def test_col_mismatch_error():
    """``gdf_sin_generic`` must reject columns of different lengths.

    The result column is declared 10 elements longer than the input column;
    the call is expected to raise ``GDFError`` whose text is exactly
    ``GDF_COLUMN_SIZE_MISMATCH``.
    """
    nelem = 128
    data_dev = rmm.to_device(np.random.random(nelem).astype(np.float32))
    out_dev = rmm.device_array_like(data_dev)

    col_data, col_result = new_column(), new_column()

    libgdf.gdf_column_view(col_data, unwrap_devary(data_dev), ffi.NULL,
                           nelem, libgdf.GDF_FLOAT32)
    # Deliberately wrong size on the result column.
    libgdf.gdf_column_view(col_result, unwrap_devary(out_dev), ffi.NULL,
                           nelem + 10, libgdf.GDF_FLOAT32)

    with pytest.raises(GDFError) as excinfo:
        libgdf.gdf_sin_generic(col_data, col_result)

    assert str(excinfo.value) == 'GDF_COLUMN_SIZE_MISMATCH'
コード例 #18
0
def test_rmm_csv_log():
    """Check that ``rmm.csv_log()`` contains the expected CSV header.

    A device upload, an allocation and a device-to-device copy are performed
    first; the log text is then fetched, its first 1000 characters printed,
    and the header line searched for.
    """
    nelem = 1024
    dtype = np.int32

    # Some rmm activity before the log is read.
    d_in = rmm.to_device(np.full(nelem, 3.2, dtype))
    d_result = rmm.device_array_like(d_in)
    d_result.copy_to_device(d_in)

    csv = rmm.csv_log()
    print(csv[:1000])

    expected_header = ("Event Type,Device ID,Address,Stream,Size (bytes),"
                       "Free Memory,Total Memory,Current Allocs,Start,End,"
                       "Elapsed,Location")
    assert csv.find(expected_header) >= 0
コード例 #19
0
def test_unsupported_dtype_error():
    """``gdf_sin_generic`` must reject an INT32 input column.

    The float32 device buffer is declared as ``GDF_INT32`` in the input
    column view; the call is expected to raise ``GDFError`` whose text is
    exactly ``GDF_UNSUPPORTED_DTYPE``.
    """
    nelem = 128
    data_dev = rmm.to_device(np.random.random(nelem).astype(np.float32))
    out_dev = rmm.device_array_like(data_dev)

    col_data, col_result = new_column(), new_column()

    # Input column is declared with an integer dtype on purpose.
    libgdf.gdf_column_view(col_data, unwrap_devary(data_dev), ffi.NULL,
                           nelem, libgdf.GDF_INT32)
    libgdf.gdf_column_view(col_result, unwrap_devary(out_dev), ffi.NULL,
                           nelem + 10, libgdf.GDF_FLOAT32)

    with pytest.raises(GDFError) as excinfo:
        libgdf.gdf_sin_generic(col_data, col_result)

    assert str(excinfo.value) == 'GDF_UNSUPPORTED_DTYPE'
コード例 #20
0
ファイル: test_rmm.py プロジェクト: lucafuji/rmm
def test_rmm_csv_log():
    """Verify the header line of the log returned by ``rmm.csv_log()``.

    Device memory is exercised (upload, allocate, device-to-device copy),
    then the log is fetched, the first 1000 characters are printed, and the
    expected column header must be found somewhere in the text.
    """
    dtype, nelem = np.int32, 1024

    # data
    host_values = np.full(nelem, 3.2, dtype)
    d_in = rmm.to_device(host_values)
    d_result = rmm.device_array_like(d_in)
    d_result.copy_to_device(d_in)

    csv = rmm.csv_log()
    print(csv[:1000])

    header_pos = csv.find("Event Type,Device ID,Address,Stream,Size (bytes),"
                          "Free Memory,Total Memory,Current Allocs,Start,End,"
                          "Elapsed,Location")
    assert header_pos >= 0
コード例 #21
0
ファイル: cudautils.py プロジェクト: yutiansut/cudf
def find_last(arr, val):
    """
    Returns the index of the last occurrence of *val* in *arr*.
    Otherwise, returns -1.

    Parameters
    ----------
    arr : device array
    val : scalar
    """
    from cudf.dataframe.columnops import as_column

    found = rmm.device_array_like(arr)
    if found.size > 0:
        # Floats and integers use different marking kernels.
        if arr.dtype in ('float32', 'float64'):
            kernel = gpu_mark_found_float
        else:
            kernel = gpu_mark_found_int
        kernel.forall(found.size)(arr, val, found, -1)

    # The answer is the maximum of the per-element marks.
    return as_column(found).max()
コード例 #22
0
ファイル: cudautils.py プロジェクト: xincui-math/cudf
def fill_mask(data, mask, value):
    """Fill a copy of a column with one value using a custom mask.

    Parameters
    ----------
    data : device array
        Column data; it is copied, not modified.
    mask : device array
        Validity mask passed to the ``gpu_fill_masked`` kernel.
    value : scalar
        Fill value.

    Returns
    -------
    device array
        New array: a copy of *data* after ``gpu_fill_masked`` has been
        applied (the kernel is skipped for empty input).
    """
    filled = rmm.device_array_like(data)
    filled.copy_to_device(data)
    if data.size > 0:
        gpu_fill_masked.forall(data.size)(value, mask, filled)
    return filled
コード例 #23
0
ファイル: columnops.py プロジェクト: xincui-math/cudf
def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * nvstrings
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array / chunked array
    * pandas.Series, pandas.Categorical, pandas.Timestamp
    * scalars, memoryviews, and anything accepted by ``memoryview``,
      ``pyarrow.array`` or ``numpy.array`` (fallback path)

    Parameters
    ----------
    arbitrary : object
        The object to convert.
    nan_as_null : bool, default True
        For floating-point device arrays, when true a mask is built with
        ``cudautils.mask_from_devary`` and attached to the column.  On the
        pandas / pyarrow paths it is forwarded as ``from_pandas``.
    dtype : dtype-like, optional
        Requested dtype.  It is only consulted on the pyarrow ``NullArray``
        and ``ChunkedArray`` paths, the ``memoryview`` path and the final
        fallback path; the string ``'empty'`` is treated like ``None``.

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(arbitrary.data,
                            arbitrary.dtype,
                            mask=arbitrary.mask,
                            categories=categories)

    elif isinstance(arbitrary, Series):
        data = arbitrary._column

    elif isinstance(arbitrary, Index):
        data = arbitrary._values

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif cuda.is_cuda_array(arbitrary):
        # Use cuda array interface to do create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate new output array using rmm and copy the numba device array
        # to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype='int8')
            else:
                sbuf = np.empty(0, dtype='int8')
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype='int32')
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype='int8')

            data = as_column(
                nvstrings.from_offsets(sbuf,
                                       obuf,
                                       count,
                                       nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = dtype
            if dtype is None or (isinstance(dtype, str)
                                 and dtype == 'empty'):
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            # np.bool_ rather than the np.bool alias, which was deprecated
            # in NumPy 1.20 and removed in 1.24.
            dtype = np.dtype(np.bool_)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool_:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary),
                         dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        # Fallback chain: buffer protocol first, then pyarrow, then numpy
        # (or pandas for categorical dtypes).
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        # Handled by the pandas path in the except below.
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = _gdf.np_to_pa_dtype(np_type)
                data = as_column(pa.array(arbitrary,
                                          type=pa_type,
                                          from_pandas=nan_as_null),
                                 nan_as_null=nan_as_null)
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if pd.api.types.is_categorical_dtype(dtype):
                    data = as_column(pd.Series(arbitrary, dtype='category'),
                                     nan_as_null=nan_as_null)
                else:
                    np_type = None if dtype is None else np.dtype(dtype)
                    data = as_column(np.array(arbitrary, dtype=np_type),
                                     nan_as_null=nan_as_null)

    return data
コード例 #24
0
ファイル: cudautils.py プロジェクト: yutiansut/cudf
def apply_round(data, decimal):
    """Run the ``gpu_round`` kernel over *data* into a new device array.

    A device array shaped like *data* is allocated; for non-empty input the
    kernel is launched with *data*, the new array and *decimal*.  The new
    array is returned (left as allocated when *data* is empty).
    """
    rounded = rmm.device_array_like(data)
    n_items = rounded.size
    if n_items > 0:
        gpu_round.forall(n_items)(data, rounded, decimal)
    return rounded
コード例 #25
0
ファイル: input_utils.py プロジェクト: snailTolight/cuml
def input_to_dev_array(X,
                       order='F',
                       deepcopy=False,
                       check_dtype=False,
                       convert_to_dtype=False,
                       check_cols=False,
                       check_rows=False,
                       fail_on_order=False):
    """
    Convert input X to device array suitable for C++ methods
    Acceptable input formats:
    * cuDF Dataframe - returns a deep copy always
    * cuDF Series - returns by reference or a deep copy depending on
        `deepcopy`
    * Numpy array - returns a copy in device always
    * cuda array interface compliant array (like Cupy) - returns a
        reference unless deepcopy=True
    * numba device array - returns a reference unless deepcopy=True

    Parameters
    ----------
    X : cudf.DataFrame, cudf.Series, numpy.ndarray or device array
        Input data.
    order : {'F', 'C'}
        Requested memory layout.
    deepcopy : bool
        Force a copy for inputs that would otherwise be used by reference.
    check_dtype : dtype or False
        When set, raise TypeError if the resulting dtype differs.  Ignored
        when `convert_to_dtype` is given.
    convert_to_dtype : dtype or False
        When set, X is first passed through ``convert_dtype``.
    check_cols, check_rows : int or False
        When set, raise ValueError if the column / row count differs.
    fail_on_order : bool
        Raise ValueError instead of warning when the layout does not match
        `order`.

    Returns: namedtuple('dev_array', 'array pointer n_rows n_cols dtype')

    `dev_array` is a new device array if the input was not a numba device
        array. It is a reference to the input X if it was a numba device array
        or cuda array interface compliant (like cupy)

    Raises
    ------
    TypeError
        Unsupported input type, or dtype mismatch with `check_dtype`.
    ValueError
        Series with nulls (when not deep-copying), unsupported `order` for a
        DataFrame, shape mismatch, or layout mismatch with `fail_on_order`.
    """

    if convert_to_dtype:
        X = convert_dtype(X, to_dtype=convert_to_dtype)
        check_dtype = False

    if isinstance(X, cudf.DataFrame):
        if order == 'F':
            X_m = X.as_gpu_matrix(order='F')
        elif order == 'C':
            X_m = cuml.utils.numba_utils.row_matrix(X)
        else:
            # Previously fell through and failed later with an
            # UnboundLocalError on X_m.
            raise ValueError("Unsupported order " + repr(order) +
                             "; expected 'F' or 'C'.")

    elif (isinstance(X, cudf.Series)):
        if deepcopy:
            X_m = X.to_gpu_array()
        else:
            if X.null_count == 0:
                X_m = X._column._data.mem
            else:
                raise ValueError("Error: cuDF Series has missing/null values")

    elif isinstance(X, np.ndarray):
        # np.asarray copies only when needed.  np.array(..., copy=False)
        # raises under NumPy 2 whenever a copy is required.
        X_m = rmm.to_device(np.asarray(X, order=order))

    elif cuda.is_cuda_array(X):
        # Use cuda array interface to create a device array by reference
        X_m = cuda.as_cuda_array(X)

        if deepcopy:
            out_dev_array = rmm.device_array_like(X_m)
            out_dev_array.copy_to_device(X_m)
            X_m = out_dev_array

    elif cuda.devicearray.is_cuda_ndarray(X):
        if deepcopy:
            out_dev_array = rmm.device_array_like(X)
            out_dev_array.copy_to_device(X)
            X_m = out_dev_array
        else:
            X_m = X

    else:
        msg = "X matrix format " + str(X.__class__) + " not supported"
        raise TypeError(msg)

    # The dtype always comes from the resulting device array.
    dtype = X_m.dtype

    if check_dtype:
        if dtype != check_dtype:
            del X_m
            raise TypeError("Expected " + str(check_dtype) +
                            " input but got " + str(dtype) + " instead.")

    n_rows = X_m.shape[0]
    if len(X_m.shape) > 1:
        n_cols = X_m.shape[1]
    else:
        n_cols = 1

    if check_cols:
        if n_cols != check_cols:
            raise ValueError("Expected " + str(check_cols) +
                             " columns but got " + str(n_cols) + " columns.")

    if check_rows:
        if n_rows != check_rows:
            raise ValueError("Expected " + str(check_rows) + " rows but got " +
                             str(n_rows) + " rows.")

    if not check_numba_order(X_m, order):
        if fail_on_order:
            raise ValueError("Expected " + order_to_str(order) +
                             " major order, but got the opposite.")
        else:
            warnings.warn("Expected " + order_to_str(order) + " major order, "
                          "but got the opposite. Converting data, this will "
                          "result in additional memory utilization.")
            # NOTE(review): the return value is discarded.  If
            # gpu_major_converter returns a new array and does not convert
            # in place, X_m keeps the wrong layout -- confirm against
            # numba_utils.
            cuml.utils.numba_utils.gpu_major_converter(X_m,
                                                       n_rows,
                                                       n_cols,
                                                       dtype,
                                                       to_order=order)

    X_ptr = get_dev_array_ptr(X_m)

    result = namedtuple('dev_array', 'array pointer n_rows n_cols dtype')

    return result(array=X_m,
                  pointer=X_ptr,
                  n_rows=n_rows,
                  n_cols=n_cols,
                  dtype=dtype)