def as_numerical_column(self, dtype, **kwargs):
    mem_dtype = np.dtype(dtype)
    str_dtype = mem_dtype
    out_dtype = mem_dtype
    if mem_dtype.type in (np.int8, np.int16):
        mem_dtype = np.dtype(np.int32)
        str_dtype = mem_dtype
    elif mem_dtype.type is np.datetime64:
        kwargs.update(units=np.datetime_data(mem_dtype)[0])
        mem_dtype = np.dtype(np.int64)
    out_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
    out_ptr = get_ctype_ptr(out_arr)
    kwargs.update({"devptr": out_ptr})
    _str_to_numeric_typecast_functions[str_dtype](self.str(), **kwargs)
    out_col = columnops.as_column(out_arr)
    if self.null_count > 0:
        mask_size = utils.calc_chunk_size(len(self.data), utils.mask_bitsize)
        out_mask_arr = rmm.device_array(mask_size, dtype="int8")
        out_mask_ptr = get_ctype_ptr(out_mask_arr)
        self.data.set_null_bitmask(out_mask_ptr, bdevmem=True)
        mask = Buffer(out_mask_arr)
        out_col = out_col.set_mask(mask)
    return out_col.astype(out_dtype)

def __init__(self, data, null_count=None, **kwargs):
    """
    Parameters
    ----------
    data : nvstrings.nvstrings
        The nvstrings object
    null_count : int, optional
        The number of null values in the mask.
    """
    from collections.abc import Sequence

    if isinstance(data, Sequence):
        data = nvstrings.to_device(data)
    assert isinstance(data, nvstrings.nvstrings)
    self._data = data
    self._dtype = np.dtype("object")
    if null_count is None:
        null_count = data.null_count()
    self._null_count = null_count
    self._mask = None
    if self._null_count > 0:
        mask_size = utils.calc_chunk_size(len(self.data), utils.mask_bitsize)
        out_mask_arr = rmm.device_array(mask_size, dtype='int8')
        out_mask_ptr = get_ctype_ptr(out_mask_arr)
        self.data.set_null_bitmask(out_mask_ptr, bdevmem=True)
        self._mask = Buffer(out_mask_arr)
    self._nvcategory = None
    self._indices = None

def _mask_from_cuda_array_interface_desc(obj):
    from cudf.utils.utils import calc_chunk_size, mask_dtype, mask_bitsize
    from cudf.utils.cudautils import compact_mask_bytes

    desc = obj.__cuda_array_interface__
    mask = desc.get("mask", None)

    if mask is not None:
        desc = mask.__cuda_array_interface__
        ptr = desc["data"][0]
        nelem = desc["shape"][0]
        typestr = desc["typestr"]
        typecode = typestr[1]
        if typecode == "t":
            nelem = calc_chunk_size(nelem, mask_bitsize)
            mask = Buffer(
                data=ptr, size=nelem * mask_dtype.itemsize, owner=obj
            )
        elif typecode == "b":
            dtype = np.dtype(typestr)
            mask = compact_mask_bytes(
                rmm.device_array_from_ptr(
                    ptr, nelem=nelem, dtype=dtype, finalizer=None
                )
            )
            mask = Buffer(mask)
        else:
            raise NotImplementedError(
                f"Cannot infer mask from typestr {typestr}"
            )
    return mask

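# Illustrative sketch (not part of cuDF): the "b" branch above receives one
# byte per element and compacts it into a bitmask on the GPU via
# compact_mask_bytes. Assuming Arrow-style LSB-first bit order, the same
# packing can be reproduced on the host with NumPy:
import numpy as np


def compact_mask_bytes_host(bools):
    """Pack a byte-per-element boolean mask into a bit-per-element bitmask."""
    bools = np.asarray(bools, dtype=np.bool_)
    # bitorder="little" puts element i into bit (i % 8) of byte (i // 8)
    return np.packbits(bools, bitorder="little")


# compact_mask_bytes_host([True, False, True]) -> array([5], dtype=uint8)
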
def serialize(self):
    header = {"null_count": self._null_count}
    header["type"] = pickle.dumps(type(self))
    frames = []
    sub_headers = []

    sbuf = rmm.device_array(self._data.byte_count(), dtype="int8")
    obuf = rmm.device_array(len(self._data) + 1, dtype="int32")
    mask_size = utils.calc_chunk_size(len(self._data), utils.mask_bitsize)
    nbuf = rmm.device_array(mask_size, dtype="int8")
    self.data.to_offsets(
        get_ctype_ptr(sbuf),
        get_ctype_ptr(obuf),
        nbuf=get_ctype_ptr(nbuf),
        bdevmem=True,
    )
    for item in [nbuf, sbuf, obuf]:
        sheader = item.__cuda_array_interface__.copy()
        sheader["dtype"] = item.dtype.str
        sub_headers.append(sheader)
        frames.append(item)

    header["nvstrings"] = len(self._data)
    header["subheaders"] = sub_headers
    return header, frames

def make_device_arrays(array):
    buffers = array.buffers()
    dtypes = [np.dtype(np.int8), None, None]

    if pa.types.is_list(array.type):
        dtypes[1] = np.dtype(np.int32)
    elif pa.types.is_string(array.type) or pa.types.is_binary(array.type):
        dtypes[2] = np.dtype(np.int8)
        dtypes[1] = np.dtype(np.int32)
    elif not pa.types.is_dictionary(array.type):
        dtypes[1] = arrow_to_pandas_dtype(array.type)
    else:
        dtypes[1] = arrow_to_pandas_dtype(array.type.index_type)

    if buffers[0] is not None:
        buf = CudaBuffer.from_buffer(buffers[0])
        nbytes = min(buf.size, calc_chunk_size(len(array), mask_bitsize))
        buffers[0] = gpu_view_as(nbytes, buf, dtypes[0])

    for i in range(1, len(buffers)):
        if buffers[i] is not None:
            buf = CudaBuffer.from_buffer(buffers[i])
            nbytes = min(buf.size, len(array) * dtypes[i].itemsize)
            buffers[i] = gpu_view_as(nbytes, buf, dtypes[i])

    return buffers

def to_arrow(self):
    sbuf = np.empty(self.nvstrings.byte_count(), dtype="int8")
    obuf = np.empty(len(self.nvstrings) + 1, dtype="int32")
    mask_size = utils.calc_chunk_size(
        len(self.nvstrings), utils.mask_bitsize
    )
    nbuf = np.empty(mask_size, dtype="int8")
    self.str().to_offsets(sbuf, obuf, nbuf=nbuf)
    sbuf = pa.py_buffer(sbuf)
    obuf = pa.py_buffer(obuf)
    nbuf = pa.py_buffer(nbuf)
    if self.null_count == len(self):
        return pa.NullArray.from_buffers(
            pa.null(), len(self), [pa.py_buffer((b""))], self.null_count
        )
    else:
        return pa.StringArray.from_buffers(
            len(self.nvstrings),
            obuf,
            sbuf,
            nbuf,
            self.nvstrings.null_count(),
        )

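# Illustrative sketch (not part of cuDF): to_offsets above fills the three
# Arrow string buffers (int32 offsets, character data, validity bitmask) on
# the device. The same layout can be built on the host with pyarrow; the
# values below are made up for the example ("foo", null, "hello"):
import numpy as np
import pyarrow as pa

obuf = pa.py_buffer(np.array([0, 3, 3, 8], dtype="int32"))  # offsets
sbuf = pa.py_buffer(b"foohello")                            # character data
nbuf = pa.py_buffer(bytes([0b00000101]))                    # rows 0 and 2 valid

arr = pa.StringArray.from_buffers(3, obuf, sbuf, nbuf, 1)
# arr.to_pylist() -> ['foo', None, 'hello']
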
def allocate_mask(self, all_valid=True):
    """Return a new Column with a newly allocated mask buffer.
    If ``all_valid`` is True, the new mask is set to all valid.
    If ``all_valid`` is False, the new mask is set to all null.
    """
    nelem = len(self)
    mask_sz = utils.calc_chunk_size(nelem, utils.mask_bitsize)
    mask = cuda.device_array(mask_sz, dtype=utils.mask_dtype)
    cudautils.fill_value(mask, 0xff if all_valid else 0)
    return self.set_mask(mask=mask, null_count=0 if all_valid else nelem)

def random_bitmask(size):
    """
    Parameters
    ----------
    size : int
        number of bits
    """
    sz = utils.calc_chunk_size(size, utils.mask_bitsize)
    data = np.random.randint(0, 255 + 1, size=sz)
    return data.astype(utils.mask_dtype)

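# Illustrative sketch (not part of cuDF): calc_chunk_size is assumed here to
# be ceiling division, so a mask covering `size` rows occupies
# ceil(size / 8) bytes when mask_bitsize is 8 bits per byte. The number of
# valid rows under a bitmask is the popcount of its first `size` bits:
import numpy as np


def calc_chunk_size_host(size, chunksize=8):
    # hypothetical host-side equivalent: chunks needed to hold `size` bits
    return (size + chunksize - 1) // chunksize


def count_valid(mask_bytes, size):
    # count set bits among the first `size` bits (LSB-first order assumed)
    bits = np.unpackbits(np.asarray(mask_bytes, dtype=np.uint8),
                         bitorder="little")
    return int(bits[:size].sum())


# calc_chunk_size_host(10) -> 2 bytes for a 10-row mask
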
def mask_array_view(self):
    """
    View the mask as a device array
    """
    result = rmm.device_array_from_ptr(
        ptr=self.mask.ptr,
        nelem=calc_chunk_size(len(self), mask_bitsize),
        dtype=np.int8,
    )
    result.gpu_data._obj = self
    return result

def cffi_view_to_column_mem(cffi_view):
    gdf_dtype = cffi_view.dtype
    if gdf_dtype == libgdf.GDF_STRING_CATEGORY:
        data_ptr = int(ffi.cast("uintptr_t", cffi_view.data))
        # We need to create this just to make sure the memory is properly freed
        data = rmm.device_array_from_ptr(
            data_ptr,
            nelem=cffi_view.size,
            dtype='int32',
            finalizer=rmm._make_finalizer(data_ptr, 0))
        nvcat_ptr = int(ffi.cast("uintptr_t", cffi_view.dtype_info.category))
        nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
        nvstr_obj = nvcat_obj.to_strings()
        mask = None
        if cffi_view.valid:
            mask_ptr = int(ffi.cast("uintptr_t", cffi_view.valid))
            mask = rmm.device_array_from_ptr(
                mask_ptr,
                nelem=calc_chunk_size(cffi_view.size, mask_bitsize),
                dtype=mask_dtype,
                finalizer=rmm._make_finalizer(mask_ptr, 0))
        return nvstr_obj, mask
    else:
        intaddr = int(ffi.cast("uintptr_t", cffi_view.data))
        data = rmm.device_array_from_ptr(
            intaddr,
            nelem=cffi_view.size,
            dtype=gdf_to_np_dtype(cffi_view.dtype),
            finalizer=rmm._make_finalizer(intaddr, 0))
        mask = None
        if cffi_view.valid:
            intaddr = int(ffi.cast("uintptr_t", cffi_view.valid))
            mask = rmm.device_array_from_ptr(
                intaddr,
                nelem=calc_chunk_size(cffi_view.size, mask_bitsize),
                dtype=mask_dtype,
                finalizer=rmm._make_finalizer(intaddr, 0))
        return data, mask

def to_arrow(self):
    sbuf = np.empty(self._data.byte_count(), dtype='int8')
    obuf = np.empty(len(self._data) + 1, dtype='int32')
    mask_size = utils.calc_chunk_size(len(self._data), utils.mask_bitsize)
    nbuf = np.empty(mask_size, dtype='int8')
    self.str().to_offsets(sbuf, obuf, nbuf=nbuf)
    sbuf = pa.py_buffer(sbuf)
    obuf = pa.py_buffer(obuf)
    nbuf = pa.py_buffer(nbuf)
    if self.null_count == len(self):
        return pa.NullArray.from_buffers(pa.null(), len(self), np.empty(0),
                                         self.null_count)
    else:
        return pa.StringArray.from_buffers(len(self._data), obuf, sbuf, nbuf,
                                           self._data.null_count())

def cffi_view_to_column_mem(cffi_view):
    intaddr = int(ffi.cast("uintptr_t", cffi_view.data))
    data = rmm.device_array_from_ptr(
        intaddr,
        nelem=cffi_view.size,
        dtype=gdf_to_np_dtype(cffi_view.dtype),
        finalizer=rmm._make_finalizer(intaddr, 0))
    if cffi_view.valid:
        intaddr = int(ffi.cast("uintptr_t", cffi_view.valid))
        mask = rmm.device_array_from_ptr(
            intaddr,
            nelem=calc_chunk_size(cffi_view.size, mask_bitsize),
            dtype=mask_dtype,
            finalizer=rmm._make_finalizer(intaddr, 0))
    else:
        mask = None
    return data, mask

def as_numerical_column(self, dtype, **kwargs):
    mem_dtype = np.dtype(dtype)
    str_dtype = mem_dtype
    out_dtype = mem_dtype
    if mem_dtype.type in (np.int8, np.int16):
        mem_dtype = np.dtype(np.int32)
        str_dtype = mem_dtype
    elif mem_dtype.type is np.datetime64:
        kwargs.update(units=np.datetime_data(mem_dtype)[0])
        mem_dtype = np.dtype(np.int64)
        if "format" not in kwargs:
            if len(self.nvstrings) > 0:
                # infer on host from the first not na element
                fmt = pd.core.tools.datetimes._guess_datetime_format(
                    self[self.notna()][0]
                )
                kwargs.update(format=fmt)
            else:
                fmt = None
    out_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
    out_ptr = libcudf.cudf.get_ctype_ptr(out_arr)
    kwargs.update({"devptr": out_ptr})
    _str_to_numeric_typecast_functions[str_dtype](self.nvstrings, **kwargs)
    out_col = column.as_column(out_arr)
    if self.has_nulls:
        mask_size = utils.calc_chunk_size(
            len(self.nvstrings), utils.mask_bitsize
        )
        out_mask = column.column_empty(
            mask_size, dtype="int8", masked=False
        ).data
        out_mask_ptr = out_mask.ptr
        self.nvstrings.set_null_bitmask(out_mask_ptr, bdevmem=True)
        out_col.mask = out_mask
    return out_col.astype(out_dtype)

def as_column(arbitrary, nan_as_null=True, dtype=None, length=None):
    """Create a Column from an arbitrary object

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from. See *Notes*.
    nan_as_null : bool, optional
        If True (default), treat NaN values in arbitrary as null.
    dtype : optional
        Optionally typecast the constructed Column to the given dtype.
    length : int, optional
        If `arbitrary` is a scalar, broadcast into a Column of the given
        length.

    Returns
    -------
    A Column of the appropriate type and size.

    Notes
    -----
    Currently supported inputs are:

    * ``Column``
    * ``Series``
    * ``Index``
    * Scalars (can be broadcasted to a specified `length`)
    * Objects exposing ``__cuda_array_interface__`` (e.g., numba device
      arrays)
    * Objects exposing ``__array_interface__`` (e.g., numpy arrays)
    * pyarrow array
    * pandas.Categorical objects
    """
    from cudf.core.column import numerical, categorical, datetime, string
    from cudf.core.series import Series
    from cudf.core.index import Index

    if isinstance(arbitrary, ColumnBase):
        if dtype is not None:
            return arbitrary.astype(dtype)
        else:
            return arbitrary

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, nvstrings.nvstrings):
        byte_count = arbitrary.byte_count()
        if byte_count > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Cannot construct string columns "
                "containing > {} bytes. "
                "Consider using dask_cudf to partition "
                "your data.".format(libcudfxx.MAX_STRING_COLUMN_BYTES_STR)
            )
        sbuf = Buffer.empty(arbitrary.byte_count())
        obuf = Buffer.empty(
            (arbitrary.size() + 1) * np.dtype("int32").itemsize
        )

        nbuf = None
        if arbitrary.null_count() > 0:
            mask_size = calc_chunk_size(arbitrary.size(), mask_bitsize)
            nbuf = Buffer.empty(mask_size)
            arbitrary.set_null_bitmask(nbuf.ptr, bdevmem=True)
        arbitrary.to_offsets(sbuf.ptr, obuf.ptr, None, bdevmem=True)
        children = (
            build_column(obuf, dtype="int32"),
            build_column(sbuf, dtype="int8"),
        )
        data = build_column(
            data=None, dtype="object", mask=nbuf, children=children
        )
        data._nvstrings = arbitrary

    elif isinstance(arbitrary, Buffer):
        if dtype is None:
            raise TypeError("dtype cannot be None if 'arbitrary' is a Buffer")
        data = build_column(arbitrary, dtype=dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary), dtype=arbitrary.dtype)
        if (
            data.dtype in [np.float16, np.float32, np.float64]
            and arbitrary.size > 0
        ):
            if nan_as_null:
                mask = libcudf.unaryops.nans_to_nulls(data)
                data = data.set_mask(mask)

        elif data.dtype.kind == "M":
            null = column_empty_like(data, masked=True, newsize=1)
            col = libcudf.replace.replace(
                as_column(Buffer(arbitrary), dtype=arbitrary.dtype),
                as_column(
                    Buffer(
                        np.array([np.datetime64("NaT")], dtype=data.dtype)
                    ),
                    dtype=arbitrary.dtype,
                ),
                null,
            )
            data = datetime.DatetimeColumn(
                data=Buffer(arbitrary), dtype=data.dtype, mask=col.mask
            )

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(arbitrary)
        mask = _mask_from_cuda_array_interface_desc(arbitrary)
        dtype = np.dtype(desc["typestr"])
        col = build_column(data, dtype=dtype, mask=mask)
        return col

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(
                arbitrary
            )
            children = (
                build_column(data=obuf, dtype="int32"),
                build_column(data=sbuf, dtype="int8"),
            )

            data = string.StringColumn(
                mask=nbuf, children=children, size=pa_size, offset=pa_offset
            )
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = pd.api.types.pandas_dtype(dtype)
            if (type(dtype) == str and dtype == "empty") or dtype is None:
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype()
                )

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype
                        )
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype
                        )
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            codes = as_column(arbitrary.indices)
            if isinstance(arbitrary.dictionary, pa.NullArray):
                categories = as_column([], dtype="object")
            else:
                categories = as_column(arbitrary.dictionary)
            dtype = CategoricalDtype(
                categories=categories, ordered=arbitrary.type.ordered
            )
            data = categorical.CategoricalColumn(
                dtype=dtype,
                mask=codes.base_mask,
                children=(codes,),
                size=codes.size,
                offset=codes.offset,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            raise NotImplementedError
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype="M8[ms]"
            )
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=np.dtype("M8[ms]"),
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        else:
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary
            )
            data = numerical.NumericalColumn(
                data=padata,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
                mask=pamask,
                size=pa_size,
                offset=pa_offset,
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = ColumnBase._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.asarray(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        length = length or 1
        data = as_column(
            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
        )
        if not nan_as_null:
            data = data.fillna(np.nan)

    elif isinstance(arbitrary, memoryview):
        data = as_column(
            np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
        )

    else:
        try:
            data = as_column(
                memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
            )
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    sr = pd.Series(arbitrary, dtype="category")
                    data = as_column(sr, nan_as_null=nan_as_null)
                elif np_type == np.str_:
                    sr = pd.Series(arbitrary, dtype="str")
                    data = as_column(sr, nan_as_null=nan_as_null)
                else:
                    data = as_column(
                        np.asarray(arbitrary, dtype=np.dtype(dtype)),
                        nan_as_null=nan_as_null,
                    )
    return data

def libgdf_join(col_lhs, col_rhs, on, how, method='sort'):
    joiner = _join_how_api[how]
    method_api = _join_method_api[method]
    gdf_context = ffi.new('gdf_context*')

    libgdf.gdf_context_view(gdf_context, 0, method_api, 0, 0, 0)

    if how not in ['left', 'inner', 'outer']:
        msg = "new join api only supports left, inner or outer joins"
        raise ValueError(msg)

    list_lhs = []
    list_rhs = []
    result_cols = []
    result_col_names = []

    left_idx = []
    right_idx = []
    # idx = 0
    for name, col in col_lhs.items():
        list_lhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))
            result_col_names.append(name)

    for name in on:
        result_cols.append(columnview(0, None,
                                      dtype=col_lhs[name]._column.dtype))
        result_col_names.append(name)
        left_idx.append(list(col_lhs.keys()).index(name))
        right_idx.append(list(col_rhs.keys()).index(name))

    for name, col in col_rhs.items():
        list_rhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))
            result_col_names.append(name)

    num_cols_to_join = len(on)
    result_num_cols = len(list_lhs) + len(list_rhs) - num_cols_to_join

    joiner(list_lhs,
           len(list_lhs),
           left_idx,
           list_rhs,
           len(list_rhs),
           right_idx,
           num_cols_to_join,
           result_num_cols,
           result_cols,
           ffi.NULL,
           ffi.NULL,
           gdf_context)

    res = []
    valids = []

    for col in result_cols:
        intaddr = int(ffi.cast("uintptr_t", col.data))
        res.append(rmm.device_array_from_ptr(
            ptr=intaddr,
            nelem=col.size,
            dtype=gdf_to_np_dtype(col.dtype),
            finalizer=rmm._make_finalizer(intaddr, 0)))
        intaddr = int(ffi.cast("uintptr_t", col.valid))
        valids.append(rmm.device_array_from_ptr(
            ptr=intaddr,
            nelem=calc_chunk_size(col.size, mask_bitsize),
            dtype=mask_dtype,
            finalizer=rmm._make_finalizer(intaddr, 0)))

    return res, valids