Example #1
0
    def _concat(cls, objs):
        """Concatenate a sequence of type-equivalent columns into one column.

        Parameters
        ----------
        objs : sequence of columns
            Columns to concatenate. Must contain at least one element: the
            first element is used as the type reference and as the template
            for the result.

        Returns
        -------
        column
            The result of ``objs[0].replace(...)`` with the concatenated
            data, the concatenated null mask (or ``None`` when no non-empty
            input has a null mask) and the summed null count.

        Raises
        ------
        ValueError
            If any element is not type-equivalent to the first one.
        """
        head = objs[0]
        for o in objs:
            if not o.is_type_equivalent(head):
                raise ValueError("All series must be of same type")
        # Zero-length inputs contribute nothing; drop them after the type
        # check so that they are still validated.
        objs = [o for o in objs if len(o) > 0]
        newsize = sum(len(o) for o in objs)

        # Concatenate data into one pre-sized device allocation
        mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
        data = Buffer.from_empty(mem)
        for o in objs:
            data.extend(o.data.to_gpu_array())

        # Concatenate mask if present
        if any(o.has_null_mask for o in objs):
            # FIXME: Inefficient
            # The masks are gathered as one boolean per row and packed into
            # a bitmask afterwards.
            # ``np.bool_`` is used instead of ``np.bool``: the latter alias
            # was deprecated in NumPy 1.20 and removed in NumPy 1.24.
            mem = rmm.device_array(shape=newsize, dtype=np.bool_)
            mask = Buffer.from_empty(mem)
            null_count = 0
            for o in objs:
                mask.extend(o._get_mask_as_column().to_gpu_array())
                null_count += o._null_count
            mask = Buffer(cudautils.compact_mask_bytes(mask.to_gpu_array()))
        else:
            mask = None
            null_count = 0

        col = head.replace(data=data, mask=mask, null_count=null_count)
        return col
Example #2
0
def _mask_from_cuda_array_interface_desc(obj):
    """Build a mask Buffer from ``obj.__cuda_array_interface__["mask"]``.

    Parameters
    ----------
    obj : object exposing ``__cuda_array_interface__``

    Returns
    -------
    Buffer or None
        ``None`` when the interface has no ``"mask"`` entry (or it is
        ``None``). Otherwise a ``Buffer`` built from the mask object's own
        ``__cuda_array_interface__``.

    Raises
    ------
    NotImplementedError
        If the type code of the mask's typestr is neither ``"t"`` nor
        ``"b"``.
    """
    from cudf.utils.utils import calc_chunk_size, mask_dtype, mask_bitsize
    from cudf.utils.cudautils import compact_mask_bytes

    mask = obj.__cuda_array_interface__.get("mask", None)
    if mask is None:
        return mask

    mask_desc = mask.__cuda_array_interface__
    ptr = mask_desc["data"][0]
    nelem = mask_desc["shape"][0]
    typestr = mask_desc["typestr"]
    typecode = typestr[1]

    if typecode == "t":
        # The pointer is wrapped directly (no copy here); ``obj`` is passed
        # as the Buffer owner.
        nchunks = calc_chunk_size(nelem, mask_bitsize)
        return Buffer(data=ptr, size=nchunks * mask_dtype.itemsize, owner=obj)

    if typecode == "b":
        # View the mask through its pointer, then pack it with
        # ``compact_mask_bytes`` before wrapping it in a Buffer.
        bool_dtype = np.dtype(typestr)
        packed = compact_mask_bytes(
            rmm.device_array_from_ptr(
                ptr, nelem=nelem, dtype=bool_dtype, finalizer=None
            )
        )
        return Buffer(packed)

    raise NotImplementedError(f"Cannot infer mask from typestr {typestr}")
Example #3
0
    def __getitem__(self, arg):
        """Index the column with a number or a slice.

        Parameters
        ----------
        arg : Number or slice
            A number is converted with ``int`` and passed to
            ``element_indexing``. A slice produces a new column through
            ``self.replace``.

        Raises
        ------
        NotImplementedError
            For any other argument type, or for a slice with a step other
            than ``None``/1 when the column has nulls.
        """
        if isinstance(arg, Number):
            return self.element_indexing(int(arg))

        if not isinstance(arg, slice):
            raise NotImplementedError(type(arg))

        # NOTE(review): the normalized bounds are not used below; the call
        # is retained in case it validates the slice -- confirm.
        start, stop = utils.normalize_slice(arg, len(self))

        has_nulls = self.null_count > 0
        if not has_nulls:
            return self.replace(data=self.data[arg])

        step = arg.step
        if step is not None and step != 1:
            raise NotImplementedError(arg)

        # Slice the data first, then the mask: the bitmask is expanded to
        # one entry per row, sliced, and packed back into bits.
        sliced_data = self.data[arg]
        row_mask = cudautils.expand_mask_bits(
            self.data.size,
            self.mask.to_gpu_array(),
        )
        sliced_mask = Buffer(cudautils.compact_mask_bytes(row_mask[arg]))
        return self.replace(data=sliced_data, mask=sliced_mask)
Example #4
0
    def as_mask(self):
        """Pack this column's booleans into a bitmask.

        Returns
        -------
        device array
            Result of ``cudautils.compact_mask_bytes`` applied to
            ``self.to_gpu_array()``.
        """
        bool_values = self.to_gpu_array()
        return cudautils.compact_mask_bytes(bool_values)
Example #5
0
    def as_mask(self):
        """Pack this column's booleans into a bitmask.

        Returns
        -------
        device array
            Result of ``cudautils.compact_mask_bytes`` applied to
            ``self.data_array_view``.

        Raises
        ------
        ValueError
            If the column has nulls.
        """
        if not self.has_nulls:
            return cudautils.compact_mask_bytes(self.data_array_view)

        raise ValueError("Column must have no nulls.")
Example #6
0
def column_select_by_boolmask(column, boolmask):
    """Select the entries of ``column`` picked out by ``boolmask``.

    Parameters
    ----------
    column : column
        Source column; must have ``null_count == 0`` (asserted).
    boolmask : column
        Boolean selector; its length determines the candidate positions.

    Returns
    -------
    (selected_column, selected_positions)
        ``selected_column`` is ``column.replace`` with the selected values;
        ``selected_positions`` is a ``NumericalColumn`` holding the selected
        positions.
    """
    from cudf.dataframe.numerical import NumericalColumn

    # We don't properly handle the boolmask yet
    assert column.null_count == 0

    # Pack the boolean selector into bits; it is used as the mask for both
    # copy_to_dense calls below.
    bitmask = cudautils.compact_mask_bytes(boolmask.to_gpu_array())
    positions = cudautils.arange(len(boolmask))

    _, kept_positions = cudautils.copy_to_dense(positions, mask=bitmask)
    _, kept_values = cudautils.copy_to_dense(
        column.data.to_gpu_array(), mask=bitmask
    )

    selected_values = column.replace(data=Buffer(kept_values))
    index_buffer = Buffer(kept_positions)
    selected_positions = NumericalColumn(
        data=index_buffer, dtype=index_buffer.dtype
    )
    return selected_values, selected_positions
Example #7
0
def pandas_categorical_as_column(categorical, codes=None):
    """Create a categorical column from a ``pandas.Categorical``.

    Parameters
    ----------
    categorical : pandas.Categorical
        Supplies ``categories``, ``ordered`` and the default codes.
    codes : optional
        When given, used instead of ``categorical.codes``.

    Returns
    -------
    The result of ``column.build_categorical_column``. Its mask is ``None``
    when no code equals -1; otherwise it is a ``Buffer`` holding the packed
    ``codes != -1`` flags.
    """
    if codes is None:
        codes = categorical.codes
    codes = column.as_column(codes)

    # Entries with code -1 are flagged as not valid.
    is_valid = codes != -1

    if np.all(is_valid):
        mask = None
    else:
        mask = Buffer(cudautils.compact_mask_bytes(is_valid))

    return column.build_categorical_column(
        categories=categorical.categories,
        codes=codes,
        mask=mask,
        ordered=categorical.ordered,
    )
Example #8
0
def pandas_categorical_as_column(categorical, codes=None):
    """Create a ``CategoricalColumn`` from a ``pandas.Categorical``.

    Parameters
    ----------
    categorical : pandas.Categorical
        Supplies ``categories``, ``ordered`` and the default codes.
    codes : optional
        When given, used (without copying) instead of a copy of
        ``categorical.codes``.

    Returns
    -------
    CategoricalColumn
        Built with ``mask`` and ``null_count`` only when at least one code
        equals -1.
    """
    # TODO fix mutability issue in numba to avoid the .copy()
    if codes is None:
        codes = categorical.codes.copy()

    # TODO pending pandas to be improved
    #       https://github.com/pandas-dev/pandas/issues/14711
    #       https://github.com/pandas-dev/pandas/pull/16015
    is_valid = codes != -1

    params = {
        "data": Buffer(codes),
        "categories": categorical.categories,
        "ordered": categorical.ordered,
    }

    if not np.all(is_valid):
        # Entries with code -1 become nulls: pack the validity flags into a
        # mask and count how many entries are not valid.
        packed_mask = cudautils.compact_mask_bytes(is_valid)
        valid_count = np.count_nonzero(is_valid)
        null_count = codes.size - valid_count
        params["mask"] = Buffer(packed_mask)
        params["null_count"] = null_count

    return CategoricalColumn(**params)
Example #9
0
    def __getitem__(self, arg):
        """Index the column with a number, a slice, or an array-like.

        Parameters
        ----------
        arg : Number, slice, or anything ``columnops.as_column`` accepts
            * Number: converted with ``int`` and passed to
              ``element_indexing``.
            * slice: produces a new column through ``self.replace``.
            * otherwise: converted to a column; integer dtype goes to
              ``take``, boolean dtype goes to ``apply_boolean_mask``.

        Raises
        ------
        NotImplementedError
            For a slice with a step other than ``None``/1 when the column
            has nulls, or for a converted argument whose dtype is neither
            integer nor boolean.
        """
        from cudf.dataframe import columnops

        if isinstance(arg, Number):
            return self.element_indexing(int(arg))

        if isinstance(arg, slice):
            has_nulls = self.null_count > 0
            if not has_nulls:
                return self.replace(data=self.data[arg])

            step = arg.step
            if step is not None and step != 1:
                raise NotImplementedError(arg)

            sliced_data = self.data[arg]
            # For "object" dtype, ``size`` on the data is called as a
            # method; otherwise it is read as an attribute.
            if self.dtype == "object":
                nrows = self.data.size()
            else:
                nrows = self.data.size
            # Expand the bitmask to one entry per row, slice it, and pack
            # it back into bits.
            row_mask = cudautils.expand_mask_bits(
                nrows, self.mask.to_gpu_array()
            )
            sliced_mask = Buffer(cudautils.compact_mask_bytes(row_mask[arg]))
            return self.replace(data=sliced_data, mask=sliced_mask)

        indexer = columnops.as_column(arg)
        if len(indexer) == 0:
            # An empty indexer is rebuilt as an empty int32 column, so it
            # is dispatched on that dtype below.
            indexer = columnops.as_column([], dtype="int32")
        if pd.api.types.is_integer_dtype(indexer.dtype):
            return self.take(indexer.data.mem)
        if pd.api.types.is_bool_dtype(indexer.dtype):
            return self.apply_boolean_mask(indexer)
        raise NotImplementedError(type(indexer))
Example #10
0
    def __getitem__(self, arg):
        """Index the column with a number, a slice, or an array-like.

        Parameters
        ----------
        arg : Number, slice, or anything ``column.as_column`` accepts
            * Number: converted with ``int`` and passed to
              ``element_indexing``.
            * slice: returns a new column (see notes below).
            * otherwise: converted to a column; integer dtype goes to
              ``take``, boolean dtype goes to ``apply_boolean_mask``.

        Raises
        ------
        NotImplementedError
            For a slice with a step other than ``None``/1 when the column
            has nulls, or for a converted argument whose dtype is neither
            integer nor boolean.

        Notes
        -----
        For a unit-step slice the returned data ``Buffer`` points into this
        column's data and names ``self`` as its owner. For other steps the
        data is first passed through ``cudautils.as_contiguous``.
        """
        from cudf.core.column import column

        if isinstance(arg, Number):
            return self.element_indexing(int(arg))

        if not isinstance(arg, slice):
            indexer = column.as_column(arg)
            if len(indexer) == 0:
                # An empty indexer is rebuilt as an empty int32 column, so
                # it is dispatched on that dtype below.
                indexer = column.as_column([], dtype="int32")
            if pd.api.types.is_integer_dtype(indexer.dtype):
                return self.take(indexer)
            if pd.api.types.is_bool_dtype(indexer.dtype):
                return self.apply_boolean_mask(indexer)
            raise NotImplementedError(type(indexer))

        if is_categorical_dtype(self):
            # Slice the codes child and rebuild the column around it.
            sliced_codes = self.codes[arg]
            return build_column(
                data=None,
                dtype=self.dtype,
                mask=sliced_codes.mask,
                children=(sliced_codes,),
            )

        start, stop, _ = arg.indices(len(self))
        if start == stop:
            return column_empty(0, self.dtype, masked=True)

        step = arg.step
        if self.has_nulls:
            if step is not None and step != 1:
                raise NotImplementedError(arg)

            data_view = self.data_array_view[arg]
            # Expand the bitmask to one entry per row, slice it, and pack
            # it back into bits.
            nrows = self.size
            row_mask = cudautils.expand_mask_bits(nrows, self.mask_array_view)
            packed_mask = cudautils.compact_mask_bytes(row_mask[arg])
        else:
            data_view = self.data_array_view[arg]
            packed_mask = None

        if self.dtype == "object":
            return as_column(data_view)

        if step is not None and step != 1:
            # Strided view: make it contiguous before wrapping it.
            data_buffer = Buffer(cudautils.as_contiguous(data_view))
        else:
            # data Buffer lifetime is tied to self:
            data_buffer = Buffer(
                data=data_view.device_ctypes_pointer.value,
                size=data_view.nbytes,
                owner=self,
            )

        # mask Buffer lifetime is not:
        mask_buffer = None if packed_mask is None else Buffer(packed_mask)

        return build_column(data_buffer, self.dtype, mask=mask_buffer)