Beispiel #1
0
    def unordered_compare(self, cmpop, rhs):
        """Compare this column with ``rhs`` using an unordered comparison.

        Parameters
        ----------
        cmpop : hashable
            Key used to look up the low-level operator in
            ``_unordered_impl``.
        rhs : column
            Right-hand operand passed through to ``binop``.

        Returns
        -------
        The column produced by ``binop`` with a boolean output dtype.

        Raises
        ------
        KeyError
            If ``cmpop`` is not a key of ``_unordered_impl``.
        """
        # ``np.bool`` was only a deprecated alias of the builtin ``bool`` and
        # is missing from NumPy 1.24-1.26; ``np.bool_`` names the same dtype
        # on every NumPy version.
        return binop(self, rhs, op=_unordered_impl[cmpop], out_dtype=np.bool_)

    def to_pandas(self, index):
        """Build a ``pandas.Series`` from this column.

        The values returned by ``self.to_array()`` are cast to ``self.dtype``
        and labelled with ``index``.
        """
        raw_values = self.to_array()
        typed_values = raw_values.astype(self.dtype)
        return pd.Series(typed_values, index=index)

    def to_arrow(self):
        """Export this column as a ``pyarrow`` array.

        The data memory is copied to the host and viewed as ``int64`` before
        being wrapped in an Arrow buffer. The null mask is copied the same
        way only when ``self.has_null_mask`` is true; otherwise the validity
        buffer is ``None``.
        """
        if self.has_null_mask:
            host_mask = self.nullmask.mem.copy_to_host()
            validity = pa.py_buffer(host_mask)
        else:
            validity = None

        host_values = self.data.mem.copy_to_host().view('int64')
        values = pa.py_buffer(host_values)
        arrow_type = _gdf.np_to_pa_dtype(self.dtype)

        return pa.Array.from_buffers(
            type=arrow_type,
            length=len(self),
            buffers=[validity, values],
            null_count=self.null_count
        )


def binop(lhs, rhs, op, out_dtype):
    """Apply the binary operator ``op`` to two columns.

    Parameters
    ----------
    lhs, rhs : column
        Operands; each must expose ``has_null_mask``.
    op
        Operator handle forwarded to ``_gdf.apply_binaryop``.
    out_dtype
        Dtype of the freshly allocated output column.

    Returns
    -------
    The output column, with its ``null_count`` set to the value returned by
    ``_gdf.apply_binaryop``.
    """
    nvtx_range_push("PYGDF_BINARY_OP", "orange")
    try:
        # The output carries a mask as soon as either operand has one.
        masked = lhs.has_null_mask or rhs.has_null_mask
        out = columnops.column_empty_like(lhs, dtype=out_dtype, masked=masked)
        null_count = _gdf.apply_binaryop(op, lhs, rhs, out)
        return out.replace(null_count=null_count)
    finally:
        # Always close the profiling range, even if allocation or the
        # operator call raises, so push/pop calls stay balanced.
        nvtx_range_pop()


# Pass DatetimeColumn to register_distributed_serializer when this module is
# executed.
# NOTE(review): presumably this enables (de)serialization of the class for
# distributed execution -- confirm against the helper's definition.
register_distributed_serializer(DatetimeColumn)
Beispiel #2
0
    Returns
    -------
    result : subclass of Index
        - CategoricalIndex for Categorical input.
        - DatetimeIndex for Datetime input.
        - GenericIndex for all other inputs.
    """
    # This function should probably be moved to Index.__new__
    if isinstance(arbitrary, Index):
        return arbitrary
    elif isinstance(arbitrary, NumericalColumn):
        return GenericIndex(arbitrary, name=name)
    elif isinstance(arbitrary, DatetimeColumn):
        return DatetimeIndex(arbitrary, name=name)
    elif isinstance(arbitrary, CategoricalColumn):
        return CategoricalIndex(arbitrary, name=name)
    else:
        name = None
        if hasattr(arbitrary, 'name'):
            name = arbitrary.name
        if len(arbitrary) == 0:
            return RangeIndex(0, 0, name=name)
        return as_index(columnops.as_column(arbitrary), name=name)


# Pass each index class to register_distributed_serializer.
# NOTE(review): presumably this makes the index types serializable for
# distributed execution -- confirm against the helper's definition.
register_distributed_serializer(RangeIndex)
register_distributed_serializer(GenericIndex)
register_distributed_serializer(DatetimeIndex)
register_distributed_serializer(CategoricalIndex)
Beispiel #3
0
            Whether to use the list of quantiles as index.

        Returns
        -------

        DataFrame

        """
        if not quant_index:
            return Series(self._column.quantile(q, interpolation, exact))
        else:
            return Series(self._column.quantile(q, interpolation, exact),
                          index=as_index(np.asarray(q)))


# Pass Series to register_distributed_serializer.
# NOTE(review): presumably enables distributed (de)serialization -- confirm.
register_distributed_serializer(Series)

# Maps an integer dtype name to a floating-point dtype name.
# NOTE(review): judging by the name, presumably used to choose the result
# dtype of true division on integer operands -- confirm at the use site.
truediv_int_dtype_corrections = {
    'int64': 'float64',
    'int32': 'float32',
    'int': 'float',
}


class DatetimeProperties(object):
    """Wrapper around a series that exposes datetime fields as properties.

    NOTE(review): ``get_dt_field`` is not defined in the visible part of
    this class; presumably it is defined further down -- confirm.
    """

    def __init__(self, series):
        # Keep a reference to the wrapped series.
        self.series = series

    @property
    def year(self):
        """Return the result of ``self.get_dt_field('year')``."""
        return self.get_dt_field('year')
Beispiel #4
0
        return out

    def is_contiguous(self):
        """Return whether ``self.mem`` reports itself as C-contiguous."""
        backing = self.mem
        return backing.is_c_contiguous()


class BufferSentryError(ValueError):
    """Raised when a buffer fails one of the ``_BufferSentry`` checks."""


class _BufferSentry(object):
    """Chainable validator for the properties of a buffer-like object.

    Each check raises ``BufferSentryError`` on failure and returns the
    sentry itself on success, so checks can be chained, e.g.
    ``_BufferSentry(buf).ndim(1).contig()``.

    Parameters
    ----------
    buf
        Object exposing ``dtype``, ``ndim`` and ``is_c_contiguous()``.
    """

    def __init__(self, buf):
        self._buf = buf

    def dtype(self, dtype):
        """Require the buffer's dtype to equal ``dtype``."""
        if self._buf.dtype != dtype:
            raise BufferSentryError('dtype mismatch')
        return self

    def ndim(self, ndim):
        """Require the buffer's number of dimensions to equal ``ndim``."""
        if self._buf.ndim != ndim:
            raise BufferSentryError('ndim mismatch')
        return self

    def contig(self):
        """Require the buffer to be C-contiguous."""
        if not self._buf.is_c_contiguous():
            raise BufferSentryError('non contiguous')
        # Return self like the other checks so that chaining works
        # regardless of the order in which the checks are called.
        return self


# Pass Buffer to register_distributed_serializer.
# NOTE(review): presumably enables distributed (de)serialization -- confirm.
register_distributed_serializer(Buffer)
Beispiel #5
0
def numeric_column_unaryop(operand, op, out_dtype):
    """Apply the unary operator ``op`` to ``operand``.

    An output column of dtype ``out_dtype`` is allocated with
    ``columnops.column_empty_like_same_mask``, filled by
    ``_gdf.apply_unaryop`` and returned as a ``NumericalColumn`` view.
    """
    result = columnops.column_empty_like_same_mask(operand, dtype=out_dtype)
    _gdf.apply_unaryop(op, operand, result)
    as_numerical = result.view(NumericalColumn, dtype=out_dtype)
    return as_numerical


def numeric_column_compare(lhs, rhs, op):
    """Run the comparison ``op`` on two columns with a boolean result dtype.

    Delegates to ``numeric_column_binop`` with ``out_dtype=np.bool_``.
    """
    bool_dtype = np.bool_
    return numeric_column_binop(lhs, rhs, op, out_dtype=bool_dtype)


def numeric_normalize_types(*args):
    """Cast every argument to one common dtype.

    The common dtype is picked by ``np.result_type`` from the ``dtype`` of
    each argument. Returns a list with ``astype(common)`` applied to each
    argument, in the original order.
    """
    common = np.result_type(*(item.dtype for item in args))
    converted = []
    for item in args:
        converted.append(item.astype(common))
    return converted


def column_hash_values(column0, *other_columns):
    """Hash the values of one or more columns.

    The result is a new ``NumericalColumn`` with one ``int32`` entry per
    element of ``column0``, filled in by ``_gdf.hash_columns``.
    """
    all_columns = [column0]
    all_columns.extend(other_columns)

    n_rows = len(column0)
    hash_mem = rmm.device_array(n_rows, dtype=np.int32)
    hash_buf = Buffer(hash_mem)
    hashed = NumericalColumn(data=hash_buf, dtype=hash_buf.dtype)

    _gdf.hash_columns(all_columns, hashed)
    return hashed


# Pass NumericalColumn to register_distributed_serializer.
# NOTE(review): presumably enables distributed (de)serialization -- confirm.
register_distributed_serializer(NumericalColumn)
Beispiel #6
0
        Convert from a Pandas MultiIndex

        Raises
        ------
        TypeError for invalid input type.

        Examples
        --------
        >>> import cudf
        >>> import pandas as pd
        >>> pmi = pd.MultiIndex(levels=[['a', 'b'], ['c', 'd']],
                                codes=[[0, 1], [1, ]])
        >>> cudf.from_pandas(pmi)
        MultiIndex( ... )
        """
        if not isinstance(multiindex, pd.MultiIndex):
            raise TypeError('not a pandas.MultiIndex')

        if hasattr(multiindex, 'codes'):
            mi = cls(levels=multiindex.levels,
                     codes=multiindex.codes,
                     names=multiindex.names)
        else:
            mi = cls(levels=multiindex.levels,
                     codes=multiindex.labels,
                     names=multiindex.names)
        return mi


# Pass MultiIndex to register_distributed_serializer.
# NOTE(review): presumably enables distributed (de)serialization -- confirm.
register_distributed_serializer(MultiIndex)
Beispiel #7
0
                                          outcols={'out1': np.int32,
                                                   'out2': np.int32},
                                          # threads per block
                                          tpb=8)

            print(result)

        Output:

        .. code-block:: python

               key  val out1 out2
            0    0    0    0    0
            1    0    1    0    1
            2    1    2    2    3
            3    1    3    3    4
            4    2    4    8    6
            5    2    5   10    7
            6    2    6   12    8

        """
        if not callable(function):
            raise TypeError("type {!r} is not callable", type(function))

        df, segs = self.as_df()
        kwargs.update({'chunks': segs})
        return df.apply_chunks(function, **kwargs)


# Pass Groupby to register_distributed_serializer.
# NOTE(review): presumably enables distributed (de)serialization -- confirm.
register_distributed_serializer(Groupby)
Beispiel #8
0
            return joined_index, indexers
        else:
            return joined_index


def pandas_categorical_as_column(categorical, codes=None):
    """Build a CategoricalColumn out of a pandas.Categorical.

    When ``codes`` is given it is used in place of ``categorical.codes``.
    Codes equal to ``-1`` are treated as invalid: if any are present, a
    mask and a null count are passed to the column constructor as well.
    """
    # TODO fix mutability issue in numba to avoid the .copy()
    if codes is None:
        codes = categorical.codes.copy()
    # TODO pending pandas to be improved
    #       https://github.com/pandas-dev/pandas/issues/14711
    #       https://github.com/pandas-dev/pandas/pull/16015
    is_valid = codes != -1
    column_kwargs = {
        'data': Buffer(codes),
        'categories': categorical.categories,
        'ordered': categorical.ordered,
    }
    if np.all(is_valid):
        # No invalid codes: the column needs neither mask nor null count.
        return CategoricalColumn(**column_kwargs)

    packed_mask = cudautils.compact_mask_bytes(is_valid)
    n_missing = codes.size - np.count_nonzero(is_valid)
    column_kwargs.update(mask=Buffer(packed_mask), null_count=n_missing)
    return CategoricalColumn(**column_kwargs)


# Pass CategoricalColumn to register_distributed_serializer.
# NOTE(review): presumably enables distributed (de)serialization -- confirm.
register_distributed_serializer(CategoricalColumn)
Beispiel #9
0
        -------
        begin, end : 2-tuple of int
            The starting index and the ending index.
            The *last* value occurs at ``end - 1`` position.
        """
        col = self._values
        begin, end = None, None
        if first is not None:
            begin = col.find_first_value(first)
        if last is not None:
            end = col.find_last_value(last)
            end += 1
        return begin, end


# Pass RangeIndex and GenericIndex to register_distributed_serializer.
# NOTE(review): presumably enables distributed (de)serialization -- confirm.
register_distributed_serializer(RangeIndex)
register_distributed_serializer(GenericIndex)


class DatetimeIndex(GenericIndex):
    # TODO this constructor should take a timezone or something to be
    # consistent with pandas
    def __new__(self, values, name=None):
        # we should be more strict on what we accept here but
        # we'd have to go and figure out all the semantics around
        # pandas dtindex creation first which.  For now
        # just make sure we handle np.datetime64 arrays
        # and then just dispatch upstream
        if isinstance(values, np.ndarray) and values.dtype.kind == 'M':
            values = DatetimeColumn.from_numpy(values)
        elif isinstance(values, pd.DatetimeIndex):