Ejemplo n.º 1
0
Archivo: utils.py Proyecto: rongou/cudf
def scalar_broadcast_to(scalar, size, dtype=None):

    if isinstance(size, (tuple, list)):
        size = size[0]

    if cudf._lib.scalar._is_null_host_scalar(scalar):
        if dtype is None:
            dtype = "object"
        return column.column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        if dtype is None:
            return _categorical_scalar_broadcast_to(scalar, size)
        else:
            return scalar_broadcast_to(scalar.categories[0],
                                       size).astype(dtype)

    if isinstance(scalar, decimal.Decimal):
        if dtype is None:
            dtype = cudf.Decimal128Dtype._from_decimal(scalar)

        out_col = column.column_empty(size, dtype=dtype)
        if out_col.size != 0:
            out_col[:] = scalar
        return out_col

    scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
    dtype = scalar.dtype

    return cudf.core.column.full(size=size, fill_value=scalar, dtype=dtype)
Ejemplo n.º 2
0
def scalar_broadcast_to(scalar, size, dtype=None):

    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None or (isinstance(scalar, (np.datetime64, np.timedelta64))
                          and np.isnat(scalar)):
        if dtype is None:
            dtype = "object"
        return column.column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        if dtype is None:
            return _categorical_scalar_broadcast_to(scalar, size)
        else:
            return scalar_broadcast_to(scalar.categories[0],
                                       size).astype(dtype)

    scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
    dtype = scalar.dtype

    if np.dtype(dtype).kind in ("O", "U"):
        gather_map = column.full(size, 0, dtype="int32")
        scalar_str_col = column.as_column([scalar], dtype="str")
        return scalar_str_col[gather_map]
    else:
        out_col = column.column_empty(size, dtype=dtype)
        if out_col.size != 0:
            out_col.data_array_view[:] = scalar
        return out_col
Ejemplo n.º 3
0
def scalar_broadcast_to(scalar, size, dtype=None):
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None:
        if dtype is None:
            dtype = "object"
        return column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)

    if isinstance(scalar, str) and (is_string_dtype(dtype) or dtype is None):
        dtype = "object"
    else:
        scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
        dtype = scalar.dtype

    if np.dtype(dtype) == np.dtype("object"):
        from cudf.core.column import as_column

        gather_map = cupy.zeros(size, dtype="int32")
        scalar_str_col = as_column([scalar], dtype="str")
        return scalar_str_col[gather_map]
    else:
        out_col = column_empty(size, dtype=dtype)
        if out_col.size != 0:
            out_col.data_array_view[:] = scalar
        return out_col
Ejemplo n.º 4
0
def scalar_broadcast_to(scalar, size, dtype=None):
    from cudf.utils.cudautils import fill_value
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None:
        if dtype is None:
            dtype = "object"
        return column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)

    if isinstance(scalar, str) and (is_string_dtype(dtype) or dtype is None):
        dtype = "object"
    else:
        scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
        dtype = scalar.dtype

    if np.dtype(dtype) == np.dtype("object"):
        import nvstrings
        from cudf.core.column import as_column
        from cudf.utils.cudautils import zeros

        gather_map = zeros(size, dtype="int32")
        scalar_str_col = as_column(nvstrings.to_device([scalar]))
        return scalar_str_col[gather_map]
    else:
        da = rmm.device_array((size, ), dtype=dtype)
        if da.size != 0:
            fill_value(da, scalar)
        return da
Ejemplo n.º 5
0
def query_execute(df, expr, callenv):
    """Compile & execute the query expression

    Note: the expression is compiled and cached for future reuse.

    Parameters
    ----------
    df : DataFrame
    expr : str
        boolean expression
    callenv : dict
        Contains keys 'local_dict', 'locals' and 'globals' which are all dict.
        They represent the arg, local and global dictionaries of the caller.
    """

    # compile
    compiled = query_compile(expr)
    columns = compiled["colnames"]

    # prepare col args
    colarrays = [cudf.core.dataframe.extract_col(df, col) for col in columns]

    # wait to check the types until we know which cols are used
    if any(col.dtype not in SUPPORTED_QUERY_TYPES for col in colarrays):
        raise TypeError(
            "query only supports numeric, datetime, timedelta, "
            "or bool dtypes."
        )

    colarrays = [col.data_array_view for col in colarrays]

    kernel = compiled["kernel"]
    # process env args
    envargs = []
    envdict = callenv["globals"].copy()
    envdict.update(callenv["locals"])
    envdict.update(callenv["local_dict"])
    for name in compiled["refnames"]:
        name = name[len(ENVREF_PREFIX) :]
        try:
            val = envdict[name]
            if isinstance(val, dt.datetime):
                val = np.datetime64(val)
        except KeyError:
            msg = "{!r} not defined in the calling environment"
            raise NameError(msg.format(name))
        else:
            envargs.append(val)

    # allocate output buffer
    nrows = len(df)
    out = column_empty(nrows, dtype=np.bool_)
    # run kernel
    args = [out] + colarrays + envargs
    kernel.forall(nrows)(*args)
    out_mask = applyutils.make_aggregate_nullmask(df, columns=columns)
    return out.set_mask(out_mask).fillna(False)
Ejemplo n.º 6
0
def column_hash_values(column0, *other_columns, initial_hash_values=None):
    """Hash all values in the given columns.
    Returns a new NumericalColumn[int32]
    """
    from cudf.core.column import column_empty

    columns = [column0] + list(other_columns)
    result = column_empty(len(column0), dtype=np.int32, masked=False)
    if initial_hash_values:
        initial_hash_values = rmm.to_device(initial_hash_values)
    libcudf.hash.hash_columns(columns, result, initial_hash_values)
    return result
Ejemplo n.º 7
0
def scalar_broadcast_to(scalar, size, dtype=None):

    if isinstance(size, (tuple, list)):
        size = size[0]

    if cudf._lib.scalar._is_null_host_scalar(scalar):
        if dtype is None:
            dtype = "object"
        return column.column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        if dtype is None:
            return _categorical_scalar_broadcast_to(scalar, size)
        else:
            return scalar_broadcast_to(scalar.categories[0],
                                       size).astype(dtype)

    if isinstance(scalar, decimal.Decimal):
        if dtype is None:
            dtype = cudf.Decimal64Dtype._from_decimal(scalar)

        out_col = column.column_empty(size, dtype=dtype)
        if out_col.size != 0:
            out_col[:] = scalar
        return out_col

    scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
    dtype = scalar.dtype

    if cudf.dtype(dtype).kind in ("O", "U"):
        gather_map = column.full(size, 0, dtype="int32")
        scalar_str_col = column.as_column([scalar], dtype="str")
        return scalar_str_col[gather_map]
    else:
        out_col = column.column_empty(size, dtype=dtype)
        if out_col.size != 0:
            out_col.data_array_view[:] = scalar
        return out_col
Ejemplo n.º 8
0
def get_null_series(size, dtype=np.bool_):
    """
    Creates a null series of provided dtype and size

    Parameters
    ----------
    size:  length of series
    dtype: dtype of series to create; defaults to bool.

    Returns
    -------
    a null cudf series of provided `size` and `dtype`
    """

    empty_col = column.column_empty(size, dtype, True)
    return cudf.Series(empty_col)
Ejemplo n.º 9
0
def query_execute(df, expr, callenv):
    """Compile & execute the query expression

    Note: the expression is compiled and cached for future reuse.

    Parameters
    ----------
    df : DataFrame
    expr : str
        boolean expression
    callenv : dict
        Contains keys 'local_dict', 'locals' and 'globals' which are all dict.
        They represent the arg, local and global dictionaries of the caller.
    """
    # compile
    compiled = query_compile(expr)
    kernel = compiled["kernel"]
    # process env args
    envargs = []
    envdict = callenv["globals"].copy()
    envdict.update(callenv["locals"])
    envdict.update(callenv["local_dict"])
    for name in compiled["refnames"]:
        name = name[len(ENVREF_PREFIX):]
        try:
            val = envdict[name]
            if isinstance(val, dt.datetime):
                val = np.datetime64(val)
        except KeyError:
            msg = "{!r} not defined in the calling environment"
            raise NameError(msg.format(name))
        else:
            envargs.append(val)
    columns = compiled["colnames"]
    # prepare col args
    colarrays = [df[col]._column.data_array_view for col in columns]
    # allocate output buffer
    nrows = len(df)
    out = column_empty(nrows, dtype=np.bool_)
    # run kernel
    args = [out] + colarrays + envargs
    kernel.forall(nrows)(*args)
    out_mask = applyutils.make_aggregate_nullmask(df, columns=columns)
    return out.set_mask(out_mask).fillna(False)
Ejemplo n.º 10
0
    def applymap(
        self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None
    ) -> ColumnBase:
        """Apply an element-wise function to transform the values in the Column.

        Parameters
        ----------
        udf : function
            Wrapped by numba jit for call on the GPU as a device function.
        out_dtype  : numpy.dtype; optional
            The dtype for use in the output.
            By default, use the same dtype as *self.dtype*.

        Returns
        -------
        result : Column
            The mask is preserved.
        """
        if out_dtype is None:
            out_dtype = self.dtype

        core = njit(udf)

        # For non-masked columns
        @cuda.jit
        def kernel_applymap(values, results):
            i = cuda.grid(1)
            # in range?
            if i < values.size:
                # call udf
                results[i] = core(values[i])

        results = column_empty(self.size, dtype=out_dtype)
        values = self.data_array_view
        kernel_applymap.forall(self.size)(values, results)
        return as_column(results)
Ejemplo n.º 11
0
    def size(self):
        from cudf.core.column import column_empty

        nrows = len(self._groupby.obj)
        data = cudf.Series(column_empty(nrows, "int8", masked=False))
        return data.groupby(self._groupby.key_columns).count()
Ejemplo n.º 12
0
    def _getitem_tuple_arg(self, arg):
        from cudf import MultiIndex
        from cudf.core.dataframe import DataFrame, Series
        from cudf.core.column import column_empty
        from cudf.core.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        columns = self._get_column_selection(arg[1])
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
            if (len(columns_df) == 0 and len(columns_df.columns) == 0
                    and not isinstance(arg[0], slice)):
                result = Series(column_empty(0, dtype="float64"), name=arg[0])
                result._index = columns_df.columns.copy(deep=False)
                return result
        else:
            if isinstance(arg[0], slice):
                columns_df = DataFrame()
                for i, col in enumerate(columns):
                    columns_df.insert(i, col, self._df[col])
                columns_df._index = self._df._index
            else:
                columns_df = self._df._columns_view(columns)

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (isinstance(
                    arg[0], slice) or isinstance(arg[1], slice)):
                # Pandas returns a numpy scalar in this case
                return df[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            df = DataFrame()
            for i, col in enumerate(columns_df._columns):
                # need Series() in case a scalar is returned
                df[i] = Series(col[arg[0]])

            df.index = as_index(columns_df.index[arg[0]])
            df.columns = columns_df.columns

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = 0
                df.index = as_index(self._df.index[start])
            else:
                df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            if isinstance(df.columns, MultiIndex):
                if len(df) > 0 and not (isinstance(arg[0], slice)
                                        or isinstance(arg[1], slice)):
                    return list(df._data.values())[0][0]
                elif df.shape[1] > 1:
                    result = self._downcast_to_series(df, arg)
                    result.index = df.columns
                    return result
                elif not isinstance(arg[0], slice):
                    if len(df._data) == 0:
                        return Series(
                            column_empty(0, dtype="float64"),
                            index=df.columns,
                            name=arg[0],
                        )
                    else:
                        result_series = df[df.columns[0]]
                        result_series.index = df.columns
                        result_series.name = arg[0]
                        return result_series
                else:
                    return df[df.columns[0]]
            return self._downcast_to_series(df, arg)
        if df.shape[0] == 0 and df.shape[1] == 0:
            from cudf.core.index import RangeIndex

            slice_len = arg[0].stop or len(self._df)
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df