Example #1
# Module-level imports assumed by the original cudf source:
import numpy as np
import pandas as pd
import rmm


def scalar_broadcast_to(scalar, size, dtype=None):
    from cudf.utils.cudautils import fill_value
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None:
        if dtype is None:
            dtype = "object"
        return column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)

    if isinstance(scalar, str) and (is_string_dtype(dtype) or dtype is None):
        dtype = "object"
    else:
        scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
        dtype = scalar.dtype

    if np.dtype(dtype) == np.dtype("object"):
        import nvstrings
        from cudf.core.column import as_column
        from cudf.utils.cudautils import zeros

        # Broadcast the string scalar by gathering a one-element string
        # column `size` times with an all-zeros gather map.
        gather_map = zeros(size, dtype="int32")
        scalar_str_col = as_column(nvstrings.to_device([scalar]))
        return scalar_str_col[gather_map]
    else:
        # Numeric path: allocate a device array and fill it with the scalar.
        da = rmm.device_array((size, ), dtype=dtype)
        if da.size != 0:
            fill_value(da, scalar)
        return da
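A minimal usage sketch (hypothetical; assumes a CUDA-capable environment with the legacy cudf stack, rmm, and nvstrings installed):

numeric_col = scalar_broadcast_to(3, 5, dtype="int32")     # device array of five 3s
string_col = scalar_broadcast_to("x", 4)                   # string column of four "x" values
null_col = scalar_broadcast_to(None, 4, dtype="float64")   # all-null column via column_empty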
Example #2
# Module-level imports assumed by the original cudf source:
import numpy as np
import rmm


def scalar_broadcast_to(scalar, shape, dtype):
    from cudf.utils.cudautils import fill_value

    if not isinstance(shape, tuple):
        shape = (shape, )

    if np.dtype(dtype) == np.dtype("object"):
        import nvstrings
        from cudf.dataframe.string import StringColumn
        from cudf.utils.cudautils import zeros

        # Broadcast the string scalar by gathering a one-element string
        # column shape[0] times with an all-zeros gather map.
        gather_map = zeros(shape[0], dtype='int32')
        scalar_str_col = StringColumn(nvstrings.to_device([scalar]))
        return scalar_str_col[gather_map]
    else:
        da = rmm.device_array(shape, dtype=dtype)
        if da.size != 0:
            fill_value(da, scalar)
        return da
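An earlier cudf variant of the same helper; a hypothetical call, assuming the legacy cudf.dataframe string stack is importable:

da = scalar_broadcast_to(1.5, (3,), dtype="float64")   # device array of three 1.5s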
Example #3
    # Excerpted from cudf's legacy groupby implementation; module-level names
    # such as np, rmm, ffi, Series, Buffer, zeros, Number, is_categorical_dtype,
    # and CategoricalColumn are assumed to be imported elsewhere in the module.
    def _apply_agg(self, agg_type, result, add_col_values,
                   ctx, val_columns, val_columns_out, sort_result=True):
        """
        Parameters
        ----------
        agg_type : str
            The aggregation function to run.
        result : DataFrame
            The DataFrame to store the result of the aggregation into.
        add_col_values : bool
            Boolean to indicate whether this is the first aggregation being
            run and should add the additional columns' values.
        ctx : gdf_context cffi object
            Context object to pass information such as if the dataframe
            is sorted and/or which method to use for grouping.
        val_columns : list of str
            The list of column names that the aggregation should be performed
            on.
        val_columns_out : list of str
            The list of column names that the aggregation results should be
            output into.
        sort_result : bool
            Whether the grouped results should be sorted (sets
            ``ctx.flag_sort_result``).
        """

        if sort_result:
            ctx.flag_sort_result = 1

        ncols = len(self._by)
        cols = [self._df[thisBy]._column.cffi_view for thisBy in self._by]

        first_run = add_col_values
        need_to_index = self._as_index

        col_count = 0
        if isinstance(val_columns, (str, Number)):
            val_columns = [val_columns]
        for val_col in val_columns:
            col_agg = self._df[val_col]._column.cffi_view

            # Assume that, if there are multiple aggregations, the aggregated
            # results will be in the same order for the GDF_SORT method.
            if need_to_index:
                out_col_indices_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.int32
                        )
                    )
                )
                out_col_indices = out_col_indices_series._column.cffi_view
            else:
                out_col_indices = ffi.NULL

            out_col_values_series = []
            for i in range(0, ncols):
                if self._df[self._by[i]].dtype == np.dtype('object'):
                    # This isn't ideal, but no better way to create an
                    # nvstrings object of correct size
                    gather_map = zeros(col_agg.size, dtype='int32')
                    col = Series([''], dtype='str')[gather_map]\
                        .reset_index(drop=True)
                else:
                    col = Series(
                        Buffer(
                            rmm.device_array(
                                col_agg.size,
                                dtype=self._df[self._by[i]]._column.data.dtype
                            )
                        )
                    )
                out_col_values_series.append(col)
            out_col_values = [
                out_col_values_series[i]._column.cffi_view
                for i in range(0, ncols)]

            if agg_type == "count":
                out_col_agg_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.int64
                        )
                    )
                )
            elif agg_type == "mean":
                out_col_agg_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.float64
                        )
                    )
                )
            else:
                if self._df[val_col].dtype == np.dtype('object'):
                    # This isn't ideal, but no better way to create an
                    # nvstrings object of correct size
                    gather_map = zeros(col_agg.size, dtype='int32')
                    out_col_agg_series = Series(
                        [''],
                        dtype='str'
                    )[gather_map].reset_index(drop=True)
                else:
                    out_col_agg_series = Series(
                        Buffer(
                            rmm.device_array(
                                col_agg.size,
                                dtype=self._df[val_col]._column.data.dtype
                            )
                        )
                    )

            out_col_agg = out_col_agg_series._column.cffi_view

            agg_func = self._NAMED_FUNCTIONS.get(agg_type, None)
            if agg_func is None:
                raise RuntimeError(
                    "ERROR: this aggregator has not been implemented yet")

            err = agg_func(
                ncols,
                cols,
                col_agg,
                out_col_indices,
                out_col_values,
                out_col_agg,
                ctx)

            if err is not None:
                raise RuntimeError(err)

            num_row_results = out_col_agg.size

            # NVStrings columns are not the same going in as coming out, but we
            # can't create entirely new CFFI views or multiple objects will try
            # to free the same memory.
            for i, col in enumerate(out_col_values_series):
                if col.dtype == np.dtype("object") and len(col) > 0:
                    import nvcategory
                    nvcat_ptr = int(
                        ffi.cast(
                            "uintptr_t",
                            out_col_values[i].dtype_info.category
                        )
                    )
                    nvcat_obj = None
                    if nvcat_ptr:
                        nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                        nvstr_obj = nvcat_obj.to_strings()
                    else:
                        import nvstrings
                        nvstr_obj = nvstrings.to_device([])
                    out_col_values_series[i]._column._data = nvstr_obj
                    out_col_values_series[i]._column._nvcategory = nvcat_obj
            if out_col_agg_series.dtype == np.dtype("object") and \
                    len(out_col_agg_series) > 0:
                import nvcategory
                nvcat_ptr = int(
                    ffi.cast(
                        "uintptr_t",
                        out_col_agg.dtype_info.category
                    )
                )
                nvcat_obj = None
                if nvcat_ptr:
                    nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                    nvstr_obj = nvcat_obj.to_strings()
                else:
                    import nvstrings
                    nvstr_obj = nvstrings.to_device([])
                out_col_agg_series._column._data = nvstr_obj
                out_col_agg_series._column._nvcategory = nvcat_obj

            if first_run:
                for i, thisBy in enumerate(self._by):
                    result[thisBy] = out_col_values_series[i][
                        :num_row_results]

                    if is_categorical_dtype(self._df[thisBy].dtype):
                        result[thisBy] = CategoricalColumn(
                            data=result[thisBy].data,
                            categories=self._df[thisBy].cat.categories,
                            ordered=self._df[thisBy].cat.ordered
                        )

            if out_col_agg_series.dtype != np.dtype("object"):
                out_col_agg_series.data.size = num_row_results
            out_col_agg_series = out_col_agg_series.reset_index(drop=True)

            if isinstance(val_columns_out, (str, Number)):
                result[val_columns_out] = out_col_agg_series[:num_row_results]
            else:
                result[val_columns_out[col_count]] = \
                    out_col_agg_series[:num_row_results]

            first_run = False
            col_count = col_count + 1

        return result
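A heavily hedged sketch of how a caller inside the same groupby class might drive this method; everything other than _apply_agg itself (the mean wrapper, _get_context, _val_columns) is an illustrative assumption, not the library's confirmed API:

    def mean(self):
        # Hypothetical wrapper: build an output DataFrame and a gdf_context,
        # then delegate to _apply_agg with the "mean" aggregator.
        result = DataFrame()
        ctx = self._get_context()  # assumed helper returning a gdf_context cffi object
        return self._apply_agg(
            "mean", result, add_col_values=True, ctx=ctx,
            val_columns=self._val_columns,
            val_columns_out=self._val_columns,
        )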