def scalar_broadcast_to(scalar, size, dtype=None):
    """
    Build a device column/array of length ``size`` in which every element
    equals ``scalar``.

    Parameters
    ----------
    scalar : scalar value, str, pd.Categorical, or None
        Value to repeat. ``None`` produces an all-null (masked) column.
    size : int, or tuple/list
        Output length; a tuple/list is collapsed to its first entry.
    dtype : optional
        Target dtype; inferred from ``scalar`` when omitted.

    Returns
    -------
    A string column for object dtype, a masked empty column for ``None``,
    otherwise an rmm device array filled with ``scalar``.
    """
    from cudf.utils.cudautils import fill_value
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    # Collapse a tuple/list shape to its leading dimension.
    if isinstance(size, (tuple, list)):
        size = size[0]

    # None broadcasts to an all-masked column.
    if scalar is None:
        if dtype is None:
            dtype = "object"
        return column_empty(size, dtype=dtype, masked=True)

    # NOTE(review): for a Categorical only the first category is broadcast —
    # presumably callers pass single-category values; confirm at call sites.
    if isinstance(scalar, pd.Categorical):
        return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)

    if isinstance(scalar, str) and (is_string_dtype(dtype) or dtype is None):
        dtype = "object"
    else:
        scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
        dtype = scalar.dtype

    # Numeric/non-string path: allocate once, then fill in place.
    if np.dtype(dtype) != np.dtype("object"):
        out = rmm.device_array((size,), dtype=dtype)
        if out.size != 0:
            fill_value(out, scalar)
        return out

    # String path: strings cannot live in a plain device array, so gather a
    # one-element string column ``size`` times via an all-zero gather map.
    import nvstrings
    from cudf.core.column import as_column
    from cudf.utils.cudautils import zeros

    gather_map = zeros(size, dtype="int32")
    single = as_column(nvstrings.to_device([scalar]))
    return single[gather_map]
def scalar_broadcast_to(scalar, shape, dtype):
    """
    Fill a device array of the given ``shape`` with ``scalar``.

    For object (string) dtype, a one-element string column is gathered
    ``shape[0]`` times instead, since strings cannot be stored in a plain
    device array.

    Parameters
    ----------
    scalar : value to repeat
    shape : int or tuple
        Output shape; a bare integer is treated as a 1-D length.
    dtype : target dtype of the result
    """
    from cudf.utils.cudautils import fill_value

    # Normalize a bare length to a 1-tuple.
    if not isinstance(shape, tuple):
        shape = (shape, )

    # Non-string path: allocate, then fill in place (skip empty arrays).
    if np.dtype(dtype) != np.dtype("object"):
        buf = rmm.device_array(shape, dtype=dtype)
        if buf.size != 0:
            fill_value(buf, scalar)
        return buf

    # String path: repeat a single-element string column via an
    # all-zero gather map.
    import nvstrings
    from cudf.dataframe.string import StringColumn
    from cudf.utils.cudautils import zeros

    gather_map = zeros(shape[0], dtype='int32')
    single = StringColumn(nvstrings.to_device([scalar]))
    return single[gather_map]
def _apply_agg(self, agg_type, result, add_col_values,
               ctx, val_columns, val_columns_out,
               sort_result=True):
    """
    Run one aggregation over the grouped columns via the libgdf cffi layer.

    Parameters
    ----------
    agg_type : str
        The aggregation function to run.
    result : DataFrame
        The DataFrame to store the result of the aggregation into.
    add_col_values : bool
        Boolean to indicate whether this is the first aggregation being
        run and should add the additional columns' values.
    ctx : gdf_context cffi object
        Context object to pass information such as if the dataframe
        is sorted and/or which method to use for grouping.
    val_columns : list of *str*
        The list of column names that the aggregation should be performed
        on.
    val_columns_out : list of *str*
        The list of columns names that the aggregation results should be
        output into.
    sort_result : bool, default True
        When True, sets ``ctx.flag_sort_result`` so the C layer sorts the
        grouped output.

    Returns
    -------
    result : DataFrame
        The same ``result`` object, populated with key columns (on the
        first run) and one output column per entry of ``val_columns``.
    """
    if sort_result:
        ctx.flag_sort_result = 1

    ncols = len(self._by)
    # Raw cffi column views for the group-by key columns.
    cols = [self._df[thisBy]._column.cffi_view for thisBy in self._by]

    first_run = add_col_values
    need_to_index = self._as_index

    col_count = 0
    # Accept a single column name/number as a one-element list.
    if isinstance(val_columns, (str, Number)):
        val_columns = [val_columns]

    for val_col in val_columns:
        col_agg = self._df[val_col]._column.cffi_view

        # assuming here that if there are multiple aggregations that the
        # aggregated results will be in the same order for GDF_SORT method
        if need_to_index:
            # Pre-allocate an int32 index output the C layer fills in.
            out_col_indices_series = Series(
                Buffer(
                    rmm.device_array(
                        col_agg.size,
                        dtype=np.int32
                    )
                )
            )
            out_col_indices = out_col_indices_series._column.cffi_view
        else:
            out_col_indices = ffi.NULL

        # Pre-allocate one output column per group-by key, sized to the
        # (worst-case) input length; the C layer reports the real length.
        out_col_values_series = []
        for i in range(0, ncols):
            if self._df[self._by[i]].dtype == np.dtype('object'):
                # This isn't ideal, but no better way to create an
                # nvstrings object of correct size
                gather_map = zeros(col_agg.size, dtype='int32')
                col = Series([''], dtype='str')[gather_map]\
                    .reset_index(drop=True)
            else:
                col = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=self._df[self._by[i]]._column.data.dtype
                        )
                    )
                )
            out_col_values_series.append(col)
        out_col_values = [
            out_col_values_series[i]._column.cffi_view
            for i in range(0, ncols)]

        # Output dtype depends on the aggregation: count -> int64,
        # mean -> float64, everything else keeps the input column's dtype.
        if agg_type == "count":
            out_col_agg_series = Series(
                Buffer(
                    rmm.device_array(
                        col_agg.size,
                        dtype=np.int64
                    )
                )
            )
        elif agg_type == "mean":
            out_col_agg_series = Series(
                Buffer(
                    rmm.device_array(
                        col_agg.size,
                        dtype=np.float64
                    )
                )
            )
        else:
            if self._df[val_col].dtype == np.dtype('object'):
                # This isn't ideal, but no better way to create an
                # nvstrings object of correct size
                gather_map = zeros(col_agg.size, dtype='int32')
                out_col_agg_series = Series(
                    [''],
                    dtype='str'
                )[gather_map].reset_index(drop=True)
            else:
                out_col_agg_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=self._df[val_col]._column.data.dtype
                        )
                    )
                )

        out_col_agg = out_col_agg_series._column.cffi_view

        # Dispatch table of cffi aggregator entry points.
        agg_func = self._NAMED_FUNCTIONS.get(agg_type, None)
        if agg_func is None:
            raise RuntimeError(
                "ERROR: this aggregator has not been implemented yet")
        err = agg_func(
            ncols,
            cols,
            col_agg,
            out_col_indices,
            out_col_values,
            out_col_agg,
            ctx)

        if (err is not None):
            raise RuntimeError(err)

        num_row_results = out_col_agg.size

        # NVStrings columns are not the same going in as coming out but we
        # can't create entire CFFI views otherwise multiple objects will
        # try to free the memory
        for i, col in enumerate(out_col_values_series):
            if col.dtype == np.dtype("object") and len(col) > 0:
                import nvcategory
                # Rebind the category pointer the C layer wrote into the
                # output view; 0/NULL means no strings were produced.
                nvcat_ptr = int(
                    ffi.cast(
                        "uintptr_t",
                        out_col_values[i].dtype_info.category
                    )
                )
                nvcat_obj = None
                if nvcat_ptr:
                    nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                    nvstr_obj = nvcat_obj.to_strings()
                else:
                    import nvstrings
                    nvstr_obj = nvstrings.to_device([])
                out_col_values_series[i]._column._data = nvstr_obj
                out_col_values_series[i]._column._nvcategory = nvcat_obj

        # Same rebinding for the aggregated-value column when it is a
        # string column.
        if out_col_agg_series.dtype == np.dtype("object") and \
                len(out_col_agg_series) > 0:
            import nvcategory
            nvcat_ptr = int(
                ffi.cast(
                    "uintptr_t",
                    out_col_agg.dtype_info.category
                )
            )
            nvcat_obj = None
            if nvcat_ptr:
                nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                nvstr_obj = nvcat_obj.to_strings()
            else:
                import nvstrings
                nvstr_obj = nvstrings.to_device([])
            out_col_agg_series._column._data = nvstr_obj
            out_col_agg_series._column._nvcategory = nvcat_obj

        if first_run:
            # First aggregation populates the group-by key columns,
            # trimmed to the number of rows the C layer actually produced.
            for i, thisBy in enumerate(self._by):
                result[thisBy] = out_col_values_series[i][
                    :num_row_results]

                if is_categorical_dtype(self._df[thisBy].dtype):
                    result[thisBy] = CategoricalColumn(
                        data=result[thisBy].data,
                        categories=self._df[thisBy].cat.categories,
                        ordered=self._df[thisBy].cat.ordered
                    )

        # Shrink the over-allocated buffer to the real result length
        # (string columns have no resizable .data buffer to shrink).
        if out_col_agg_series.dtype != np.dtype("object"):
            out_col_agg_series.data.size = num_row_results
        out_col_agg_series = out_col_agg_series.reset_index(drop=True)

        if isinstance(val_columns_out, (str, Number)):
            result[val_columns_out] = out_col_agg_series[:num_row_results]
        else:
            result[val_columns_out[col_count]
                   ] = out_col_agg_series[:num_row_results]

        # NOTE(review): this resize/reset pair repeats the one above —
        # looks redundant; confirm before removing.
        if out_col_agg_series.dtype != np.dtype("object"):
            out_col_agg_series.data.size = num_row_results
        out_col_agg_series = out_col_agg_series.reset_index(drop=True)

        first_run = False
        col_count = col_count + 1

    return result