def scalar_broadcast_to(scalar, size, dtype=None):
    """Build a column of ``size`` rows in which every row holds ``scalar``.

    Parameters
    ----------
    scalar : object
        Value to repeat. Null host scalars, ``pd.Categorical`` and
        ``decimal.Decimal`` values each take a dedicated path; anything
        else is converted with ``to_cudf_compatible_scalar``.
    size : int, tuple or list
        Number of rows. For a tuple or list only the first element is used.
    dtype : optional
        Requested dtype. When omitted it is ``"object"`` for nulls, derived
        from the value for decimals, and taken from the converted scalar
        otherwise.

    Returns
    -------
    The column produced by whichever construction path was selected.
    """
    # A shape-like ``size`` contributes only its leading entry.
    n_rows = size[0] if isinstance(size, (tuple, list)) else size

    # Null scalar: a masked empty column, defaulting to the object dtype.
    if cudf._lib.scalar._is_null_host_scalar(scalar):
        null_dtype = "object" if dtype is None else dtype
        return column.column_empty(n_rows, dtype=null_dtype, masked=True)

    # Categorical: with an explicit dtype, broadcast the first category and
    # cast the result; otherwise delegate to the categorical helper.
    if isinstance(scalar, pd.Categorical):
        if dtype is not None:
            first_category = scalar.categories[0]
            return scalar_broadcast_to(first_category, n_rows).astype(dtype)
        return _categorical_scalar_broadcast_to(scalar, n_rows)

    # Decimal: allocate first, then assign only when there are rows to fill.
    if isinstance(scalar, decimal.Decimal):
        decimal_dtype = (
            cudf.Decimal128Dtype._from_decimal(scalar)
            if dtype is None
            else dtype
        )
        filled = column.column_empty(n_rows, dtype=decimal_dtype)
        if filled.size != 0:
            filled[:] = scalar
        return filled

    # Everything else: the converted scalar dictates the output dtype.
    compat_scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
    return cudf.core.column.full(
        size=n_rows, fill_value=compat_scalar, dtype=compat_scalar.dtype
    )
def scalar_broadcast_to(scalar, size, dtype=None):
    """Build a column of ``size`` rows in which every row holds ``scalar``.

    Parameters
    ----------
    scalar : object
        Value to repeat. ``None`` and NaT-valued ``np.datetime64`` /
        ``np.timedelta64`` are treated as null; ``pd.Categorical`` has its
        own path; anything else is converted with
        ``to_cudf_compatible_scalar``.
    size : int, tuple or list
        Number of rows. For a tuple or list only the first element is used.
    dtype : optional
        Requested dtype; ``"object"`` is used for nulls when omitted,
        otherwise the converted scalar's dtype is used.

    Returns
    -------
    The column produced by whichever construction path was selected.
    """
    # A shape-like ``size`` contributes only its leading entry.
    n_rows = size[0] if isinstance(size, (tuple, list)) else size

    # ``np.isnat`` is consulted only for datetime64/timedelta64 values.
    is_nat = isinstance(
        scalar, (np.datetime64, np.timedelta64)
    ) and np.isnat(scalar)
    if scalar is None or is_nat:
        null_dtype = "object" if dtype is None else dtype
        return column.column_empty(n_rows, dtype=null_dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        if dtype is not None:
            first_category = scalar.categories[0]
            return scalar_broadcast_to(first_category, n_rows).astype(dtype)
        return _categorical_scalar_broadcast_to(scalar, n_rows)

    compat_scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
    result_dtype = compat_scalar.dtype

    if np.dtype(result_dtype).kind not in ("O", "U"):
        # Fixed-width path: allocate, then write through the data view
        # unless the column has no rows.
        filled = column.column_empty(n_rows, dtype=result_dtype)
        if filled.size != 0:
            filled.data_array_view[:] = compat_scalar
        return filled

    # Object/unicode path: index a one-element string column with an
    # all-zero int32 map so the single value is repeated ``n_rows`` times.
    zero_indices = column.full(n_rows, 0, dtype="int32")
    single_value = column.as_column([compat_scalar], dtype="str")
    return single_value[zero_indices]
def scalar_broadcast_to(scalar, size, dtype=None):
    """Build a column of ``size`` rows in which every row holds ``scalar``.

    Parameters
    ----------
    scalar : object
        Value to repeat. ``None`` yields a masked empty column; a
        ``pd.Categorical`` is handled by broadcasting its first category;
        a ``str`` with a string (or missing) dtype is kept as-is; anything
        else is converted with ``to_cudf_compatible_scalar``.
    size : int, tuple or list
        Number of rows. For a tuple or list only the first element is used.
    dtype : optional
        Requested dtype.

    Returns
    -------
    The column produced by whichever construction path was selected.
    """
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    # A shape-like ``size`` contributes only its leading entry.
    n_rows = size[0] if isinstance(size, (tuple, list)) else size

    if scalar is None:
        null_dtype = "object" if dtype is None else dtype
        return column_empty(n_rows, dtype=null_dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        first_category = scalar.categories[0]
        return scalar_broadcast_to(first_category, n_rows).astype(dtype)

    # Plain strings skip scalar conversion when the target is a string
    # dtype or unspecified; every other value is converted and dictates
    # the resulting dtype.
    keep_as_str = isinstance(scalar, str) and (
        is_string_dtype(dtype) or dtype is None
    )
    if keep_as_str:
        value, result_dtype = scalar, "object"
    else:
        value = to_cudf_compatible_scalar(scalar, dtype=dtype)
        result_dtype = value.dtype

    if np.dtype(result_dtype) != np.dtype("object"):
        # Fixed-width path: allocate, then write through the data view
        # unless the column has no rows.
        filled = column_empty(n_rows, dtype=result_dtype)
        if filled.size != 0:
            filled.data_array_view[:] = value
        return filled

    # Object path: index a one-element string column with an all-zero
    # int32 map so the single value is repeated ``n_rows`` times.
    from cudf.core.column import as_column

    zero_indices = cupy.zeros(n_rows, dtype="int32")
    single_value = as_column([value], dtype="str")
    return single_value[zero_indices]
def scalar_broadcast_to(scalar, size, dtype=None):
    """Repeat ``scalar`` ``size`` times on the device.

    Parameters
    ----------
    scalar : object
        Value to repeat. ``None`` yields a masked empty column; a
        ``pd.Categorical`` is handled by broadcasting its first category;
        a ``str`` with a string (or missing) dtype is kept as-is; anything
        else is converted with ``to_cudf_compatible_scalar``.
    size : int, tuple or list
        Number of elements. For a tuple or list only the first element is
        used.
    dtype : optional
        Requested dtype.

    Returns
    -------
    For object dtype, the result of indexing a one-element string column
    with an all-zero gather map; for ``None``, a masked empty column;
    otherwise the ``rmm.device_array`` that was allocated and filled.
    """
    from cudf.utils.cudautils import fill_value
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    # A shape-like ``size`` contributes only its leading entry.
    n_rows = size[0] if isinstance(size, (tuple, list)) else size

    if scalar is None:
        null_dtype = "object" if dtype is None else dtype
        return column_empty(n_rows, dtype=null_dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        first_category = scalar.categories[0]
        return scalar_broadcast_to(first_category, n_rows).astype(dtype)

    # Plain strings skip scalar conversion when the target is a string
    # dtype or unspecified; every other value is converted and dictates
    # the resulting dtype.
    keep_as_str = isinstance(scalar, str) and (
        is_string_dtype(dtype) or dtype is None
    )
    if keep_as_str:
        value, result_dtype = scalar, "object"
    else:
        value = to_cudf_compatible_scalar(scalar, dtype=dtype)
        result_dtype = value.dtype

    if np.dtype(result_dtype) != np.dtype("object"):
        # Fixed-width path: allocate a device array and fill it unless it
        # has no elements.
        device_buffer = rmm.device_array((n_rows,), dtype=result_dtype)
        if device_buffer.size != 0:
            fill_value(device_buffer, value)
        return device_buffer

    # Object path: index a one-element string column with an all-zero
    # int32 map so the single value is repeated ``n_rows`` times.
    import nvstrings
    from cudf.core.column import as_column
    from cudf.utils.cudautils import zeros

    zero_indices = zeros(n_rows, dtype="int32")
    single_value = as_column(nvstrings.to_device([value]))
    return single_value[zero_indices]
def query_execute(df, expr, callenv):
    """Compile a query expression and run it against ``df``.

    Note: the expression is compiled and cached for future reuse.

    Parameters
    ----------
    df : DataFrame
    expr : str
        boolean expression
    callenv : dict
        Contains keys 'local_dict', 'locals' and 'globals' which are all
        dict. They represent the arg, local and global dictionaries of
        the caller.

    Raises
    ------
    TypeError
        If any column used by the expression has a dtype outside
        ``SUPPORTED_QUERY_TYPES``.
    NameError
        If the expression references a name missing from the calling
        environment.
    """
    compiled = query_compile(expr)
    columns = compiled["colnames"]

    # Pull out only the columns the expression uses, and validate dtypes
    # now that we know which ones those are.
    selected = [cudf.core.dataframe.extract_col(df, col) for col in columns]
    for col in selected:
        if col.dtype not in SUPPORTED_QUERY_TYPES:
            raise TypeError(
                "query only supports numeric, datetime, timedelta, "
                "or bool dtypes."
            )
    colarrays = [col.data_array_view for col in selected]

    kernel = compiled["kernel"]

    # Layer the caller's scopes; later updates override earlier ones, so
    # the precedence is local_dict over locals over globals.
    envdict = callenv["globals"].copy()
    for scope in ("locals", "local_dict"):
        envdict.update(callenv[scope])

    envargs = []
    for ref in compiled["refnames"]:
        key = ref[len(ENVREF_PREFIX):]
        try:
            value = envdict[key]
        except KeyError:
            raise NameError(
                "{!r} not defined in the calling environment".format(key)
            )
        # Python datetimes are passed to the kernel as numpy datetime64.
        if isinstance(value, dt.datetime):
            value = np.datetime64(value)
        envargs.append(value)

    # One boolean output slot per row; the kernel receives the output
    # first, then the column views, then the environment values.
    nrows = len(df)
    out = column_empty(nrows, dtype=np.bool_)
    kernel.forall(nrows)(*[out, *colarrays, *envargs])

    # Apply the aggregate null mask of the used columns, then turn nulls
    # into False.
    out_mask = applyutils.make_aggregate_nullmask(df, columns=columns)
    masked_out = out.set_mask(out_mask)
    return masked_out.fillna(False)
def column_hash_values(column0, *other_columns, initial_hash_values=None):
    """Hash all values in the given columns.

    Parameters
    ----------
    column0 : column
        First column to hash; its length sets the length of the result.
    *other_columns : column
        Additional columns hashed together with ``column0``.
    initial_hash_values : sequence, optional
        Initial hash values. When given and non-empty they are copied to
        the device with ``rmm.to_device`` before hashing. ``None`` and
        empty sequences are forwarded unchanged.

    Returns
    -------
    A new NumericalColumn[int32]
    """
    from cudf.core.column import column_empty

    columns = [column0, *other_columns]
    result = column_empty(len(column0), dtype=np.int32, masked=False)

    # Test for presence explicitly instead of relying on truthiness:
    # ``bool()`` of a multi-element NumPy array raises ValueError, and a
    # one-element array holding 0 is falsy, which would skip the transfer.
    if initial_hash_values is not None and len(initial_hash_values) > 0:
        initial_hash_values = rmm.to_device(initial_hash_values)

    libcudf.hash.hash_columns(columns, result, initial_hash_values)
    return result
def scalar_broadcast_to(scalar, size, dtype=None):
    """Build a column of ``size`` rows in which every row holds ``scalar``.

    Parameters
    ----------
    scalar : object
        Value to repeat. Null host scalars, ``pd.Categorical`` and
        ``decimal.Decimal`` values each take a dedicated path; anything
        else is converted with ``to_cudf_compatible_scalar``.
    size : int, tuple or list
        Number of rows. For a tuple or list only the first element is used.
    dtype : optional
        Requested dtype. When omitted it is ``"object"`` for nulls, derived
        from the value for decimals, and taken from the converted scalar
        otherwise.

    Returns
    -------
    The column produced by whichever construction path was selected.
    """
    # A shape-like ``size`` contributes only its leading entry.
    n_rows = size[0] if isinstance(size, (tuple, list)) else size

    # Null scalar: a masked empty column, defaulting to the object dtype.
    if cudf._lib.scalar._is_null_host_scalar(scalar):
        null_dtype = "object" if dtype is None else dtype
        return column.column_empty(n_rows, dtype=null_dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        if dtype is not None:
            first_category = scalar.categories[0]
            return scalar_broadcast_to(first_category, n_rows).astype(dtype)
        return _categorical_scalar_broadcast_to(scalar, n_rows)

    # Decimal: allocate first, then assign only when there are rows to fill.
    if isinstance(scalar, decimal.Decimal):
        decimal_dtype = (
            cudf.Decimal64Dtype._from_decimal(scalar)
            if dtype is None
            else dtype
        )
        decimal_col = column.column_empty(n_rows, dtype=decimal_dtype)
        if decimal_col.size != 0:
            decimal_col[:] = scalar
        return decimal_col

    compat_scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
    result_dtype = compat_scalar.dtype

    if cudf.dtype(result_dtype).kind not in ("O", "U"):
        # Fixed-width path: allocate, then write through the data view
        # unless the column has no rows.
        filled = column.column_empty(n_rows, dtype=result_dtype)
        if filled.size != 0:
            filled.data_array_view[:] = compat_scalar
        return filled

    # Object/unicode path: index a one-element string column with an
    # all-zero int32 map so the single value is repeated ``n_rows`` times.
    zero_indices = column.full(n_rows, 0, dtype="int32")
    single_value = column.as_column([compat_scalar], dtype="str")
    return single_value[zero_indices]
def get_null_series(size, dtype=np.bool_):
    """
    Build a null series with the requested length and dtype.

    Parameters
    ----------
    size:
        length of series
    dtype:
        dtype of series to create; defaults to bool.

    Returns
    -------
    a null cudf series of provided `size` and `dtype`
    """
    # NOTE(review): the third positional argument is presumably the
    # ``masked`` flag of ``column_empty`` - confirm against its signature.
    return cudf.Series(column.column_empty(size, dtype, True))
def query_execute(df, expr, callenv):
    """Compile a query expression and run it against ``df``.

    Note: the expression is compiled and cached for future reuse.

    Parameters
    ----------
    df : DataFrame
    expr : str
        boolean expression
    callenv : dict
        Contains keys 'local_dict', 'locals' and 'globals' which are all
        dict. They represent the arg, local and global dictionaries of
        the caller.

    Raises
    ------
    NameError
        If the expression references a name missing from the calling
        environment.
    """
    compiled = query_compile(expr)
    kernel = compiled["kernel"]

    # Layer the caller's scopes; later updates override earlier ones, so
    # the precedence is local_dict over locals over globals.
    envdict = callenv["globals"].copy()
    for scope in ("locals", "local_dict"):
        envdict.update(callenv[scope])

    envargs = []
    for ref in compiled["refnames"]:
        key = ref[len(ENVREF_PREFIX):]
        try:
            value = envdict[key]
        except KeyError:
            raise NameError(
                "{!r} not defined in the calling environment".format(key)
            )
        # Python datetimes are passed to the kernel as numpy datetime64.
        if isinstance(value, dt.datetime):
            value = np.datetime64(value)
        envargs.append(value)

    # Data views of just the columns the expression uses.
    columns = compiled["colnames"]
    colarrays = [df[col]._column.data_array_view for col in columns]

    # One boolean output slot per row; the kernel receives the output
    # first, then the column views, then the environment values.
    nrows = len(df)
    out = column_empty(nrows, dtype=np.bool_)
    kernel.forall(nrows)(*[out, *colarrays, *envargs])

    # Apply the aggregate null mask of the used columns, then turn nulls
    # into False.
    out_mask = applyutils.make_aggregate_nullmask(df, columns=columns)
    masked_out = out.set_mask(out_mask)
    return masked_out.fillna(False)
def applymap(
    self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None
) -> ColumnBase:
    """Transform the column's values with an element-wise function.

    Parameters
    ----------
    udf : function
        Wrapped by numba jit for call on the GPU as a device function.
    out_dtype : numpy.dtype; optional
        The dtype for use in the output.
        By default, use the same dtype as *self.dtype*.

    Returns
    -------
    result : Column
        ``as_column`` applied to the buffer the kernel wrote into.

    Notes
    -----
    NOTE(review): earlier documentation stated that the mask is preserved;
    the code here only reads ``self.data_array_view`` and does not copy a
    mask explicitly - confirm before relying on it.
    """
    result_dtype = self.dtype if out_dtype is None else out_dtype
    device_fn = njit(udf)

    # For non-masked columns
    @cuda.jit
    def kernel_applymap(src, dst):
        tid = cuda.grid(1)
        # Threads past the end of the input do nothing.
        if tid < src.size:
            dst[tid] = device_fn(src[tid])

    n_items = self.size
    output = column_empty(n_items, dtype=result_dtype)
    source_view = self.data_array_view
    kernel_applymap.forall(n_items)(source_view, output)
    return as_column(output)
def size(self):
    """Return per-group counts over a placeholder series.

    An int8 series with one entry per row of the grouped object is built,
    grouped by this groupby's key columns, and counted.
    """
    from cudf.core.column import column_empty

    n_rows = len(self._groupby.obj)
    # The values are never read; only the row count per group matters.
    placeholder_col = column_empty(n_rows, "int8", masked=False)
    placeholder = cudf.Series(placeholder_col)
    grouped = placeholder.groupby(self._groupby.key_columns)
    return grouped.count()
def _getitem_tuple_arg(self, arg):
    """Resolve a two-part ``(rows, columns)`` key against ``self._df``.

    ``arg[1]`` is used to select columns and ``arg[0]`` to select rows.
    The work happens in four steps: gather columns, gather rows, reindex
    single-row results, then downcast where
    ``self._can_downcast_to_series`` allows it.

    Parameters
    ----------
    arg : tuple
        ``arg[0]`` is the row selector and ``arg[1]`` the column selector.
        Several branches test whether either one is a ``slice``.

    Returns
    -------
    Depending on the branch taken: a ``DataFrame``, a ``Series``, or the
    result of indexing a single element (``df[0]`` or the first element
    of the first data column).
    """
    from cudf import MultiIndex
    from cudf.core.dataframe import DataFrame, Series
    from cudf.core.column import column_empty
    from cudf.core.index import as_index

    # Iloc Step 1:
    # Gather the columns specified by the second tuple arg
    columns = self._get_column_selection(arg[1])
    if isinstance(self._df.columns, MultiIndex):
        columns_df = self._df.columns._get_column_major(self._df, arg[1])
        # Nothing selected at all with a non-slice row key: return an
        # empty float64 Series named after the row key.
        if (len(columns_df) == 0 and len(columns_df.columns) == 0
                and not isinstance(arg[0], slice)):
            result = Series(column_empty(0, dtype="float64"), name=arg[0])
            result._index = columns_df.columns.copy(deep=False)
            return result
    else:
        if isinstance(arg[0], slice):
            # Build a fresh frame column by column, reusing the
            # original index object.
            columns_df = DataFrame()
            for i, col in enumerate(columns):
                columns_df.insert(i, col, self._df[col])
            columns_df._index = self._df._index
        else:
            columns_df = self._df._columns_view(columns)

    # Iloc Step 2:
    # Gather the rows specified by the first tuple arg
    if isinstance(columns_df.index, MultiIndex):
        df = columns_df.index._get_row_major(columns_df, arg[0])
        if (len(df) == 1 and len(columns_df) >= 1) and not (isinstance(
                arg[0], slice) or isinstance(arg[1], slice)):
            # Pandas returns a numpy scalar in this case
            return df[0]
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        # The MultiIndex row path always returns here; steps 3 and 4
        # below apply only to the non-MultiIndex row path.
        return df
    else:
        df = DataFrame()
        for i, col in enumerate(columns_df._columns):
            # need Series() in case a scalar is returned
            df[i] = Series(col[arg[0]])

        df.index = as_index(columns_df.index[arg[0]])
        df.columns = columns_df.columns

    # Iloc Step 3:
    # Reindex
    if df.shape[0] == 1:  # we have a single row without an index
        if isinstance(arg[0], slice):
            # A slice without an explicit start is treated as
            # starting at position 0.
            start = arg[0].start
            if start is None:
                start = 0
            df.index = as_index(self._df.index[start])
        else:
            df.index = as_index(self._df.index[arg[0]])

    # Iloc Step 4:
    # Downcast
    if self._can_downcast_to_series(df, arg):
        if isinstance(df.columns, MultiIndex):
            if len(df) > 0 and not (isinstance(arg[0], slice)
                                    or isinstance(arg[1], slice)):
                # Neither key is a slice: return the first element of
                # the first data column.
                return list(df._data.values())[0][0]
            elif df.shape[1] > 1:
                result = self._downcast_to_series(df, arg)
                result.index = df.columns
                return result
            elif not isinstance(arg[0], slice):
                if len(df._data) == 0:
                    # No data columns: empty float64 Series indexed by
                    # the column labels.
                    return Series(
                        column_empty(0, dtype="float64"),
                        index=df.columns,
                        name=arg[0],
                    )
                else:
                    result_series = df[df.columns[0]]
                    result_series.index = df.columns
                    result_series.name = arg[0]
                    return result_series
            else:
                return df[df.columns[0]]
        return self._downcast_to_series(df, arg)
    if df.shape[0] == 0 and df.shape[1] == 0:
        from cudf.core.index import RangeIndex

        # NOTE(review): ``.stop`` and ``.indices`` are slice attributes,
        # so this branch presumably assumes ``arg[0]`` is a slice here -
        # confirm. ``step`` is unpacked but not used.
        slice_len = arg[0].stop or len(self._df)
        start, stop, step = arg[0].indices(slice_len)
        df._index = RangeIndex(start, stop)
    return df