def _can_downcast_to_series(self, df, arg):
    """
    This method encapsulates the logic used to determine whether
    the result of a loc/iloc operation should be "downcast" from
    a DataFrame to a Series.
    """
    if isinstance(df, cudf.Series):
        return False
    nrows, ncols = df.shape
    if nrows == 1:
        if type(arg[0]) is slice:
            if not is_scalar(arg[1]):
                return False
        dtypes = df.dtypes.values.tolist()
        all_numeric = all(
            [pd.api.types.is_numeric_dtype(t) for t in dtypes]
        )
        all_identical = dtypes.count(dtypes[0]) == len(dtypes)
        if all_numeric or all_identical:
            return True
    if ncols == 1:
        if type(arg[1]) is slice:
            if not is_scalar(arg[0]):
                return False
        return True
    return False
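
# Illustrative sketch (not part of the module): how the downcast rule above
# shows up through the public .loc API. Column names and values are made up,
# and exact behavior may differ between cuDF versions.
import cudf

df = cudf.DataFrame()
df["a"] = [1, 2, 3]
df["b"] = [4.0, 5.0, 6.0]
df["c"] = ["x", "y", "z"]

df.loc[0, ["a", "b"]]  # single all-numeric row -> expected to become a Series
df.loc[0, :]           # mixed dtypes across the row -> stays a DataFrame
df.loc[:, "a"]         # single column picked by a scalar label -> Series
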
def _loc_to_iloc(self, arg):
    """
    Convert a label-based (.loc) argument into its positional
    (.iloc) equivalent for the underlying Series.
    """
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(
        arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)
    ):
        if len(arg) == 0:
            arg = Series(np.array([], dtype="int32"))
        else:
            arg = Series(arg)
    if isinstance(arg, Series):
        # boolean masks are passed through unchanged
        if arg.dtype in [np.bool, np.bool_]:
            return arg
        else:
            return indices_from_labels(self._sr, arg)
    elif is_scalar(arg):
        found_index = self._sr.index.find_label_range(arg, None)[0]
        return found_index
    elif isinstance(arg, slice):
        start_index, stop_index = self._sr.index.find_label_range(
            arg.start, arg.stop
        )
        return slice(start_index, stop_index, arg.step)
    else:
        raise NotImplementedError(
            ".loc not implemented for label type {}".format(
                type(arg).__name__
            )
        )
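
# Illustrative sketch (not part of the module): label-based lookups on a
# Series are resolved to integer positions by the helper above. Index labels
# and values are made up; details may vary between cuDF versions.
import cudf

sr = cudf.Series([10, 20, 30], index=[2, 4, 6])
sr.loc[4]     # scalar label -> the matching element (20)
sr.loc[2:4]   # label slice -> both endpoints included, unlike .iloc
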
def _downcast_to_series(self, df, arg):
    """
    "Downcast" from a DataFrame to a Series based on Pandas
    indexing rules.
    """
    nrows, ncols = df.shape
    # determine the axis along which the Series is taken:
    if nrows == 1 and ncols == 1:
        if not is_scalar(arg[0]):
            axis = 1
        else:
            axis = 0
    elif nrows == 1:
        axis = 0
    elif ncols == 1:
        axis = 1
    else:
        raise ValueError("Cannot downcast DataFrame selection to Series")

    # take series along the axis:
    if axis == 1:
        return df[df.columns[0]]
    else:
        df = _normalize_dtypes(df)
        sr = df.T
        return sr[sr.columns[0]]
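
# Illustrative sketch (not part of the module): which axis the resulting
# Series is taken along. Column names and values are made up; behavior may
# differ between cuDF versions.
import cudf

df = cudf.DataFrame()
df["a"] = [1.0, 2.0]
df["b"] = [3.0, 4.0]

df.loc[0, :]    # one row -> Series indexed by the column names
df.loc[:, "b"]  # one column -> that column, keeping the row index
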
def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    If value and self are of different types, value is coerced
    to self.dtype.
    """
    import cudf.bindings.copying as cpp_copying
    from cudf.dataframe import columnops

    if isinstance(key, slice):
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        nelem = abs(key_stop - key_start)
    else:
        key = columnops.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if not len(key) == len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column"
                )
            # convert the boolean mask into the positions of its True values
            key = columnops.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    # broadcast scalar values to a column of length nelem
    if utils.is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.dataframe.categorical import CategoricalColumn
            from cudf.dataframe.buffer import Buffer
            from cudf.utils.cudautils import fill_value

            data = rmm.device_array(nelem, dtype="int8")
            fill_value(data, self._encode(value))
            value = CategoricalColumn(
                data=Buffer(data),
                categories=self._categories,
                ordered=False,
            )
        elif value is None:
            value = columnops.column_empty(nelem, self.dtype, masked=True)
        else:
            to_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, to_dtype)

    value = columnops.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (
            f"Size mismatch: cannot set value "
            f"of size {len(value)} to indexing result of size "
            f"{nelem}"
        )
        raise ValueError(msg)

    # contiguous slice keys use a range copy; other keys use a scatter
    if isinstance(key, slice):
        out = cpp_copying.apply_copy_range(
            self, value, key_start, key_stop, 0
        )
    else:
        out = cpp_copying.apply_scatter(value, key, self)

    self._data = out.data
    self._mask = out.mask
    self._update_null_count()
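
# Illustrative sketch (not part of the module): how the column __setitem__
# above surfaces through element assignment on a Series. Values are made up;
# exact coercion rules may vary between cuDF versions.
import cudf

sr = cudf.Series([1, 2, 3, 4])
sr[1:3] = 0            # contiguous slice -> range-copy path
sr[[0, 3]] = [10, 40]  # positional keys -> scatter path
sr[sr == 0] = None     # boolean mask -> positions; None writes nulls
# sr[::2] = 0 would raise NotImplementedError: strided slices are unsupported
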
def _get_column_selection(self, arg):
    """
    Resolve a column selection to a list of column names;
    selections on MultiIndex columns are delegated to the
    MultiIndex itself.
    """
    cols = self._df.columns
    if isinstance(cols, cudf.MultiIndex):
        return cols._get_column_major(self._df, arg)
    if is_scalar(arg):
        return [cols[arg]]
    else:
        return cols[arg]
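
# Illustrative sketch (not part of the module): resolving the column part of
# a positional key. Column names and values are made up; the return type also
# depends on the downcast rules earlier in this file.
import cudf

df = cudf.DataFrame()
df["a"] = [1, 2]
df["b"] = [3, 4]

df.iloc[:, 0]       # scalar position -> column "a"
df.iloc[:, [0, 1]]  # list of positions -> columns ["a", "b"]
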
def fillna(self, fill_value, inplace=False):
    """
    Fill nulls with *fill_value*; a scalar is interpreted as a
    datetime64 in this column's time unit.
    """
    if is_scalar(fill_value):
        fill_value = np.datetime64(fill_value, self.time_unit)
    else:
        fill_value = columnops.as_column(fill_value, nan_as_null=False)

    result = cpp_replace.apply_replace_nulls(self, fill_value)

    # drop the null mask from the filled result
    result = result.replace(mask=None)
    return self._mimic_inplace(result, inplace)
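
# Illustrative sketch (not part of the module): a scalar fill value is cast
# to np.datetime64 in the column's time unit before nulls are replaced. The
# dates are made up, and how nulls are constructed differs across cuDF
# versions.
import numpy as np
import cudf

sr = cudf.Series(
    np.array(["2019-01-01", "NaT", "2019-01-03"], dtype="datetime64[ms]")
)
sr.fillna("2019-01-02")  # scalar path: np.datetime64("2019-01-02", "ms")
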
def __getitem__(self, arg):
    """
    Select column(s) from the grouped DataFrame, preserving the
    existing grouping keys and options.
    """
    if is_scalar(arg):
        return self.__getattr__(arg)
    else:
        arg = list(arg)
        by_list = []
        for by_name, by in zip(
            self._groupby.key_names, self._groupby.key_columns
        ):
            by_list.append(cudf.Series(by, name=by_name))
        return self._df[arg].groupby(
            by_list,
            as_index=self._groupby.as_index,
            sort=self._groupby.sort,
        )
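
# Illustrative sketch (not part of the module): selecting columns from a
# groupby re-groups the narrowed frame on the same keys. Column names and
# values are made up; results depend on the cuDF version.
import cudf

df = cudf.DataFrame()
df["key"] = [1, 1, 2]
df["a"] = [10, 20, 30]
df["b"] = [1.0, 2.0, 3.0]

df.groupby("key")["a"].mean()         # scalar -> attribute-style selection
df.groupby("key")[["a", "b"]].mean()  # list -> re-grouped on the same keys
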
def key_from_by(self, by):
    """
    Get (key_name, key_column) pair from a single *by* argument
    """
    if is_scalar(by):
        key_name = by
        key_column = self.obj[by]._column
    else:
        by = cudf.Series(by)
        if len(by) != len(self.obj):
            raise NotImplementedError(
                "cuDF does not support arbitrary series index lengths "
                "for groupby"
            )
        key_name = by.name
        key_column = by._column
    return key_name, key_column
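
# Illustrative sketch (not part of the module): a *by* argument may be a
# column label or a Series of matching length; mismatched lengths raise.
# Names and values are made up.
import cudf

df = cudf.DataFrame()
df["x"] = [1, 2, 3, 4]

df.groupby("x")                        # label -> key column taken from the frame
df.groupby(cudf.Series([0, 0, 1, 1]))  # external Series of matching length
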
def _get_column_selection(self, arg):
    """
    Resolve a label-based column selection; slices include both
    the start and stop labels.
    """
    if is_scalar(arg):
        return [arg]
    elif isinstance(arg, slice):
        start = self._df.columns[0] if arg.start is None else arg.start
        stop = self._df.columns[-1] if arg.stop is None else arg.stop
        cols = []
        within_slice = False
        for c in self._df.columns:
            if c == start:
                within_slice = True
            if within_slice:
                cols.append(c)
            if c == stop:
                break
        return cols
    else:
        return arg
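
# Illustrative sketch (not part of the module): unlike positional slicing,
# a .loc column slice keeps both endpoints. Column names and values are
# made up; behavior may vary between cuDF versions.
import cudf

df = cudf.DataFrame()
df["a"] = [1]
df["b"] = [2]
df["c"] = [3]
df["d"] = [4]

df.loc[:, "b":"c"]  # -> columns ["b", "c"], stop label included
df.loc[:, :"b"]     # open start -> from the first column through "b"
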