def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
    """Translate one row indexer into row positions.

    Parameters
    ----------
    index : object
        Index being searched; forwarded unchanged to
        ``_compute_validity_mask`` for label lookups.
    row_tuple : slice, numbers.Number or label key
        The row indexer to resolve.
    max_length : int
        Length that positional slices are resolved against.

    Returns
    -------
    The ``arange`` result for positional slices, a ``Series`` of
    consecutive positions for label slices, ``row_tuple`` itself when it
    is a number, and otherwise whatever ``_compute_validity_mask``
    returns.
    """
    from cudf.utils.cudautils import arange
    from cudf import Series

    if isinstance(row_tuple, numbers.Number):
        return row_tuple
    if not isinstance(row_tuple, slice):
        return self._compute_validity_mask(index, row_tuple, max_length)

    # Slices with a numeric bound, and the fully open slice, are positional.
    if (
        isinstance(row_tuple.start, numbers.Number)
        or isinstance(row_tuple.stop, numbers.Number)
        or row_tuple == slice(None)
    ):
        # Resolve against the real length. Passing the slice's own stop as
        # the length raises ValueError for negative stops, is not clamped
        # for stops beyond ``max_length``, and anchors negative starts and
        # reversed defaults to ``stop`` instead of the data.
        start, stop, step = row_tuple.indices(max_length)
        return arange(start, stop, step)

    # Label slice: span from the lowest position matching the start label
    # to the highest position matching the stop label (inclusive).
    start_values = self._compute_validity_mask(
        index, row_tuple.start, max_length
    )
    stop_values = self._compute_validity_mask(
        index, row_tuple.stop, max_length
    )
    return Series(arange(start_values.min(), stop_values.max() + 1))
def _set_categories(self, new_categories, **kwargs):
    """Recode this categorical against *new_categories*.

    Codes of categories that are absent from ``new_categories`` come out
    of the left joins below without a match.

    Parameters
    ----------
    new_categories : column-like
        The replacement categories.
    **kwargs
        ``is_unique`` (default False) skips de-duplication,
        ``ordered`` (default ``self.ordered``) sets the new dtype's
        ordering, and ``inplace`` (default False) updates
        ``self._parent`` instead of building a new column.

    Returns
    -------
    ``None`` when ``inplace`` is truthy, otherwise the column produced by
    ``column.build_column``.

    Notes
    -----
    Assumes ``new_categories`` is the same dtype as the current categories
    """
    from cudf import DataFrame, Series

    cur_cats = self._parent.categories
    new_cats = column.as_column(new_categories)

    # The join below needs unique categories on the "new" side.
    already_unique = kwargs.get("is_unique", False) or new_cats.is_unique
    if not already_unique:
        # drop_duplicates() rather than unique() so the order is preserved
        new_cats = Series(new_cats).drop_duplicates()._column

    cur_codes = self.codes
    codes_dtype = cur_codes.dtype
    row_order = cudautils.arange(len(cur_codes))
    old_code_range = cudautils.arange(len(cur_cats), dtype=codes_dtype)
    new_code_range = cudautils.arange(len(new_cats), dtype=codes_dtype)

    new_df = DataFrame({"new_codes": new_code_range, "cats": new_cats})
    old_df = DataFrame({"old_codes": old_code_range, "cats": cur_cats})
    cur_df = DataFrame({"old_codes": cur_codes, "order": row_order})

    # old code -> new code, matched on the category value
    code_map = old_df.merge(new_df, on="cats", how="left")
    # translate every stored code, then restore the original row order
    recoded = cur_df.merge(code_map, on="old_codes", how="left")
    recoded = recoded.sort_values(by="order").reset_index(True)

    ordered = kwargs.get("ordered", self.ordered)
    new_codes = recoded["new_codes"]._column
    new_dtype = CategoricalDtype(categories=new_cats, ordered=ordered)

    if not kwargs.get("inplace", False):
        return column.build_column(
            data=None,
            dtype=new_dtype,
            mask=new_codes.mask,
            children=(new_codes,),
        )

    parent = self._parent
    parent.data = None
    parent.mask = new_codes.mask
    parent.dtype = new_dtype
    parent.children = (new_codes,)
    return None
def _compute_validity_mask(self, index, row_tuple, max_length):
    """Find the row positions of ``index`` that match ``row_tuple``.

    Each element of ``row_tuple`` that is not ``slice(None)`` becomes a
    lookup column; the lookup frame is merged with the index's source
    data (extended with an ``"idx"`` position column) and the matching
    positions are returned.

    Parameters
    ----------
    index : object
        Provides ``_source_data`` and ``levels``.
    row_tuple : iterable
        One key per level; ``slice(None)`` entries are ignored.
    max_length : int
        Not used by this method.

    Raises
    ------
    KeyError
        If nothing matched and some key is missing from its level.
    """
    from cudf import DataFrame, Series, concat
    from cudf.utils.cudautils import arange

    lookup = DataFrame()
    for level_pos, key in enumerate(row_tuple):
        if key == slice(None):
            continue
        level_name = index._source_data.columns[level_pos]
        lookup[level_name] = Series(key)

    positions = DataFrame(
        {"idx": Series(arange(len(index._source_data)))}
    )
    data_table = concat([index._source_data, positions], axis=1)
    result = lookup.merge(data_table)["idx"]

    # Levels are only inspected when the merge found nothing, because an
    # empty result suggests a KeyError is the right answer.
    if len(result) == 0:
        for level_pos, key in enumerate(row_tuple):
            if key == slice(None):
                continue
            if key not in index.levels[level_pos]._column:
                raise KeyError(key)
    return result
def __setitem__(self, key, value):
    """
    Assign ``value`` to the positions selected by ``key``.

    ``key`` may be a unit-stride slice or anything ``as_column`` accepts
    (a boolean mask is converted to the positions it selects). ``value``
    is cast to ``self.dtype``; scalars are broadcast to the number of
    selected elements.

    Raises
    ------
    NotImplementedError
        If a slice has a stride other than 1.
    ValueError
        If a boolean mask differs in length from the column, or the
        value's length differs from the number of selected elements.
    """
    import cudf.bindings.copying as cpp_copying
    from cudf.dataframe import columnops

    key_is_slice = isinstance(key, slice)
    if key_is_slice:
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        nelem = abs(key_stop - key_start)
    else:
        key = columnops.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if len(key) != len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column")
            # turn the mask into the row positions it selects
            all_rows = columnops.as_column(cudautils.arange(len(self)))
            key = all_rows[key]
        nelem = len(key)

    if utils.is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.dataframe.categorical import CategoricalColumn
            from cudf.dataframe.buffer import Buffer
            from cudf.utils.cudautils import fill_value

            # broadcast the encoded category code
            codes = rmm.device_array(nelem, dtype="int8")
            fill_value(codes, self._encode(value))
            value = CategoricalColumn(
                data=Buffer(codes),
                categories=self._categories,
                ordered=False,
            )
        elif value is None:
            value = columnops.column_empty(nelem, self.dtype, masked=True)
        else:
            target_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, target_dtype)

    value = columnops.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        raise ValueError(
            f"Size mismatch: cannot set value "
            f"of size {len(value)} to indexing result of size "
            f"{nelem}"
        )

    if key_is_slice:
        out = cpp_copying.apply_copy_range(
            self, value, key_start, key_stop, 0)
    else:
        out = cpp_copying.apply_scatter(value, key, self)

    self._data = out.data
    self._mask = out.mask
    self._update_null_count()
def get_sorted_inds(by, ascending=True, na_position="last"):
    """
    Compute the row order that sorts the given column(s).

    Parameters
    ----------
    by : Column or list of Column
        Key column(s) to order by.
    ascending : bool or list of bool, default True
        Sort direction; a list gives one direction per key column.
    na_position : {'first', 'last'}, default 'last'
        Where nulls are placed. Only honoured when ``ascending`` is a
        single bool; with a sequence a warning is logged and nulls are
        treated as greater than all numbers.

    Returns
    -------
    col_inds : cuDF Column of int32 row indices in sorted order

    Raises
    ------
    ValueError
        If ``ascending`` is neither a scalar nor a sequence.

    Difference from pandas:
      * Support axis='index' only.
      * Not supporting: inplace, kind
      * Ascending can be a list of bools to control per column
    """
    if isinstance(by, (ColumnBase)):
        by = [by]

    col_inds = column.as_column(cudautils.arange(len(by[0]), dtype="int32"))

    # Map the textual null placement onto the integer flag passed to
    # libcudf; the mapping flips with the sort direction.
    if ascending is True or ascending is False:
        first_flag = 1 if ascending is True else 0
        if na_position == "last":
            na_position = 1 - first_flag
        elif na_position == "first":
            na_position = first_flag
    else:
        logging.warning(
            "When using a sequence of booleans for `ascending`, `na_position` "
            "flag is not yet supported and defaults to treating nulls as "
            "greater than all numbers")
        na_position = 0

    # A scalar direction applies to every key column.
    if np.isscalar(ascending):
        ascending = [ascending] * len(by)

    if not isinstance(ascending, collections.abc.Sequence):
        raise ValueError("Must use a boolean or list of booleans")

    # libcudf uses 0 for ascending, so the booleans are inverted before
    # being copied to the device.
    inverted = [not flag for flag in ascending]
    ascending = rmm.to_device(np.array(inverted, dtype="int8"))

    libcudf.sort.order_by(by, col_inds, ascending, na_position)
    return col_inds
def normalize_chunks(self, size, chunks):
    """Return the chunk leading offsets.

    An integer ``chunks`` is a chunk size and yields offsets
    ``0, chunks, 2*chunks, ...`` below ``size``; anything else is taken
    to already be an array of leading offsets and is converted to a
    column whose ``data_array_view`` is returned.
    """
    if not isinstance(chunks, six.integer_types):
        # *chunks* already holds the leading offset of every chunk
        offsets = column.as_column(chunks)
        return offsets.data_array_view
    # *chunks* is the chunk size
    return cudautils.arange(0, size, chunks)
def normalize_chunks(self, size, chunks):
    """Return the chunk leading offsets.

    An integer ``chunks`` is a chunk size and yields offsets
    ``0, chunks, 2*chunks, ...`` below ``size``; anything else is taken
    to already be an array of leading offsets and is passed through a
    ``Series`` and returned via ``to_gpu_array()``.
    """
    if not isinstance(chunks, six.integer_types):
        # *chunks* already holds the leading offset of every chunk
        offsets = Series(chunks)
        return offsets.to_gpu_array()
    # *chunks* is the chunk size
    return cudautils.arange(0, size, chunks)
def as_column(self):
    """Materialise this range as a ``NumericalColumn``.

    A non-empty range is expanded with ``cudautils.arange`` from
    ``_start`` to ``_stop``; an empty one becomes a zero-length device
    array. The column carries this object's ``name``.
    """
    is_empty = not len(self) > 0
    vals = (
        rmm.device_array(0, dtype=self.dtype)
        if is_empty
        else cudautils.arange(self._start, self._stop, dtype=self.dtype)
    )
    return NumericalColumn(
        data=Buffer(vals), dtype=vals.dtype, name=self.name
    )
def sort_by_values(self, ascending):
    """Sort a copy of this column, tracking each value's original position.

    Parameters
    ----------
    ascending : passed through to ``cpp_sort.apply_sort``.

    Returns
    -------
    (col_keys, col_inds) : the two columns handed to
    ``cpp_sort.apply_sort`` - a copy of the data and the positions
    ``0..len(self)-1``.

    Raises
    ------
    ValueError
        If the column contains nulls.
    """
    if self.null_count > 0:
        raise ValueError('nulls not yet supported')

    # The keys are a clone so that the original buffer stays untouched.
    key_buffer = self.data.copy()
    col_keys = self.replace(data=key_buffer, dtype=self._data.dtype)

    # Companion column holding the row positions.
    position_buffer = Buffer(cudautils.arange(len(self)))
    col_inds = self.replace(data=position_buffer,
                            dtype=position_buffer.dtype)

    cpp_sort.apply_sort(col_keys, col_inds, ascending=ascending)
    return col_keys, col_inds
def _getitem_tuple_arg(self, arg):
    """Resolve a ``(rows, columns)`` indexer against ``self._df``.

    The columns named by ``arg[1]`` are gathered first, then the rows
    selected by ``arg[0]``; a single-row result gets its index rebuilt
    and the result is downcast to a Series when
    ``_can_downcast_to_series`` allows it.
    """
    from cudf.core.dataframe import Series, DataFrame
    from cudf.core.column import column
    from cudf.core.index import as_index
    from cudf.utils.cudautils import arange
    from cudf import MultiIndex

    # Step 1: Gather columns
    multi_columns = isinstance(self._df.columns, MultiIndex)
    if multi_columns:
        columns_df = self._df.columns._get_column_major(self._df, arg[1])
        if isinstance(columns_df, Series):
            return columns_df
    else:
        selected = self._get_column_selection(arg[1])
        columns_df = DataFrame(index=self._df.index)
        for position, name in enumerate(selected):
            columns_df.insert(position, name, self._df[name])

    # Step 2: Gather rows
    row_key = arg[0]
    if isinstance(columns_df.index, MultiIndex):
        return columns_df.index._get_row_major(columns_df, row_key)

    if multi_columns:
        if isinstance(row_key, slice):
            start, stop, step = row_key.indices(len(columns_df))
            df = columns_df.take(arange(start, stop, step))
        else:
            df = columns_df.take(row_key)
    else:
        df = DataFrame()
        for name in columns_df.columns:
            # Series() guards against .loc handing back a scalar
            df[name] = Series(columns_df[name].loc[row_key])
        df.columns = columns_df.columns

    # Step 3: Gather index (only needed for a single-row result)
    if df.shape[0] == 1:
        if isinstance(row_key, slice):
            start = row_key.start
            if start is None:
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            row_selection = column.as_column(row_key)
            if pd.api.types.is_bool_dtype(row_selection.dtype):
                df.index = self._df.index.take(row_selection)
            else:
                df.index = as_index(row_selection)

    # Step 4: Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)
    return df
def _group_inner_levels(self, columns, rowidcol, segs, markers):
    """Group the second and onwards level.

    Parameters
    ----------
    columns : sequence[str]
        Group keys, looked up in ``self._df``. The order is important.
    rowidcol : Series
        Original row ids; used to gather each key column and re-gathered
        after every level's segmented sort.
    segs : Series
        First level group begin offsets.
    markers : object
        Passed to and updated by ``cudautils.find_segments``.

    Returns
    -------
    (sorted_keys, reordering_indices, segments)
        - sorted_keys : list[Series]
            List of sorted key columns.
            Column order is same as arg *columns*.
        - reordering_indices : device array
            The indices to gather on to shuffle the dataframe
            into the grouped sequence.
        - segments : Series
            Group begin offsets.
    """
    seg_offsets = segs.astype(dtype=np.int32).data.mem
    sorted_keys = []
    plan_cache = {}

    for key_name in columns:
        # Bring the key column into the order set by the previous levels.
        level_keys = self._df[key_name].take(
            rowidcol.to_gpu_array(), ignore_index=True
        )

        # Segmented sort on the key; ``shuffle`` starts as 0..n-1 and is
        # handed to the sort together with the keys.
        shuffle = Column(Buffer(cudautils.arange(len(level_keys))))
        cache_key = (len(level_keys), level_keys.dtype, shuffle.dtype)
        plan_cache[cache_key] = apply_segsort(
            level_keys._column,
            shuffle,
            seg_offsets,
            plan=plan_cache.get(cache_key),
        )
        sorted_keys.append(level_keys)  # keep sorted key cols

        # Determine the segments for the next level.
        seg_offsets, markers = cudautils.find_segments(
            level_keys.to_gpu_array(), seg_offsets, markers=markers
        )

        # Carry the shuffle over to the row ids.
        rowidcol = rowidcol.take(shuffle.to_gpu_array(), ignore_index=True)

    reordering_indices = rowidcol.to_gpu_array()
    return sorted_keys, reordering_indices, Series(seg_offsets)
def _to_frame(self):
    """Return a frame holding level values in place of integer codes.

    Starts from a shallow copy of ``self.codes`` and, column by column,
    swaps every code for the matching entry of ``self.levels`` by
    merging on the code value.
    """
    from cudf import DataFrame

    frame = self.codes.copy(deep=False)
    for level_pos, col_name in enumerate(frame.columns):
        level_values = self.levels[level_pos]
        # merge is used as a "replace code by level value" operation
        code_range = cudautils.arange(
            len(level_values), dtype=frame[col_name].dtype
        )
        level_table = DataFrame(
            {'idx': Series(code_range), 'level': self.levels[level_pos]}
        )
        code_table = DataFrame({'idx': frame[col_name]})
        frame[col_name] = code_table.merge(level_table).level
    return frame
def indices_from_labels(obj, labels):
    """Look up the row positions of ``labels`` in ``obj.index``.

    The labels are cast to the index's type, used as the index of an
    empty frame, and joined against a frame that maps ``obj.index`` to
    the positions ``0..len(obj)-1``; the joined position column is
    returned.
    """
    from cudf.dataframe import columnops

    label_col = columnops.as_column(labels)
    if is_categorical_dtype(obj.index):
        label_col = label_col.astype("category")
        # make the codes' storage type agree with the index's
        index_code_dtype = obj.index._values.data.dtype
        label_col._data = label_col.data.astype(index_code_dtype)
    else:
        label_col = label_col.astype(obj.index.dtype)

    wanted = cudf.DataFrame({}, index=label_col)
    positions = cudf.DataFrame({"_": arange(len(obj))}, index=obj.index)
    joined = wanted.join(positions)
    return joined["_"]
def column_select_by_boolmask(column, boolmask):
    """Pick the entries of ``column`` selected by ``boolmask``.

    Returns (selected_column, selected_positions)

    ``column`` must not contain nulls; this is checked with an assert.
    """
    from cudf.dataframe.numerical import NumericalColumn

    # We don't properly handle the boolmask yet
    assert column.null_count == 0

    packed_mask = cudautils.compact_mask_bytes(boolmask.to_gpu_array())
    all_positions = cudautils.arange(len(boolmask))

    # Compact the positions and the values with the same mask.
    _, kept_positions = cudautils.copy_to_dense(
        all_positions, mask=packed_mask
    )
    _, kept_values = cudautils.copy_to_dense(
        column.data.to_gpu_array(), mask=packed_mask
    )

    selected_values = column.replace(data=Buffer(kept_values))
    position_buffer = Buffer(kept_positions)
    selected_positions = NumericalColumn(
        data=position_buffer, dtype=position_buffer.dtype
    )
    return selected_values, selected_positions
def take(self, indices):
    """Return a new ``MultiIndex`` made of the rows at ``indices``.

    ``indices`` may be an integer or sequence (converted with
    ``np.array``), a ``Series`` (converted with ``to_gpu_array``) or a
    slice (expanded into explicit positions); any other object is used
    as given. The result keeps this index's ``names``.
    """
    from collections.abc import Sequence
    from cudf import Series
    from numbers import Integral

    if isinstance(indices, (Integral, Sequence)):
        indices = np.array(indices)
    elif isinstance(indices, Series):
        indices = indices.to_gpu_array()
    elif isinstance(indices, slice):
        start, stop, step, _length = utils.standard_python_slice(
            len(self), indices
        )
        indices = cudautils.arange(start, stop, step)

    if not hasattr(self, '_source_data'):
        taken_codes = self.codes.take(indices)
        result = MultiIndex(self.levels, taken_codes)
    else:
        result = MultiIndex(source_data=self._source_data.take(indices))

    result.names = self.names
    return result
def indices_from_labels(obj, labels):
    """Look up the row positions of ``labels`` in ``obj.index``.

    The labels are cast to the index's type (for a categorical index the
    codes are also cast to the index's code dtype), used as the index of
    an empty frame, and joined against a frame that maps ``obj.index``
    to the positions ``0..len(obj)-1``; the joined position column is
    returned.
    """
    from cudf.core.column import column

    label_col = column.as_column(labels)
    if is_categorical_dtype(obj.index):
        label_col = label_col.astype("category")
        index_code_dtype = obj.index._values.codes.dtype
        recast_codes = label_col.codes.astype(index_code_dtype)
        label_col = column.build_categorical_column(
            categories=label_col.dtype.categories,
            codes=recast_codes,
            ordered=label_col.dtype.ordered,
        )
    else:
        label_col = label_col.astype(obj.index.dtype)

    wanted = cudf.DataFrame({}, index=label_col)
    positions = cudf.DataFrame({"_": arange(len(obj))}, index=obj.index)
    joined = wanted.join(positions)
    return joined["_"]
def take(self, indices):
    """Return a new ``MultiIndex`` made of the rows at ``indices``.

    ``indices`` may be an integer or sequence (converted with
    ``np.array``), a ``Series`` (converted with ``to_gpu_array``) or a
    slice (expanded into explicit positions); any other object is used
    as given. Codes, when already computed, are gathered as well; levels
    and names are carried over unchanged.
    """
    from collections.abc import Sequence
    from cudf import Series
    from numbers import Integral

    if isinstance(indices, (Integral, Sequence)):
        indices = np.array(indices)
    elif isinstance(indices, Series):
        indices = indices.to_gpu_array()
    elif isinstance(indices, slice):
        bounds = indices.indices(len(self))
        indices = cudautils.arange(*bounds)

    taken = MultiIndex(source_data=self._source_data.take(indices))
    # Reuse codes and levels only if they were computed already.
    if self._codes is not None:
        taken._codes = self._codes.take(indices)
    if self._levels is not None:
        taken._levels = self._levels
    taken.names = self.names
    return taken
def _getitem_tuple_arg(self, arg):
    """Resolve a ``(rows, columns)`` indexer against ``self._df``.

    The columns named by ``arg[1]`` are gathered first, then the rows
    selected by ``arg[0]``; a single-row result gets its index rebuilt
    and the result is downcast to a Series when
    ``_can_downcast_to_series`` allows it.
    """
    from cudf.dataframe.dataframe import DataFrame
    from cudf.dataframe.index import as_index
    from cudf.utils.cudautils import arange
    from cudf import MultiIndex

    # Step 1: Gather columns
    multi_columns = isinstance(self._df.columns, MultiIndex)
    if multi_columns:
        columns_df = self._df.columns._get_column_major(self._df, arg[1])
    else:
        columns_df = DataFrame()
        for name in self._get_column_selection(arg[1]):
            columns_df.add_column(name=name, data=self._df[name])

    # Step 2: Gather rows
    row_key = arg[0]
    if isinstance(columns_df.index, MultiIndex):
        return columns_df.index._get_row_major(columns_df, row_key)

    if multi_columns:
        if isinstance(row_key, slice):
            start, stop, step = row_key.indices(len(columns_df))
            df = columns_df.take(arange(start, stop, step))
        else:
            df = columns_df.take(row_key)
    else:
        df = DataFrame()
        for name in columns_df.columns:
            df[name] = columns_df[name].loc[row_key]

    # Step 3: Gather index (only needed for a single-row result)
    if df.shape[0] == 1:
        if isinstance(row_key, slice):
            start = row_key.start
            if start is None:
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            df.index = as_index(row_key)

    # Step 4: Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)
    return df
def take(self, indices):
    """Return a new ``MultiIndex`` made of the rows at ``indices``.

    ``indices`` may be an integer or sequence (converted with
    ``np.array``), a null-free ``Series`` (its ``data.mem`` is used) or
    a slice (expanded into explicit positions); any other object is used
    as given. Codes, when already computed, are gathered as well; levels
    and names are carried over unchanged.

    Raises
    ------
    ValueError
        If ``indices`` is a ``Series`` containing nulls.
    """
    from collections.abc import Sequence
    from cudf import Series
    from numbers import Integral

    if isinstance(indices, (Integral, Sequence)):
        indices = np.array(indices)
    elif isinstance(indices, Series):
        if indices.null_count != 0:
            raise ValueError("Column must have no nulls.")
        indices = indices.data.mem
    elif isinstance(indices, slice):
        bounds = indices.indices(len(self))
        indices = cudautils.arange(*bounds)

    taken = MultiIndex(source_data=self._source_data.take(indices))
    # Reuse codes and levels only if they were computed already.
    if self._codes is not None:
        taken._codes = self._codes.take(indices)
    if self._levels is not None:
        taken._levels = self._levels
    taken.names = self.names
    return taken
def _compute_levels_and_codes(self):
    """Derive ``_levels``, ``_codes`` and ``names`` from ``_source_data``.

    For every source column the unique values become the level and each
    value is replaced by its position within that level to form the
    int32 codes. Empty columns produce empty levels and codes.
    """
    from cudf import DataFrame

    levels = []
    codes = DataFrame()
    names = []

    # Note: This is an O(N^2) solution using gpu masking
    # to compute new codes for the MultiIndex. There may be
    # a faster solution that could be executed on gpu at the same
    # time the groupby is calculated.
    for col_name in self._source_data.columns:
        if len(self._source_data[col_name]) > 0:
            level = self._source_data[col_name].unique()
            level_positions = Series(cudautils.arange(len(level)))
            replaced = self._source_data[col_name].replace(
                level, level_positions
            )
        else:
            level = np.array([])
            replaced = np.array([])

        levels.append(level)
        codes[col_name] = Series(replaced, dtype="int32")
        names.append(col_name)

    self._levels = levels
    self._codes = codes
    self.names = names
def index_from_range(start, stop=None, step=None):
    """Build a ``GenericIndex`` over ``arange(start, stop, step)``.

    The values are generated as int64 by ``cudautils.arange`` and
    wrapped in a ``NumericalColumn``.
    """
    values = cudautils.arange(start, stop, step, dtype=np.int64)
    values_column = NumericalColumn(data=Buffer(values), dtype=values.dtype)
    return GenericIndex(values_column)
def index_from_range(start, stop=None, step=None):
    """Build an index over ``arange(start, stop, step)``.

    The values are generated as int64 by ``cudautils.arange`` and
    converted with ``as_index``.
    """
    return as_index(
        cudautils.arange(start, stop, step, dtype=np.int64)
    )
def _values(self):
    """Return the values of this range as a column.

    A non-empty range is expanded with ``cudautils.arange`` from
    ``_start`` to ``_stop``; an empty one yields an unmasked empty
    column of the same dtype.
    """
    if not len(self) > 0:
        return column.column_empty(0, masked=False, dtype=self.dtype)
    expanded = cudautils.arange(self._start, self._stop, dtype=self.dtype)
    return column.as_column(expanded)
def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    If value and self are of different types,
    value is coerced to self.dtype

    ``key`` may be a unit-stride slice or anything ``as_column`` accepts
    (a boolean mask is converted to the positions it selects). Scalars
    are broadcast to the number of selected elements.

    Raises
    ------
    NotImplementedError
        If a slice has a stride other than 1.
    ValueError
        If a boolean mask differs in length from the column, or the
        value's length differs from the number of selected elements.
    IndexError
        If libcudf reports an out-of-bounds scatter index; the original
        ``RuntimeError`` is attached as the cause.
    """
    from cudf.core import column

    if isinstance(key, slice):
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        nelem = abs(key_stop - key_start)
    else:
        key = column.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if not len(key) == len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column"
                )
            # turn the mask into the row positions it selects
            key = column.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    if is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.utils.cudautils import fill_value

            # broadcast the encoded category code
            data = rmm.device_array(nelem, dtype=self.codes.dtype)
            fill_value(data, self._encode(value))
            value = build_categorical_column(
                categories=self.dtype.categories,
                codes=as_column(data),
                ordered=self.dtype.ordered,
            )
        elif value is None:
            value = column.column_empty(nelem, self.dtype, masked=True)
        else:
            to_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, to_dtype)

    value = column.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (
            f"Size mismatch: cannot set value "
            f"of size {len(value)} to indexing result of size "
            f"{nelem}"
        )
        raise ValueError(msg)

    if is_categorical_dtype(value.dtype):
        # line the value's categories up with this column's
        value = value.cat().set_categories(self.categories)
        assert self.dtype == value.dtype

    if isinstance(key, slice):
        out = libcudf.copying.copy_range(
            self, value, key_start, key_stop, 0
        )
    else:
        try:
            out = libcudf.copying.scatter(value, key, self)
        except RuntimeError as e:
            if "out of bounds" in str(e):
                # Chain explicitly so the libcudf error is reported as
                # the direct cause rather than as an incidental context.
                raise IndexError(
                    f"index out of bounds for column of size {len(self)}"
                ) from e
            raise

    self._mimic_inplace(out, inplace=True)
def _get_row_major(self, df, row_tuple):
    """Select rows of ``df`` described by ``row_tuple`` and fix up the index.

    Only ``row_tuple[0]`` decides how rows are located: a number is used
    as a position, a slice is expanded into positions, and anything else
    sends the whole tuple to ``_compute_validity_mask``. The selected
    rows are taken from ``df`` and the result's index is then rebuilt
    from the index levels that were not consumed by the lookup.

    Parameters
    ----------
    df : DataFrame
        Frame to select from; its ``index`` must expose ``levels``,
        ``codes`` and ``names``.
    row_tuple : tuple
        Row indexer.

    Returns
    -------
    The selected rows; in the single-row, non-slice, number-keyed case
    the value is rebuilt as a ``Series`` named by a tuple.
    """
    # Set only when a non-empty slice is expanded into positions below.
    slice_access = False
    if isinstance(row_tuple[0], numbers.Number):
        valid_indices = row_tuple[0]
    elif isinstance(row_tuple[0], slice):
        # 1. empty slice compute
        if row_tuple[0].stop == 0:
            valid_indices = []
        else:
            slice_access = True
            # Missing (or falsy) bounds fall back to 0 / len(df) / 1.
            # NOTE(review): negative bounds are passed to arange as-is -
            # confirm whether that is intended.
            start = row_tuple[0].start or 0
            stop = row_tuple[0].stop or len(df)
            step = row_tuple[0].step or 1
            valid_indices = cudautils.arange(start, stop, step)
    else:
        # Label lookup across the tuple.
        valid_indices = self._compute_validity_mask(df, row_tuple)
    from cudf import Series
    result = df.take(Series(valid_indices))
    # Build new index - INDEX based MultiIndex
    # ---------------
    from cudf import DataFrame
    out_index = DataFrame()
    # Select the last n-k columns where n is the number of source
    # levels and k is the length of the indexing tuple
    # ``size`` stays 0 for number/slice access, so every level is kept.
    size = 0
    if not isinstance(row_tuple[0], (numbers.Number, slice)):
        size = len(row_tuple)
    for k in range(size, len(df.index.levels)):
        out_index.add_column(df.index.names[k],
                             df.index.codes[df.index.codes.columns[k]])
    # If there's only one column remaining in the output index, convert
    # it into an Index and name the final index values according
    # to the proper codes.
    if len(out_index.columns) == 1:
        # ``out_index`` is rebound to a plain list of level values taken
        # from the last level of the result's index.
        out_index = []
        for val in result.index.codes[result.index.codes.columns[len(result.index.codes.columns)-1]]:  # noqa: E501
            out_index.append(result.index.levels[
                len(result.index.codes.columns)-1][val])
        out_index = as_index(out_index)
        out_index.name = result.index.names[len(result.index.names)-1]
        result.index = out_index
    else:
        if len(result) == 1 and size == 0 and slice_access is False:
            # If the final result is one row and it was not mapped into
            # directly
            result = result.T
            result = result[result.columns[0]]
            # convert to Series
            # The name is assembled from the first code of every level of
            # ``result.columns``.
            series_name = []
            for idx, code in enumerate(result.columns.codes):
                series_name.append(result.columns.levels[idx][
                    result.columns.codes[code][0]])
            result = Series(list(result._cols.values())[0],
                            name=series_name)
            result.name = tuple(series_name)
        elif(len(out_index.columns)) > 0:
            # Otherwise pop the leftmost levels, names, and codes from the
            # source index until it has the correct number of columns (n-k)
            # NOTE(review): the return value of reset_index is discarded;
            # unless it works in place this statement has no effect -
            # confirm.
            result.reset_index(drop=True)
            result.index = result.index._popn(size)
    return result