def _index_or_values_interpolation(column, index=None):
    """
    Linearly interpolate the NaN entries of a float column.

    The index values are used as the x coordinates, so uneven spacing
    in the index is reflected in the result. For example the data
    ``[1.0, NaN, 4.0]`` with index ``[1, 3, 4]`` becomes
    ``[1.0, 3.0, 4.0]``.

    A column with no NaNs, or consisting only of NaNs, is returned
    unchanged. Entries located before the first non-NaN value are set
    to NaN in the result.
    """
    nan_mask = cp.isnan(column)

    # Trivial cases: nothing to fill, or nothing to interpolate from.
    nan_count = nan_mask.sum()
    if nan_count == 0 or nan_count == len(column):
        return column

    frame = Frame(data={None: column}, index=index)

    # Keep only the rows holding a real value; these are the known
    # (x, y) points handed to the interpolation routine.
    known_points = frame._apply_boolean_mask(as_column(~nan_mask))
    xs = known_points._index._column.values
    ys = known_points._data.columns[0].values

    interpolated = cp.interp(frame._index.values, xs, ys)

    # Position of the first non-NaN entry; everything before it is
    # reset to NaN.
    first_valid_pos = (nan_mask == 0).argmax().item()
    interpolated[:first_valid_pos] = np.nan
    return interpolated
def _restore_categorical_keys(
    self, lhs: Frame, rhs: Frame
) -> Tuple[Frame, Frame]:
    """
    Cast join keys back to ``"category"`` after an inner join.

    For inner joins, categorical keys in ``self.lhs`` and ``self.rhs``
    were cast to their category type to produce `lhs` and `rhs`; this
    undoes that cast. Shallow copies of `lhs` and `rhs` are always
    returned, and for any other join type they are returned as-is.
    """
    restored_lhs = lhs.copy(deep=False)
    restored_rhs = rhs.copy(deep=False)

    if self.how != "inner":
        return restored_lhs, restored_rhs

    for lkey, rkey in zip(*self._keys):
        # Only key pairs that were categorical on *both* original
        # operands are restored; the right side is inspected only when
        # the left side qualifies.
        if not isinstance(lkey.get(self.lhs).dtype, cudf.CategoricalDtype):
            continue
        if not isinstance(rkey.get(self.rhs).dtype, cudf.CategoricalDtype):
            continue

        lkey.set(
            restored_lhs,
            lkey.get(restored_lhs).astype("category"),
            validate=False,
        )
        rkey.set(
            restored_rhs,
            rkey.get(restored_rhs).astype("category"),
            validate=False,
        )

    return restored_lhs, restored_rhs
def _sort_result(self, result: Frame) -> Frame:
    """
    Sort a join result by its key columns.

    Mirrors the ordering described for pandas: the result is sorted on
    the key columns in the order given in ``on``. If the indices are
    used as keys, the index is sorted. If only one index is specified,
    the key columns of the other side are used to sort.
    """
    if self.on:
        if isinstance(result, cudf.BaseIndex):
            order = result._get_sorted_inds()
        else:
            # A list (not a tuple) is required here: _get_sorted_inds
            # calls down to ColumnAccessor.get_by_label, which handles
            # lists and tuples differently.
            on_labels = list(_coerce_to_tuple(self.on))
            order = result._get_sorted_inds(on_labels)
        return result._gather(order, keep_index=False)

    sort_columns = []
    if (
        self.left_index
        and self.right_index
        and result._index is not None
    ):
        sort_columns.extend(result._index._data.columns)
    if self.left_on:
        sort_columns += [
            result._data[label] for label in _coerce_to_tuple(self.left_on)
        ]
    if self.right_on:
        sort_columns += [
            result._data[label] for label in _coerce_to_tuple(self.right_on)
        ]

    if not sort_columns:
        return result

    order = cudf.DataFrame._from_columns(sort_columns).argsort()
    return result._gather(order)
def _sort_result(self, result: Frame) -> Frame:
    """
    Sort a join result by its key columns.

    Mirrors the ordering described for pandas: the result is sorted on
    the key columns in the order given in ``on``. If the indices are
    used as keys, the index is sorted. If only one index is specified,
    the key columns of the other side are used to sort.
    """
    sort_columns: List[Any] = []

    if (
        self._using_left_index
        and self._using_right_index
        and result._index is not None
    ):
        sort_columns.extend(result._index._data.columns)
    if not self._using_left_index:
        sort_columns += [result._data[key.name] for key in self._left_keys]
    if not self._using_right_index:
        sort_columns += [result._data[key.name] for key in self._right_keys]

    if not sort_columns:
        return result

    # The sort frame only needs distinct labels, so the columns are
    # keyed by their position.
    positional = {pos: col for pos, col in enumerate(sort_columns)}
    order = cudf.DataFrame._from_data(positional).argsort()

    if isinstance(result, cudf.core._base_index.BaseIndex):
        return result._gather(order, check_bounds=False)

    indexed = cast(cudf.core.indexed_frame.IndexedFrame, result)
    return indexed._gather(
        order,
        keep_index=self._using_left_index or self._using_right_index,
        check_bounds=False,
    )
def _match_key_dtypes(self, lhs: Frame, rhs: Frame) -> Tuple[Frame, Frame]:
    """
    Return shallow copies of `lhs` and `rhs` whose key dtypes match.

    Each left/right key pair is passed through ``_match_join_keys``; a
    key column is written into the corresponding copy only when that
    helper handed back a different column object.
    """
    matched_lhs = lhs.copy(deep=False)
    matched_rhs = rhs.copy(deep=False)

    for lkey, rkey in zip(*self._keys):
        left_col = lkey.get(lhs)
        right_col = rkey.get(rhs)
        new_left, new_right = _match_join_keys(
            left_col, right_col, how=self.how
        )
        # Identity (not equality) decides whether a cast happened.
        if new_left is not left_col:
            lkey.set(matched_lhs, new_left, validate=False)
        if new_right is not right_col:
            rkey.set(matched_rhs, new_right, validate=False)

    return matched_lhs, matched_rhs
def as_frame(self):
    """
    Converts a Column to Frame

    Returns a ``Frame`` holding a single entry, keyed by ``None``, whose
    value is a shallow copy (``copy(deep=False)``) of this column.
    """
    # The docstring must be the first statement of the function to be
    # picked up as ``__doc__``; it previously sat after this import and
    # was therefore discarded.
    # NOTE(review): the function-local import presumably avoids a
    # circular import with cudf.core.frame -- confirm before hoisting.
    from cudf.core.frame import Frame

    return Frame({None: self.copy(deep=False)})
def __new__(
    cls,
    levels=None,
    codes=None,
    sortorder=None,
    labels=None,
    names=None,
    dtype=None,
    copy=False,
    name=None,
    **kwargs,
) -> "MultiIndex":
    """
    Build a MultiIndex either from ``levels``/``codes`` or from a
    ``source_data`` frame passed through ``kwargs``.

    Parameters
    ----------
    levels : sequence, optional
        One entry per level; each is wrapped in a ``cudf.Series``.
    codes : cudf.DataFrame or sequence of sequences, optional
        Per-level codes. A DataFrame is stored as-is; otherwise each
        entry is converted to a column and cast to ``int64``.
    sortorder : optional
        Not supported; any value other than None raises.
    labels : optional
        Deprecated alias of ``codes``; emits a ``FutureWarning`` and
        is used as ``codes`` when ``codes`` is falsy.
    names : optional
        Level names, assigned to ``out.names``.
    dtype : optional
        Not referenced in this method.
    copy : bool, default False
        When True, copy ``codes`` if it is a ``cudf.DataFrame`` and
        copy ``levels`` if its first element is a ``cudf.Series``.
    name : optional
        Not supported; any value other than None raises.
    **kwargs
        ``source_data`` (a frame holding the index values) triggers
        the early-return path; ``nan_as_null`` is forwarded to
        ``cudf.DataFrame.from_pandas`` when ``source_data`` is a
        pandas DataFrame.

    Raises
    ------
    NotImplementedError
        If ``sortorder`` or ``name`` is not None.
    ValueError
        If ``levels`` is empty, or the number of levels and codes
        differ.
    TypeError
        If ``codes`` is neither a ``cudf.DataFrame`` nor a sequence
        whose first element is a sequence/ndarray.
    """
    if sortorder is not None:
        raise NotImplementedError("sortorder is not yet supported")
    if name is not None:
        raise NotImplementedError(
            "Use `names`, `name` is not yet supported"
        )
    # Allocate the instance manually and run the initializer that
    # follows Index in the MRO.
    out = Frame.__new__(cls)
    super(Index, out).__init__()
    if copy:
        if isinstance(codes, cudf.DataFrame):
            codes = codes.copy()
        if len(levels) > 0 and isinstance(levels[0], cudf.Series):
            levels = [level.copy() for level in levels]
    out._name = None
    column_names = []
    if labels:
        warnings.warn(
            "the 'labels' keyword is deprecated, use 'codes' " "instead",
            FutureWarning,
        )
    # `labels` only takes effect when `codes` is falsy.
    if labels and not codes:
        codes = labels
    # early termination enables lazy evaluation of codes
    if "source_data" in kwargs:
        # Work on a shallow copy so resetting the index does not touch
        # the caller's object.
        source_data = kwargs["source_data"].copy(deep=False)
        source_data.reset_index(drop=True, inplace=True)
        if isinstance(source_data, pd.DataFrame):
            nan_as_null = kwargs.get("nan_as_null", None)
            source_data = cudf.DataFrame.from_pandas(
                source_data, nan_as_null=nan_as_null
            )
        names = names if names is not None else source_data._data.names
        # if names are unique
        # try using those as the source_data column names:
        if len(dict.fromkeys(names)) == len(names):
            source_data.columns = names
        out._data = source_data._data
        out.names = names
        # Stored exactly as passed in (possibly None) on this path.
        out._codes = codes
        out._levels = levels
        return out
    # name setup
    if isinstance(names, (Sequence, pd.core.indexes.frozen.FrozenList,),):
        # More than one unnamed level: fall back to positional labels
        # for the codes columns.
        if sum(x is None for x in names) > 1:
            column_names = list(range(len(codes)))
        else:
            column_names = names
    elif names is None:
        column_names = list(range(len(codes)))
    else:
        column_names = names
    if len(levels) == 0:
        raise ValueError("Must pass non-zero number of levels/codes")
    if not isinstance(codes, cudf.DataFrame) and not isinstance(
        codes[0], (Sequence, np.ndarray)
    ):
        raise TypeError("Codes is not a Sequence of sequences")
    if isinstance(codes, cudf.DataFrame):
        out._codes = codes
    elif len(levels) == len(codes):
        out._codes = cudf.DataFrame()
        # NOTE: the loop variable rebinds `codes`; iteration itself is
        # unaffected because enumerate() already holds the original
        # sequence, but `codes` no longer refers to it afterwards.
        for i, codes in enumerate(codes):
            # A falsy name (e.g. None) falls back to the position.
            name = column_names[i] or i
            codes = column.as_column(codes)
            out._codes[name] = codes.astype(np.int64)
    else:
        raise ValueError(
            "MultiIndex has unequal number of levels and "
            "codes and is inconsistent!"
        )
    out._levels = [cudf.Series(level) for level in levels]
    out._validate_levels_and_codes(out._levels, out._codes)
    # Materialize the index values by gathering each level with its
    # codes column.
    source_data = cudf.DataFrame()
    for i, name in enumerate(out._codes.columns):
        codes = as_index(out._codes[name]._column)
        if -1 in out._codes[name].values:
            # Must account for null(s) in _source_data column
            level = cudf.DataFrame(
                {name: [None] + list(out._levels[i])},
                index=range(-1, len(out._levels[i])),
            )
        else:
            level = cudf.DataFrame({name: out._levels[i]})
        source_data[name] = libcudf.copying.gather(
            level, codes._data.columns[0]
        )._data[name]
    out._data = source_data._data
    out.names = names
    return out
def _normalize_columns_and_scalars_type(
    frame: Frame, other: Any, inplace: bool = False,
) -> Tuple[Union[Frame, ColumnLike], Any]:
    """
    Try to normalize the other's dtypes as per frame.

    Parameters
    ----------
    frame : Can be a DataFrame or Series or Index
    other : Can be a DataFrame, Series, Index, Array like object or a
        scalar value.
        If frame is a DataFrame, other can only be a scalar, an array
        like with one entry per column of the DataFrame, or a DataFrame
        with the same dimension.
        If frame is a Series (or Index), other can only be a scalar or
        a series like with the same length as frame.
    inplace : bool, default False
        Forwarded unchanged to ``_check_and_cast_columns_with_other``.

    Returns
    -------
    A two-element result ``(source, other)``:

    * DataFrame/DataFrame: shallow copies of both frames with each
      column pair replaced by its cast counterpart.
    * Series/Index: whatever ``_check_and_cast_columns_with_other``
      returns for the single column and `other`.
    * DataFrame with scalar/array-like: a shallow copy of the frame and
      a list holding one normalized `other` per column.

    Raises
    ------
    ValueError
        If `frame` is neither a DataFrame, a Series nor an Index.
    """
    if isinstance(frame, DataFrame) and isinstance(other, DataFrame):
        source_df = frame.copy(deep=False)
        other_df = other.copy(deep=False)
        for col_name in source_df._column_names:
            source_col, other_col = _check_and_cast_columns_with_other(
                source_col=source_df._data[col_name],
                other=other_df._data[col_name],
                inplace=inplace,
            )
            source_df._data[col_name] = source_col
            other_df._data[col_name] = other_col
        return source_df, other_df

    if isinstance(frame, (Series, Index)):
        # Non-scalar replacements are converted to a column first;
        # scalars are passed through untouched.
        if not cudf.api.types.is_scalar(other):
            other = cudf.core.column.as_column(other)
        return _check_and_cast_columns_with_other(
            source_col=frame._data[frame.name],
            other=other,
            inplace=inplace,
        )

    if isinstance(frame, DataFrame):
        # Handles a scalar, or a list/array like of per-column scalars.
        source_df = frame.copy(deep=False)
        # Loop-invariant, so it is evaluated once rather than per column.
        other_is_scalar = cudf.api.types.is_scalar(other)
        others = []
        for i, col_name in enumerate(frame._column_names):
            source_col, other_scalar = _check_and_cast_columns_with_other(
                source_col=source_df._data[col_name],
                other=other if other_is_scalar else other[i],
                inplace=inplace,
            )
            source_df._data[col_name] = source_col
            others.append(other_scalar)
        return source_df, others

    raise ValueError(
        f"Inappropriate input {type(frame)} "
        f"and other {type(other)} combination"
    )
def where(
    frame: Frame,
    cond: Any,
    other: Any = None,
    inplace: bool = False,
) -> Optional[Union[Frame]]:
    """
    Replace values where the condition is False.

    Parameters
    ----------
    frame : DataFrame, Series or Index
        The object whose values are conditionally replaced.
    cond : bool Series/DataFrame, array-like
        Where cond is True, keep the original value.
        Where False, replace with corresponding value from other.
        Callables are not supported.
    other : scalar, list of scalars, Series/DataFrame
        Entries where cond is False are replaced with
        corresponding value from other. Callables are not
        supported. Default is None.

        DataFrame expects only Scalar or array like with scalars or
        dataframe with same dimension as frame.

        Series expects only scalar or series like with same length
    inplace : bool, default False
        Whether to perform the operation in place on the data.

    Returns
    -------
    Same type as caller

    Raises
    ------
    ValueError
        If `cond` does not have the same shape as `frame`, or, for a
        DataFrame `frame`, the number of replacement values differs
        from the number of columns.
    NotImplementedError
        If `frame` is not a DataFrame but `other` is one.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]})
    >>> df.where(df % 2 == 0, [-1, -1])
       A  B
    0 -1 -1
    1  4 -1
    2 -1  8

    >>> ser = cudf.Series([4, 3, 2, 1, 0])
    >>> ser.where(ser > 2, 10)
    0     4
    1     3
    2    10
    3    10
    4    10
    dtype: int64
    >>> ser.where(ser > 2)
    0       4
    1       3
    2    <NA>
    3    <NA>
    4    <NA>
    dtype: int64
    """
    if isinstance(frame, DataFrame):
        # --- Step 1: coerce `cond` into a DataFrame -------------------
        if hasattr(cond, "__cuda_array_interface__"):
            if isinstance(cond, Series):
                # The same Series is used as the condition for every
                # column of `frame`.
                cond = DataFrame(
                    {name: cond for name in frame._column_names},
                    index=frame.index,
                )
            else:
                cond = DataFrame(cond, columns=frame._column_names, index=frame.index)
        elif (hasattr(cond, "__array_interface__")
              and cond.__array_interface__["shape"] != frame.shape):
            raise ValueError("conditional must be same shape as self")
        elif not isinstance(cond, DataFrame):
            cond = frame.from_pandas(pd.DataFrame(cond))
        # --- Step 2: align `cond` with `frame` ------------------------
        common_cols = set(frame._column_names).intersection(
            set(cond._column_names))
        if len(common_cols) > 0:
            # If `frame` and `cond` are having unequal index,
            # then re-index `cond`.
            if not frame.index.equals(cond.index):
                cond = cond.reindex(frame.index)
        else:
            if cond.shape != frame.shape:
                raise ValueError(
                    """Array conditional must be same shape as self""")
            # Setting `frame` column names to `cond`
            # as `cond` has no column names.
            cond._set_column_names_like(frame)
        # --- Step 3: normalize dtypes of `frame` and `other` ----------
        # Note: `inplace` is not forwarded on this path.
        (
            source_df,
            others,
        ) = _normalize_columns_and_scalars_type(frame, other)
        if isinstance(others, Frame):
            others = others._data.columns
        out_df = DataFrame(index=frame.index)
        if len(frame._columns) != len(others):
            raise ValueError(
                """Replacement list length or number of dataframe columns should be equal to Number of columns of dataframe""")
        # --- Step 4: apply the condition column by column -------------
        for i, column_name in enumerate(frame._column_names):
            input_col = source_df._data[column_name]
            other_column = others[i]
            if column_name in cond._data:
                if isinstance(input_col, cudf.core.column.CategoricalColumn):
                    # Categoricals are processed on their integer codes
                    # and rebuilt after copy_if_else below.
                    if cudf.api.types.is_scalar(other_column):
                        try:
                            other_column = input_col._encode(other_column)
                        except ValueError:
                            # When other is not present in categories,
                            # fill with Null.
                            other_column = None
                        other_column = cudf.Scalar(other_column, dtype=input_col.codes.dtype)
                    elif isinstance(other_column, cudf.core.column.CategoricalColumn):
                        other_column = other_column.codes
                    input_col = input_col.codes
                result = cudf._lib.copying.copy_if_else(
                    input_col, other_column, cond._data[column_name])
                # Checked against the original frame column because
                # `input_col` may have been replaced by its codes above.
                if isinstance(
                    frame._data[column_name],
                    cudf.core.column.CategoricalColumn,
                ):
                    result = cudf.core.column.build_categorical_column(
                        categories=frame._data[column_name].categories,
                        codes=cudf.core.column.build_column(
                            result.base_data, dtype=result.dtype),
                        mask=result.base_mask,
                        size=result.size,
                        offset=result.offset,
                        ordered=frame._data[column_name].ordered,
                    )
            else:
                # `cond` has no entry for this column: the whole column
                # is given an all-null mask.
                out_mask = cudf._lib.null_mask.create_null_mask(
                    len(input_col),
                    state=cudf._lib.null_mask.MaskState.ALL_NULL,
                )
                result = input_col.set_mask(out_mask)
            # Wrap the column in the same class that `frame[column_name]`
            # returns.
            out_df[column_name] = frame[column_name].__class__(result)
        return frame._mimic_inplace(out_df, inplace=inplace)
    else:
        # Single-column path (anything that is not a DataFrame).
        frame = cast(SingleColumnFrame, frame)
        if isinstance(other, DataFrame):
            raise NotImplementedError(
                "cannot align with a higher dimensional Frame")
        input_col = frame._data[frame.name]
        cond = cudf.core.column.as_column(cond)
        if len(cond) != len(frame):
            raise ValueError(
                """Array conditional must be same shape as self""")
        # `input_col` is rebound here to the normalized source column.
        (
            input_col,
            other,
        ) = _normalize_columns_and_scalars_type(frame, other, inplace)
        if isinstance(input_col, cudf.core.column.CategoricalColumn):
            # Same codes-based handling as in the DataFrame branch.
            if cudf.api.types.is_scalar(other):
                try:
                    other = input_col._encode(other)
                except ValueError:
                    # When other is not present in categories,
                    # fill with Null.
                    other = None
                other = cudf.Scalar(other, dtype=input_col.codes.dtype)
            elif isinstance(other, cudf.core.column.CategoricalColumn):
                other = other.codes
            input_col = input_col.codes
        result = cudf._lib.copying.copy_if_else(input_col, other, cond)
        # Rebuild the categorical from the resulting codes using the
        # original column's categories and ordering.
        if isinstance(frame._data[frame.name], cudf.core.column.CategoricalColumn):
            result = cudf.core.column.build_categorical_column(
                categories=cast(
                    cudf.core.column.CategoricalColumn,
                    frame._data[frame.name],
                ).categories,
                codes=cudf.core.column.build_column(result.base_data, dtype=result.dtype),
                mask=result.base_mask,
                size=result.size,
                offset=result.offset,
                ordered=cast(
                    cudf.core.column.CategoricalColumn,
                    frame._data[frame.name],
                ).ordered,
            )
        if isinstance(frame, Index):
            result = Index(result, name=frame.name)
        else:
            result = frame._from_data({frame.name: result}, frame._index)
        return frame._mimic_inplace(result, inplace=inplace)