Ejemplo n.º 1
0
def _index_or_values_interpolation(column, index=None):
    """
    Linearly interpolate the NaN entries of a float column.

    The values of ``index`` act as the x coordinates, so the spacing
    between index entries determines the interpolated values. For
    example, the data ``[1.0, NaN, 4.0]`` with index ``[1, 3, 4]``
    yields ``[1.0, 3.0, 4.0]``.

    A column that contains no NaNs, or nothing but NaNs, is returned
    unchanged. NaNs located before the first non-NaN value are left
    as NaN in the result.
    """
    nan_mask = cp.isnan(column)

    # Nothing to fill when there are no NaNs, and nothing to interpolate
    # from when every entry is NaN.
    nan_count = nan_mask.sum()
    if nan_count == 0 or nan_count == len(column):
        return column

    frame = Frame(data={None: column}, index=index)
    valid_rows = frame._apply_boolean_mask(as_column(~nan_mask))

    # The non-NaN rows supply the (x, y) sample points.
    sample_x = valid_rows._index._column.values
    sample_y = valid_rows._data.columns[0].values

    interpolated = cp.interp(frame._index.values, sample_x, sample_y)

    # Position of the first non-NaN entry; everything in front of it is
    # reset to NaN.
    first_valid_pos = (nan_mask == 0).argmax().item()
    interpolated[:first_valid_pos] = np.nan
    return interpolated
Ejemplo n.º 2
0
 def _restore_categorical_keys(self, lhs: Frame,
                               rhs: Frame) -> Tuple[Frame, Frame]:
     """
     Return shallow copies of `lhs` and `rhs` with categorical keys
     cast back to ``"category"``.

     For inner joins, any categorical keys in `self.lhs` and `self.rhs`
     were cast to their category type to produce `lhs` and `rhs`; this
     undoes that cast. For every other join type the copies are
     returned untouched.
     """
     restored_lhs = lhs.copy(deep=False)
     restored_rhs = rhs.copy(deep=False)
     if self.how != "inner":
         return restored_lhs, restored_rhs

     for left_key, right_key in zip(*self._keys):
         # Only key pairs that are categorical on *both* original
         # operands are restored.
         if not isinstance(left_key.get(self.lhs).dtype,
                           cudf.CategoricalDtype):
             continue
         if not isinstance(right_key.get(self.rhs).dtype,
                           cudf.CategoricalDtype):
             continue

         left_key.set(
             restored_lhs,
             left_key.get(restored_lhs).astype("category"),
             validate=False,
         )
         right_key.set(
             restored_rhs,
             right_key.get(restored_rhs).astype("category"),
             validate=False,
         )
     return restored_lhs, restored_rhs
Ejemplo n.º 3
0
 def _sort_result(self, result: Frame) -> Frame:
     """
     Sort the merge result by its key columns.

     Pandas sorts on the key columns in the same order as given in
     'on'. If the indices are used as keys, the index will be sorted.
     If one index is specified, the key columns on the other side
     will be used to sort.
     """
     if self.on:
         if isinstance(result, cudf.BaseIndex):
             order = result._get_sorted_inds()
         else:
             # A list (not a tuple) is required here: _get_sorted_inds
             # calls down to ColumnAccessor.get_by_label, which treats
             # lists and tuples differently.
             on_labels = list(_coerce_to_tuple(self.on))
             order = result._get_sorted_inds(on_labels)
         return result._gather(order, keep_index=False)

     sort_columns = []
     if (self.left_index and self.right_index
             and result._index is not None):
         sort_columns.extend(result._index._data.columns)
     # Left keys first, then right keys.
     for side in ("left_on", "right_on"):
         key_spec = getattr(self, side)
         if key_spec:
             sort_columns.extend(
                 result._data[label]
                 for label in _coerce_to_tuple(key_spec))

     if not sort_columns:
         return result

     order = cudf.DataFrame._from_columns(sort_columns).argsort()
     return result._gather(order)
Ejemplo n.º 4
0
 def _sort_result(self, result: Frame) -> Frame:
     """
     Sort the merge result by its key columns.

     Pandas sorts on the key columns in the same order as given in
     'on'. If the indices are used as keys, the index will be sorted.
     If one index is specified, the key columns on the other side
     will be used to sort.
     """
     left_by_index = self._using_left_index
     right_by_index = self._using_right_index

     sort_keys: List[Any] = []
     if left_by_index and right_by_index and result._index is not None:
         sort_keys.extend(result._index._data.columns)
     if not left_by_index:
         sort_keys.extend(result._data[key.name] for key in self._left_keys)
     if not right_by_index:
         sort_keys.extend(
             result._data[key.name] for key in self._right_keys)

     if not sort_keys:
         return result

     # Positional labels keep the key columns in order inside the
     # temporary frame used to compute the ordering.
     order = cudf.DataFrame._from_data(dict(enumerate(sort_keys))).argsort()

     if isinstance(result, cudf.core._base_index.BaseIndex):
         return result._gather(order, check_bounds=False)

     indexed = cast(cudf.core.indexed_frame.IndexedFrame, result)
     return indexed._gather(
         order,
         keep_index=left_by_index or right_by_index,
         check_bounds=False,
     )
Ejemplo n.º 5
0
 def _match_key_dtypes(self, lhs: Frame, rhs: Frame) -> Tuple[Frame, Frame]:
     """
     Return shallow copies of `lhs` and `rhs` whose key columns have
     matching dtypes.

     Each pair of key columns is passed through `_match_join_keys`; a
     key is written back to its copy only when the helper returned a
     different object from the one it was given.
     """
     matched_lhs = lhs.copy(deep=False)
     matched_rhs = rhs.copy(deep=False)
     for left_key, right_key in zip(*self._keys):
         left_col = left_key.get(lhs)
         right_col = right_key.get(rhs)
         new_left, new_right = _match_join_keys(
             left_col, right_col, how=self.how)

         updates = (
             (left_key, matched_lhs, left_col, new_left),
             (right_key, matched_rhs, right_col, new_right),
         )
         for key, target, before, after in updates:
             if after is not before:
                 key.set(target, after, validate=False)
     return matched_lhs, matched_rhs
Ejemplo n.º 6
0
    def as_frame(self):
        """
        Convert this Column to a Frame.

        Returns
        -------
        Frame
            A Frame whose single entry, keyed by ``None``, is a shallow
            copy of this column.
        """
        # The docstring must be the first statement in the body; placed
        # after this import it was only a discarded string expression and
        # ``as_frame.__doc__`` was None.
        # NOTE(review): the import is local -- presumably to avoid a
        # circular import with cudf.core.frame; confirm before hoisting.
        from cudf.core.frame import Frame

        return Frame({None: self.copy(deep=False)})
Ejemplo n.º 7
0
    def __new__(
        cls,
        levels=None,
        codes=None,
        sortorder=None,
        labels=None,
        names=None,
        dtype=None,
        copy=False,
        name=None,
        **kwargs,
    ) -> "MultiIndex":
        """
        Construct a MultiIndex from ``levels``/``codes`` or from a
        ``source_data`` frame passed through ``kwargs``.

        Parameters
        ----------
        levels : sequence, optional
            One entry per level; each is wrapped in a ``cudf.Series``.
        codes : cudf.DataFrame or sequence of sequences, optional
            Per-level codes. A code of ``-1`` produces a null entry in
            the corresponding output column.
        sortorder : optional
            Not supported; any value other than None raises.
        labels : optional
            Deprecated alias for ``codes``; emits a ``FutureWarning``.
        names : optional
            Names of the levels.
        dtype : optional
            Accepted but not referenced in this method.
        copy : bool, default False
            If True, copy ``codes`` when it is a ``cudf.DataFrame`` and
            copy ``levels`` when its first element is a ``cudf.Series``.
        name : optional
            Not supported; any value other than None raises.
        **kwargs
            ``source_data`` (a frame whose columns become the index
            columns) and ``nan_as_null`` (forwarded to
            ``cudf.DataFrame.from_pandas``) are read from here.

        Raises
        ------
        NotImplementedError
            If ``sortorder`` or ``name`` is given.
        ValueError
            If ``levels`` is empty, or ``levels`` and ``codes`` differ
            in length.
        TypeError
            If ``codes`` is neither a ``cudf.DataFrame`` nor a sequence
            whose first element is a sequence/ndarray.
        """

        if sortorder is not None:
            raise NotImplementedError("sortorder is not yet supported")

        if name is not None:
            raise NotImplementedError(
                "Use `names`, `name` is not yet supported"
            )

        # Allocate through Frame.__new__ and run the initializer that
        # follows Index in the MRO of ``out``.
        out = Frame.__new__(cls)
        super(Index, out).__init__()

        if copy:
            if isinstance(codes, cudf.DataFrame):
                codes = codes.copy()
            # NOTE(review): ``len(levels)`` raises TypeError if ``levels``
            # is None here (e.g. copy=True together with source_data).
            if len(levels) > 0 and isinstance(levels[0], cudf.Series):
                levels = [level.copy() for level in levels]

        out._name = None

        column_names = []
        if labels:
            warnings.warn(
                "the 'labels' keyword is deprecated, use 'codes' " "instead",
                FutureWarning,
            )
        # ``labels`` is only used when ``codes`` is falsy.
        if labels and not codes:
            codes = labels

        # early termination enables lazy evaluation of codes
        if "source_data" in kwargs:
            # Shallow copy so resetting the index below does not touch
            # the caller's object.
            source_data = kwargs["source_data"].copy(deep=False)
            source_data.reset_index(drop=True, inplace=True)

            if isinstance(source_data, pd.DataFrame):
                nan_as_null = kwargs.get("nan_as_null", None)
                source_data = cudf.DataFrame.from_pandas(
                    source_data, nan_as_null=nan_as_null
                )
            names = names if names is not None else source_data._data.names
            # if names are unique
            # try using those as the source_data column names:
            if len(dict.fromkeys(names)) == len(names):
                source_data.columns = names
            out._data = source_data._data
            out.names = names
            # Stored exactly as passed in (possibly None); nothing is
            # computed from source_data here.
            out._codes = codes
            out._levels = levels
            return out

        # name setup
        # Positional integers are used as column names when ``names`` is
        # None or contains more than one None entry.
        if isinstance(names, (Sequence, pd.core.indexes.frozen.FrozenList,),):
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        if len(levels) == 0:
            raise ValueError("Must pass non-zero number of levels/codes")

        if not isinstance(codes, cudf.DataFrame) and not isinstance(
            codes[0], (Sequence, np.ndarray)
        ):
            raise TypeError("Codes is not a Sequence of sequences")

        if isinstance(codes, cudf.DataFrame):
            out._codes = codes
        elif len(levels) == len(codes):
            out._codes = cudf.DataFrame()
            # NOTE: the loop variable rebinds ``codes`` (and ``name``
            # below); enumerate() already holds the original iterable, so
            # iteration itself is unaffected.
            for i, codes in enumerate(codes):
                # Fall back to the position when the supplied name is
                # falsy (e.g. None).
                name = column_names[i] or i
                codes = column.as_column(codes)
                out._codes[name] = codes.astype(np.int64)
        else:
            raise ValueError(
                "MultiIndex has unequal number of levels and "
                "codes and is inconsistent!"
            )

        out._levels = [cudf.Series(level) for level in levels]
        out._validate_levels_and_codes(out._levels, out._codes)

        # Materialize the index columns: for every level, gather the
        # level's values at the positions given by that level's codes.
        source_data = cudf.DataFrame()
        for i, name in enumerate(out._codes.columns):
            codes = as_index(out._codes[name]._column)
            if -1 in out._codes[name].values:
                # Must account for null(s) in _source_data column
                # A leading None is prepended and the lookup table's
                # index starts at -1.
                level = cudf.DataFrame(
                    {name: [None] + list(out._levels[i])},
                    index=range(-1, len(out._levels[i])),
                )
            else:
                level = cudf.DataFrame({name: out._levels[i]})

            source_data[name] = libcudf.copying.gather(
                level, codes._data.columns[0]
            )._data[name]

        out._data = source_data._data
        out.names = names

        return out
Ejemplo n.º 8
0
def _normalize_columns_and_scalars_type(
    frame: Frame,
    other: Any,
    inplace: bool = False,
) -> Tuple[Union[Frame, ColumnLike], Any, ]:
    """
    Try to normalize the other's dtypes as per frame.

    Parameters
    ----------
    frame : Can be a DataFrame or Series or Index
    other : Can be a DataFrame, Series, Index, Array
        like object or a scalar value

        if frame is DataFrame, other can be only a
        scalar or array like with size of number of columns
        in DataFrame or a DataFrame with same dimension

        if frame is Series, other can be only a scalar or
        a series like with same length as frame
    inplace : bool, default False
        Forwarded unchanged to ``_check_and_cast_columns_with_other``.

    Returns
    -------
    tuple
        ``(source, other)`` after normalization:

        * DataFrame ``frame`` and DataFrame ``other``: two DataFrames
          (shallow copies with the cast columns assigned).
        * Series/Index ``frame``: whatever
          ``_check_and_cast_columns_with_other`` returns for the single
          column of ``frame`` and ``other``.
        * DataFrame ``frame`` with scalar or array-like ``other``: a
          DataFrame and a list holding one normalized ``other`` value
          per column.

    Raises
    ------
    ValueError
        If ``frame`` is not a DataFrame, Series or Index.
    """
    if isinstance(frame, DataFrame) and isinstance(other, DataFrame):
        source_df = frame.copy(deep=False)
        other_df = other.copy(deep=False)
        # Columns are paired by name, so ``other`` must contain every
        # column name present in ``frame``.
        for col_name in source_df._column_names:
            source_col, other_col = _check_and_cast_columns_with_other(
                source_col=source_df._data[col_name],
                other=other_df._data[col_name],
                inplace=inplace,
            )
            source_df._data[col_name] = source_col
            other_df._data[col_name] = other_col
        return source_df, other_df

    if isinstance(frame, (Series, Index)):
        # Non-scalar replacements are converted to a column first;
        # scalars are passed through untouched.
        if not cudf.api.types.is_scalar(other):
            other = cudf.core.column.as_column(other)
        return _check_and_cast_columns_with_other(
            source_col=frame._data[frame.name],
            other=other,
            inplace=inplace,
        )

    if isinstance(frame, DataFrame):
        # Handles scalar or list/array like scalars
        source_df = frame.copy(deep=False)
        other_is_scalar = cudf.api.types.is_scalar(other)
        others = []
        for i, col_name in enumerate(frame._column_names):
            source_col, other_value = _check_and_cast_columns_with_other(
                source_col=source_df._data[col_name],
                # A scalar applies to every column; otherwise take the
                # i-th replacement value positionally.
                other=other if other_is_scalar else other[i],
                inplace=inplace,
            )
            source_df._data[col_name] = source_col
            others.append(other_value)
        return source_df, others

    raise ValueError(f"Inappropriate input {type(frame)} "
                     f"and other {type(other)} combination")
Ejemplo n.º 9
0
def where(
    frame: Frame,
    cond: Any,
    other: Any = None,
    inplace: bool = False,
) -> Optional[Union[Frame]]:
    """
    Replace values where the condition is False.

    Parameters
    ----------
    frame : DataFrame, Series or Index
        The object whose values are conditionally replaced.
    cond : bool Series/DataFrame, array-like
        Where cond is True, keep the original value.
        Where False, replace with corresponding value from other.
        Callables are not supported.
    other: scalar, list of scalars, Series/DataFrame
        Entries where cond is False are replaced with
        corresponding value from other. Callables are not
        supported. Default is None.

        DataFrame expects only Scalar or array like with scalars or
        dataframe with same dimension as frame.

        Series expects only scalar or series like with same length
    inplace : bool, default False
        Whether to perform the operation in place on the data.

    Returns
    -------
    Same type as caller
        The value returned by ``frame._mimic_inplace(result,
        inplace=inplace)``.

    Raises
    ------
    ValueError
        If ``cond`` does not match the shape/length of ``frame``, or if
        the number of replacement values differs from the number of
        columns of a DataFrame ``frame``.
    NotImplementedError
        If ``frame`` is not a DataFrame but ``other`` is a DataFrame.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]})
    >>> df.where(df % 2 == 0, [-1, -1])
       A  B
    0 -1 -1
    1  4 -1
    2 -1  8

    >>> ser = cudf.Series([4, 3, 2, 1, 0])
    >>> ser.where(ser > 2, 10)
    0     4
    1     3
    2    10
    3    10
    4    10
    dtype: int64
    >>> ser.where(ser > 2)
    0       4
    1       3
    2    <NA>
    3    <NA>
    4    <NA>
    dtype: int64
    """

    if isinstance(frame, DataFrame):
        # Step 1: coerce ``cond`` into a DataFrame.
        if hasattr(cond, "__cuda_array_interface__"):
            if isinstance(cond, Series):
                # A Series condition is repeated for every column of
                # ``frame``.
                cond = DataFrame(
                    {name: cond
                     for name in frame._column_names},
                    index=frame.index,
                )
            else:
                cond = DataFrame(cond,
                                 columns=frame._column_names,
                                 index=frame.index)
        elif (hasattr(cond, "__array_interface__")
              and cond.__array_interface__["shape"] != frame.shape):
            raise ValueError("conditional must be same shape as self")
        elif not isinstance(cond, DataFrame):
            cond = frame.from_pandas(pd.DataFrame(cond))

        # Step 2: align ``cond`` with ``frame`` -- by index when they
        # share at least one column name, otherwise by position.
        common_cols = set(frame._column_names).intersection(
            set(cond._column_names))
        if len(common_cols) > 0:
            # If `frame` and `cond` are having unequal index,
            # then re-index `cond`.
            if not frame.index.equals(cond.index):
                cond = cond.reindex(frame.index)
        else:
            if cond.shape != frame.shape:
                raise ValueError(
                    """Array conditional must be same shape as self""")
            # Setting `frame` column names to `cond`
            # as `cond` has no column names.
            cond._set_column_names_like(frame)

        # Step 3: normalize dtypes of the source columns and of ``other``.
        # NOTE(review): ``inplace`` is not forwarded here, unlike in the
        # single-column branch below -- confirm this is intentional.
        (
            source_df,
            others,
        ) = _normalize_columns_and_scalars_type(frame, other)
        if isinstance(others, Frame):
            others = others._data.columns

        out_df = DataFrame(index=frame.index)
        if len(frame._columns) != len(others):
            raise ValueError(
                """Replacement list length or number of dataframe columns
                should be equal to Number of columns of dataframe""")
        # Step 4: build the result column by column.
        for i, column_name in enumerate(frame._column_names):
            input_col = source_df._data[column_name]
            other_column = others[i]
            if column_name in cond._data:
                if isinstance(input_col, cudf.core.column.CategoricalColumn):
                    # Categorical columns are processed through their
                    # integer codes and rebuilt afterwards.
                    if cudf.api.types.is_scalar(other_column):
                        try:
                            other_column = input_col._encode(other_column)
                        except ValueError:
                            # When other is not present in categories,
                            # fill with Null.
                            other_column = None
                        other_column = cudf.Scalar(other_column,
                                                   dtype=input_col.codes.dtype)
                    elif isinstance(other_column,
                                    cudf.core.column.CategoricalColumn):
                        other_column = other_column.codes
                    input_col = input_col.codes

                result = cudf._lib.copying.copy_if_else(
                    input_col, other_column, cond._data[column_name])

                if isinstance(
                        frame._data[column_name],
                        cudf.core.column.CategoricalColumn,
                ):
                    # Re-wrap the resulting codes with the categories and
                    # ordering of the original column.
                    result = cudf.core.column.build_categorical_column(
                        categories=frame._data[column_name].categories,
                        codes=cudf.core.column.build_column(
                            result.base_data, dtype=result.dtype),
                        mask=result.base_mask,
                        size=result.size,
                        offset=result.offset,
                        ordered=frame._data[column_name].ordered,
                    )
            else:
                # No condition exists for this column: the result is the
                # input column with an all-null mask.
                out_mask = cudf._lib.null_mask.create_null_mask(
                    len(input_col),
                    state=cudf._lib.null_mask.MaskState.ALL_NULL,
                )
                result = input_col.set_mask(out_mask)
            out_df[column_name] = frame[column_name].__class__(result)

        return frame._mimic_inplace(out_df, inplace=inplace)

    else:
        # Single-column path (Series / Index).
        frame = cast(SingleColumnFrame, frame)
        if isinstance(other, DataFrame):
            raise NotImplementedError(
                "cannot align with a higher dimensional Frame")
        input_col = frame._data[frame.name]
        cond = cudf.core.column.as_column(cond)
        if len(cond) != len(frame):
            raise ValueError(
                """Array conditional must be same shape as self""")

        (
            input_col,
            other,
        ) = _normalize_columns_and_scalars_type(frame, other, inplace)

        if isinstance(input_col, cudf.core.column.CategoricalColumn):
            # Same categorical handling as the DataFrame path: operate on
            # the codes, rebuild the categorical column afterwards.
            if cudf.api.types.is_scalar(other):
                try:
                    other = input_col._encode(other)
                except ValueError:
                    # When other is not present in categories,
                    # fill with Null.
                    other = None
                other = cudf.Scalar(other, dtype=input_col.codes.dtype)
            elif isinstance(other, cudf.core.column.CategoricalColumn):
                other = other.codes

            input_col = input_col.codes

        result = cudf._lib.copying.copy_if_else(input_col, other, cond)

        if isinstance(frame._data[frame.name],
                      cudf.core.column.CategoricalColumn):
            result = cudf.core.column.build_categorical_column(
                categories=cast(
                    cudf.core.column.CategoricalColumn,
                    frame._data[frame.name],
                ).categories,
                codes=cudf.core.column.build_column(result.base_data,
                                                    dtype=result.dtype),
                mask=result.base_mask,
                size=result.size,
                offset=result.offset,
                ordered=cast(
                    cudf.core.column.CategoricalColumn,
                    frame._data[frame.name],
                ).ordered,
            )

        # Wrap the resulting column back into the caller's container
        # type, keeping its name (and, for non-Index frames, its index).
        if isinstance(frame, Index):
            result = Index(result, name=frame.name)
        else:
            result = frame._from_data({frame.name: result}, frame._index)

        return frame._mimic_inplace(result, inplace=inplace)