Esempio n. 1
0
    def _can_downcast_to_series(self, df, arg):
        """
        Decide whether a loc/iloc result should be "downcasted" from a
        DataFrame to a Series.

        Parameters
        ----------
        df : the object produced by the selection
        arg : the ``(row, column)`` indexer pair used for the selection

        Returns
        -------
        bool
            ``False`` if ``df`` already is a Series or no downcast rule
            below applies, ``True`` otherwise.
        """
        from cudf.core.column import as_column

        if isinstance(df, cudf.Series):
            return False

        row_count, col_count = df.shape

        if row_count == 1:
            if isinstance(arg[0], slice):
                # a row slice only collapses when the column key is scalar
                if not is_scalar(arg[1]):
                    return False
            elif pd.api.types.is_bool_dtype(as_column(arg[0]).dtype):
                # row selection using boolean indexing - never downcasts
                return False
            # a single row collapses only when every column is numeric;
            # otherwise fall through to the single-column rules below
            if all(
                pd.api.types.is_numeric_dtype(dt)
                for dt in df.dtypes.values.tolist()
            ):
                return True

        if col_count != 1:
            return False

        col_key = arg[1]
        if isinstance(col_key, slice) and not is_scalar(arg[0]):
            return False
        # Multiindex indexing with a slice
        if isinstance(col_key, tuple) and any(
            isinstance(part, slice) for part in arg
        ):
            return False
        return True
Esempio n. 2
0
    def _downcast_to_series(self, df, arg):
        """
        "Downcast" a DataFrame selection to a Series based on Pandas
        indexing rules.

        A 1x1 selection addressed by two scalar keys is returned as the
        single element itself. Otherwise the first column is returned
        (column-wise), or the first column of the dtype-normalized
        transpose (row-wise).

        Raises
        ------
        ValueError
            If ``df`` has neither exactly one row nor exactly one column.
        """
        n_rows, n_cols = df.shape

        # determine whether the Series is taken column-wise or row-wise
        if n_rows == 1 and n_cols == 1:
            row_is_scalar = is_scalar(arg[0])
            if row_is_scalar and is_scalar(arg[1]):
                return df[df.columns[0]][0]
            take_column = not row_is_scalar
        elif n_rows == 1:
            take_column = False
        elif n_cols == 1:
            take_column = True
        else:
            raise ValueError("Cannot downcast DataFrame selection to Series")

        if take_column:
            return df[df._data.names[0]]

        # row-wise: normalize dtypes, transpose, and take the first column
        transposed = _normalize_dtypes(df).T
        return transposed[transposed._data.names[0]]
Esempio n. 3
0
 def __setitem__(self, key, value):
     """
     Assign ``value`` at the label(s) ``key``.

     The labels are first translated to positions. If that lookup raises
     KeyError and both ``key`` and ``value`` are scalars on a
     non-MultiIndex series, the pair is appended in place as a new row;
     otherwise the KeyError is re-raised.
     """
     try:
         key = self._loc_to_iloc(key)
     except KeyError as err:
         can_append = (
             is_scalar(key)
             and not isinstance(self._sr.index, cudf.MultiIndex)
             and is_scalar(value)
         )
         if not can_append:
             raise err
         _append_new_row_inplace(self._sr.index._values, key)
         _append_new_row_inplace(self._sr._column, value)
         return

     if isinstance(value, (pd.Series, cudf.Series)):
         # align series-like values with the target index before assigning
         value = cudf.Series(value)._align_to_index(
             self._sr.index, how="right"
         )
     self._sr.iloc[key] = value
Esempio n. 4
0
    def _loc_to_iloc(self, arg):
        """
        Translate a label-based indexer into a positional one.

        Parameters
        ----------
        arg : scalar, slice, Series, or array-like
            Array-likes (list, ndarray, pandas Series, range, Index,
            DeviceNDArray) are first wrapped in a Series; an empty one
            becomes an empty int32 Series.

        Returns
        -------
        The boolean Series unchanged for a boolean mask, the result of
        ``indices_from_labels`` for other Series, the first element of
        ``find_label_range(arg, None)`` for a scalar, or a positional
        ``slice`` for a slice.

        Raises
        ------
        NotImplementedError
            For any other indexer type.
        """
        from cudf.core.series import Series
        from cudf.core.index import Index

        if isinstance(
            arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)
        ):
            if len(arg) == 0:
                arg = Series(np.array([], dtype="int32"))
            else:
                arg = Series(arg)

        if isinstance(arg, Series):
            # ``np.bool`` was only an alias of the builtin ``bool`` and no
            # longer exists in current NumPy, so compare against ``bool``.
            if arg.dtype in (bool, np.bool_):
                return arg
            return indices_from_labels(self._sr, arg)

        if is_scalar(arg):
            return self._sr.index.find_label_range(arg, None)[0]

        if isinstance(arg, slice):
            start_index, stop_index = self._sr.index.find_label_range(
                arg.start, arg.stop
            )
            return slice(start_index, stop_index, arg.step)

        raise NotImplementedError(
            f".loc not implemented for label type {type(arg).__name__}"
        )
Esempio n. 5
0
    def __setitem__(self, key, value):
        """
        Assign ``value`` at the positional indexer ``key``.

        Scalars are converted with ``to_cudf_compatible_scalar``. Other
        values are converted to a column, except list/dict values aimed at
        a list- or struct-typed column, which are passed through as-is.
        For non-decimal numeric values written with a non-integer key into
        a column that is neither decimal nor categorical, both the value
        and the column are cast to their common NumPy result type first.
        """
        from cudf.core.column import column

        if isinstance(key, tuple):
            key = list(key)

        # coerce value into a scalar or column
        if is_scalar(value):
            value = to_cudf_compatible_scalar(value)
        else:
            keep_container = isinstance(value, (list, dict)) and isinstance(
                self._sr._column.dtype, (cudf.ListDtype, cudf.StructDtype)
            )
            if not keep_container:
                value = column.as_column(value)

        # normalize types if necessary
        needs_promotion = (
            not isinstance(
                self._sr._column.dtype,
                (cudf.Decimal64Dtype, cudf.CategoricalDtype),
            )
            and hasattr(value, "dtype")
            and _is_non_decimal_numeric_dtype(value.dtype)
            and not is_integer(key)
        )
        if needs_promotion:
            common = np.result_type(value.dtype, self._sr._column.dtype)
            value = value.astype(common)
            self._sr._column._mimic_inplace(
                self._sr._column.astype(common), inplace=True
            )

        self._sr._column[key] = value
Esempio n. 6
0
    def fillna(self, fill_value):
        """
        Replace nulls in this column with ``fill_value``.

        A scalar ``np.timedelta64`` is cast, together with the column, to
        the dtype chosen by ``determine_out_dtype``. Any other scalar that
        is not a ``Scalar`` is wrapped in ``np.timedelta64``. Non-scalars
        are converted to a column.
        """
        target = self
        if not is_scalar(fill_value):
            fill_value = column.as_column(fill_value, nan_as_null=False)
        elif isinstance(fill_value, np.timedelta64):
            common = determine_out_dtype(self.dtype, fill_value.dtype)
            fill_value = fill_value.astype(common)
            target = target.astype(common)
        elif not isinstance(fill_value, Scalar):
            fill_value = np.timedelta64(fill_value)

        filled = libcudf.replace.replace_nulls(target, fill_value)

        fills_with_nat = isinstance(fill_value, np.timedelta64) and np.isnat(
            fill_value
        )
        if not fills_with_nat:
            return filled

        # If the value we are filling is np.timedelta64("NAT")
        # we set the same mask as current column.
        # However where there are "<NA>" in the
        # columns, their corresponding locations
        # in base_data will contain min(int64) values.
        return column.build_column(
            data=filled.base_data,
            dtype=filled.dtype,
            mask=self.base_mask,
            size=filled.size,
            offset=filled.offset,
            children=filled.base_children,
        )
Esempio n. 7
0
    def __setitem__(self, key, value):
        """
        Assign ``value`` at the positional indexer ``key``.

        Scalars are converted with ``to_cudf_compatible_scalar``; anything
        else is converted to a column. For numeric values written with a
        non-integer key into a non-categorical column, both the value and
        the column are cast to their common NumPy result type first.
        """
        from cudf.core.column import column

        if isinstance(key, tuple):
            key = list(key)

        # coerce value into a scalar or column
        value = (
            to_cudf_compatible_scalar(value)
            if is_scalar(value)
            else column.as_column(value)
        )

        # normalize types if necessary
        needs_promotion = (
            not is_categorical_dtype(self._sr._column.dtype)
            and hasattr(value, "dtype")
            and pd.api.types.is_numeric_dtype(value.dtype)
            and not pd.api.types.is_integer(key)
        )
        if needs_promotion:
            common = np.result_type(value.dtype, self._sr._column.dtype)
            value = value.astype(common)
            self._sr._column._mimic_inplace(
                self._sr._column.astype(common), inplace=True
            )

        self._sr._column[key] = value
Esempio n. 8
0
    def _loc_to_iloc(self, arg):
        """
        Translate a label-based indexer into a positional one.

        Scalars are looked up with ``find_first_value`` on the index
        values, slices go through ``get_label_range_or_mask``, and
        MultiIndex keys (pandas ones are converted to cuDF first) go
        through ``indices_from_labels``. Anything else is wrapped in a
        Series: a boolean one is returned unchanged, others are resolved
        with ``indices_from_labels``.

        Raises
        ------
        KeyError
            If the scalar lookup fails, or if the resolved positions
            contain nulls.
        """
        from cudf.core.column import column
        from cudf.core.series import Series

        if is_scalar(arg):
            try:
                return self._sr.index._values.find_first_value(
                    arg, closest=False
                )
            except (TypeError, KeyError, IndexError, ValueError):
                raise KeyError("label scalar is out of bound")

        if isinstance(arg, slice):
            return get_label_range_or_mask(
                self._sr.index, arg.start, arg.stop, arg.step
            )

        if isinstance(arg, (cudf.MultiIndex, pd.MultiIndex)):
            labels = (
                cudf.MultiIndex.from_pandas(arg)
                if isinstance(arg, pd.MultiIndex)
                else arg
            )
            return indices_from_labels(self._sr, labels)

        labels = Series(column.as_column(arg))
        if labels.dtype in (bool, np.bool_):
            return labels

        positions = indices_from_labels(self._sr, labels)
        if positions.null_count > 0:
            raise KeyError("label scalar is out of bound")
        return positions
Esempio n. 9
0
    def __getitem__(self, index):
        """
        Select from this range index.

        Parameters
        ----------
        index : slice, number, list, ndarray, or other column-like

        Returns
        -------
        For a slice: an empty ``RangeIndex`` when nothing is selected, a
        ``RangeIndex`` for a unit step, or the result of
        ``index_from_range`` for any other step. For a number: the
        normalized position offset by ``self._start``. Otherwise the
        selected values wrapped with ``as_index`` under this index's name.
        """
        from numbers import Number

        if isinstance(index, slice):
            start, stop, step = index.indices(len(self))
            # Exact count of selected positions. Floor division
            # ``(stop - start) // step`` under-counts when the step does
            # not divide the span (e.g. ``[0:1:2]`` selects one element
            # but floors to 0), which wrongly produced an empty index.
            sln = len(range(start, stop, step))
            start += self._start
            stop += self._start
            if sln == 0:
                return RangeIndex(0, None, self.name)
            elif step == 1:
                return RangeIndex(start, stop, self.name)
            else:
                return index_from_range(start, stop, step)

        elif isinstance(index, Number):
            index = utils.normalize_index(index, len(self))
            index += self._start
            return index
        elif isinstance(index, (list, np.ndarray)):
            index = np.asarray(index)
            index = rmm.to_device(index)

        else:
            if is_scalar(index):
                # cast the scalar to the type chosen by min_signed_type
                index = min_signed_type(index)(index)
            index = column.as_column(index)

        return as_index(self._values[index], name=self.name)
Esempio n. 10
0
    def _setitem_tuple_arg(self, key, value):
        """
        Assign ``value`` at ``key = (rows, columns)``.

        If the column selection succeeds, the assignment is delegated to
        each selected column's ``.loc``. If it raises KeyError, a new
        column is built from ``value`` and inserted under ``key[1]``.

        Raises
        ------
        NotImplementedError
            If the frame's index is a cuDF MultiIndex or its columns are a
            pandas MultiIndex.
        """
        has_multiindex = isinstance(
            self._df.index, cudf.MultiIndex
        ) or isinstance(self._df.columns, pd.MultiIndex)
        if has_multiindex:
            raise NotImplementedError(
                "Setting values using df.loc[] not supported on "
                "DataFrames with a MultiIndex"
            )

        try:
            columns = self._get_column_selection(key[1])
        except KeyError:
            row_key = key[0]
            frame_is_empty = self._df.empty

            # work out the index of the new column
            if not isinstance(row_key, slice):
                idx = cudf.Index(row_key)
            elif frame_is_empty:
                idx = None
            else:
                pos_range = get_label_range_or_mask(
                    self._df.index, row_key.start, row_key.stop, row_key.step
                )
                idx = self._df.index[pos_range]

            if is_scalar(value):
                # broadcast the scalar to the length of the new index
                length = 1 if idx is None else len(idx)
                value = as_column(value, length=length)

            new_col = cudf.Series(value, index=idx)
            if frame_is_empty:
                # an empty frame takes its index from the new column
                self._df.index = (
                    idx if idx is not None else cudf.RangeIndex(len(new_col))
                )
            else:
                new_col = new_col._align_to_index(self._df.index, how="right")
            self._df._data.insert(key[1], new_col)
        else:
            for col in columns:
                self._df[col].loc[key[0]] = value
Esempio n. 11
0
 def _get_column_selection(self, arg):
     """
     Return the column labels selected by ``arg``.

     MultiIndex columns are delegated to ``_get_column_major``. Otherwise
     the columns are indexed with ``arg``, and a scalar ``arg`` has its
     single result wrapped in a list.
     """
     cols = self._df.columns
     if isinstance(cols, cudf.MultiIndex):
         return cols._get_column_major(self._df, arg)
     picked = cols[arg]
     return [picked] if is_scalar(arg) else picked
Esempio n. 12
0
 def normalize_binop_value(self, other):
     """
     Normalize the right-hand operand of a binary operation.

     Returns
     -------
     A ``cudf.Scalar`` built from ``Decimal(other)`` for a scalar ``int``
     or ``Decimal``, or ``other`` unchanged when it already is a
     ``cudf.Scalar`` with a ``Decimal64Dtype``.

     Raises
     ------
     TypeError
         For any other operand.
     """
     # ``np.int`` was only an alias of the builtin ``int`` and no longer
     # exists in current NumPy; checking ``int`` accepts the same objects.
     if is_scalar(other) and isinstance(other, (int, Decimal)):
         return cudf.Scalar(Decimal(other))
     elif isinstance(other, cudf.Scalar) and isinstance(
         other.dtype, cudf.Decimal64Dtype
     ):
         return other
     else:
         raise TypeError(f"cannot normalize {type(other)}")
Esempio n. 13
0
 def __getitem__(self, arg):
     """
     Positional selection on the wrapped series.

     A tuple indexer is converted to a list. A scalar or ``None`` result
     from the column is returned directly; otherwise a new series is
     built from the selected data and the matching index entries.
     """
     positions = list(arg) if isinstance(arg, tuple) else arg
     selected = self._sr._column[positions]
     if selected is None or is_scalar(selected):
         return selected
     return self._sr._copy_construct(
         data=selected, index=self._sr.index.take(positions)
     )
Esempio n. 14
0
    def _getitem_tuple_arg(self, arg):
        """
        Positional selection with a two-element indexer.

        ``arg[0]`` selects rows and ``arg[1]`` selects columns. The
        columns are gathered first, then the rows. The result is returned
        as a frame, or collapsed via ``_downcast_to_series`` when
        ``_can_downcast_to_series`` allows it. With a MultiIndex row
        index and no slice in either position, a one-row result is
        returned as ``df.iloc[0]``.
        """
        from cudf import MultiIndex
        from cudf.core.column import column
        from cudf.core.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        columns_df = self._get_column_selection(arg[1])
        # the column selection carries over the parent frame's index
        columns_df._index = self._df._index

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            # MultiIndex rows: this branch always returns, so steps 3 and
            # 4 below only apply to the non-MultiIndex path
            if isinstance(arg[0], slice):
                df = columns_df[arg[0]]
            else:
                df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (
                isinstance(arg[0], slice) or isinstance(arg[1], slice)
            ):
                # Pandas returns a numpy scalar in this case
                return df.iloc[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            if isinstance(arg[0], slice):
                df = columns_df._slice(arg[0])
            elif is_scalar(arg[0]):
                index = arg[0]
                # wrap a negative position around the row count
                if index < 0:
                    index += len(columns_df)
                # a scalar row is taken as a one-row slice
                df = columns_df._slice(slice(index, index + 1, 1))
            else:
                # rebind arg so later steps see the row key as a column
                arg = (column.as_column(arg[0]), arg[1])
                if pd.api.types.is_bool_dtype(arg[0]):
                    df = columns_df._apply_boolean_mask(arg[0])
                else:
                    df = columns_df._gather(arg[0])

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)

        # a fully empty result of a row slice gets a RangeIndex rebuilt
        # from the slice bounds resolved against the parent's length
        if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
            from cudf.core.index import RangeIndex

            slice_len = len(self._df)
            # NOTE: ``step`` is unpacked but not used below
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df
Esempio n. 15
0
    def fillna(self, fill_value):
        """
        Replace nulls with ``fill_value`` and return a column without a
        mask.

        A scalar is converted to ``np.datetime64`` in this column's time
        unit; anything else is converted to a column.
        """
        replacement = (
            np.datetime64(fill_value, self.time_unit)
            if is_scalar(fill_value)
            else column.as_column(fill_value, nan_as_null=False)
        )
        filled = libcudf.replace.replace_nulls(self, replacement)
        # rebuild from data and dtype only, dropping the mask
        return column.build_column(filled.data, filled.dtype, mask=None)
Esempio n. 16
0
    def fillna(self, fill_value, inplace=False):
        """
        Replace nulls with ``fill_value``, drop the mask, and hand the
        result to ``_mimic_inplace`` together with ``inplace``.

        A scalar is converted to ``np.datetime64`` in this column's time
        unit; anything else is converted to a column.
        """
        replacement = (
            np.datetime64(fill_value, self.time_unit)
            if is_scalar(fill_value)
            else column.as_column(fill_value, nan_as_null=False)
        )
        filled = libcudf.replace.replace_nulls(self, replacement)
        unmasked = filled.replace(mask=None)
        return self._mimic_inplace(unmasked, inplace)
Esempio n. 17
0
    def fillna(self, fill_value=None, method=None):
        """
        Fill nulls, delegating to the parent implementation.

        A NaT ``fill_value`` is handled by ``_fillna_natwise``. A scalar
        that is not already a ``cudf.Scalar`` is wrapped in one with this
        column's dtype; a non-scalar is converted to a column. ``None``
        is passed to the parent untouched.
        """
        if fill_value is None:
            return super().fillna(fill_value, method)

        if cudf.utils.utils.isnat(fill_value):
            return _fillna_natwise(self)

        if not is_scalar(fill_value):
            fill_value = column.as_column(fill_value, nan_as_null=False)
        elif not isinstance(fill_value, cudf.Scalar):
            fill_value = cudf.Scalar(fill_value, dtype=self.dtype)

        return super().fillna(fill_value, method)
Esempio n. 18
0
    def fillna(self, fill_value):
        """
        Replace nulls in this column with ``fill_value``.

        A NaT ``fill_value`` is handled by ``_fillna_natwise``. A scalar
        that is not already a ``cudf.Scalar`` is wrapped in one with this
        column's dtype; a non-scalar is converted to a column.
        """
        if cudf.utils.utils.isnat(fill_value):
            return _fillna_natwise(self)

        if not is_scalar(fill_value):
            fill_value = column.as_column(fill_value, nan_as_null=False)
        elif not isinstance(fill_value, cudf.Scalar):
            fill_value = cudf.Scalar(fill_value, dtype=self.dtype)

        return libcudf.replace.replace_nulls(self, fill_value)
Esempio n. 19
0
    def __getitem__(self, arg):
        """
        Select column(s) from the grouped frame.

        A scalar ``arg`` is forwarded to ``__getattr__``. Otherwise the
        listed columns are selected from the frame and grouped again with
        this groupby's level, ``as_index``, ``sort`` and ``dropna``
        settings. The key columns are passed as ``by`` only when no level
        is set.
        """
        if is_scalar(arg):
            return self.__getattr__(arg)

        selected = list(arg)
        grouper = self._groupby
        by = grouper.key_columns if grouper.level is None else None
        return self._df[selected].groupby(
            by=by,
            level=grouper.level,
            as_index=grouper.as_index,
            sort=grouper.sort,
            dropna=grouper.dropna,
        )
Esempio n. 20
0
    def fillna(self, fill_value):
        """
        Replace nulls in this column with ``fill_value``.

        A NaT ``fill_value`` is handled by ``_fillna_natwise``. A scalar
        ``np.timedelta64`` is cast, together with the column, to the dtype
        chosen by ``determine_out_dtype``. Any scalar that is not already
        a ``cudf.Scalar`` is then wrapped in one. Non-scalars are
        converted to a column.
        """
        if cudf.utils.utils.isnat(fill_value):
            return _fillna_natwise(self)
        col = self
        if is_scalar(fill_value):
            # Default to the column's own dtype so that scalars other than
            # np.timedelta64 do not hit an unbound ``dtype`` below.
            dtype = self.dtype
            if isinstance(fill_value, np.timedelta64):
                dtype = determine_out_dtype(self.dtype, fill_value.dtype)
                fill_value = fill_value.astype(dtype)
                col = col.astype(dtype)
            if not isinstance(fill_value, cudf.Scalar):
                fill_value = cudf.Scalar(fill_value, dtype=dtype)
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)

        return libcudf.replace.replace_nulls(col, fill_value)
Esempio n. 21
0
 def key_from_by(self, by):
     """
     Get (key_name, key_column) pair from a single *by* argument.

     A scalar *by* is recorded in ``self.df_key_names`` and looked up in
     ``self.obj``. Any other *by* is used as the key column itself, named
     by its ``name`` attribute.

     Raises
     ------
     NotImplementedError
         If a non-scalar *by* differs in length from ``self.obj``.
     """
     if is_scalar(by):
         self.df_key_names.append(by)
         return by, self.obj[by]

     if len(by) != len(self.obj):
         raise NotImplementedError(
             "cuDF does not support arbitrary series index lengths "
             "for groupby"
         )
     return by.name, by
Esempio n. 22
0
    def fillna(self, fill_value=None, method=None):
        """
        Fill nulls with ``fill_value``, or by ``method`` when no value is
        given.

        With a ``fill_value``: NaT is handled by ``_fillna_natwise``. A
        scalar ``np.timedelta64`` is cast, together with the column, to
        the dtype chosen by ``determine_out_dtype``. Any scalar that is
        not already a ``cudf.Scalar`` is then wrapped in one, and
        non-scalars are converted to a column. The fill itself is done by
        ``ColumnBase.fillna``. Without a ``fill_value`` the call is
        forwarded to the parent implementation with ``method``.
        """
        if fill_value is None:
            return super().fillna(method=method)

        if cudf.utils.utils.isnat(fill_value):
            return _fillna_natwise(self)

        col = self
        if is_scalar(fill_value):
            # Default to the column's own dtype so that scalars other than
            # np.timedelta64 do not hit an unbound ``dtype`` below.
            dtype = self.dtype
            if isinstance(fill_value, np.timedelta64):
                dtype = determine_out_dtype(self.dtype, fill_value.dtype)
                fill_value = fill_value.astype(dtype)
                col = col.astype(dtype)
            if not isinstance(fill_value, cudf.Scalar):
                fill_value = cudf.Scalar(fill_value, dtype=dtype)
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)

        return ColumnBase.fillna(col, fill_value)
Esempio n. 23
0
    def _get_column_selection(self, arg):
        """
        Return the column labels selected by ``arg``.

        A scalar becomes a one-element list. A slice is resolved by
        walking the frame's columns in order and collecting labels from
        the one equal to the start through the one equal to the stop,
        inclusive; missing bounds default to the first and last column.
        Anything else is returned unchanged.
        """
        if is_scalar(arg):
            return [arg]

        if not isinstance(arg, slice):
            return arg

        first = self._df.columns[0] if arg.start is None else arg.start
        last = self._df.columns[-1] if arg.stop is None else arg.stop

        selected = []
        collecting = False
        for label in self._df.columns:
            collecting = collecting or label == first
            if collecting:
                selected.append(label)
            # the walk ends at the stop label whether or not collection
            # has begun
            if label == last:
                break
        return selected
Esempio n. 24
0
    def _loc_to_iloc(self, arg):
        """
        Translate a label-based indexer into a positional one.

        For a scalar on a non-numerical index, an integer (plain or an
        integer-typed ``cudf.Scalar``) is returned as the position itself.
        Other scalars are looked up with ``find_first_value`` on the index
        values. Slices go through ``get_label_range_or_mask`` and
        MultiIndex keys (pandas ones are converted to cuDF first) through
        ``indices_from_labels``. Anything else is wrapped in a Series: a
        boolean one is returned unchanged, others are resolved with
        ``indices_from_labels``.

        Raises
        ------
        KeyError
            If the scalar lookup fails, or if the resolved positions
            contain nulls.
        """
        if is_scalar(arg):
            if not is_numerical_dtype(self._sr.index.dtype):
                # TODO: switch to cudf.utils.dtypes.is_integer(arg)
                if isinstance(
                    arg, cudf.Scalar
                ) and pd.api.types.is_integer_dtype(arg.dtype):
                    return arg.value
                if pd.api.types.is_integer(arg):
                    return arg
            try:
                return self._sr.index._values.find_first_value(
                    arg, closest=False
                )
            except (TypeError, KeyError, IndexError, ValueError):
                raise KeyError("label scalar is out of bound")

        if isinstance(arg, slice):
            return get_label_range_or_mask(
                self._sr.index, arg.start, arg.stop, arg.step
            )

        if isinstance(arg, (cudf.MultiIndex, pd.MultiIndex)):
            labels = (
                cudf.MultiIndex.from_pandas(arg)
                if isinstance(arg, pd.MultiIndex)
                else arg
            )
            return indices_from_labels(self._sr, labels)

        labels = cudf.core.series.Series(cudf.core.column.as_column(arg))
        if labels.dtype in (bool, np.bool_):
            return labels

        positions = indices_from_labels(self._sr, labels)
        if positions.null_count > 0:
            raise KeyError("label scalar is out of bound")
        return positions
Esempio n. 25
0
    def join(
        self, other, how="left", level=None, return_indexers=False, sort=False
    ):
        """
        Compute join_index and indexers to conform data structures
        to the new index.

        Parameters
        ----------
        other : Index.
        how : {'left', 'right', 'inner', 'outer'}
        level : int or label, default None
            Level of the MultiIndex operand to join on. An ``int`` is
            resolved to the name of the level at that position.
        return_indexers : bool, default False
            Accepted for signature compatibility; not used by this
            implementation.
        sort : bool, default False
            Sort the join keys lexicographically in the result Index. If False,
            the order of the join keys depends on the join type (how keyword).

        Returns: index

        Raises
        ------
        TypeError
            If both ``self`` and ``other`` are MultiIndex objects.
        ValueError
            If ``level`` is given and is not a scalar.

        Examples
        --------
        >>> import cudf
        >>> lhs = cudf.DataFrame(
        ...     {"a":[2, 3, 1], "b":[3, 4, 2]}).set_index(['a', 'b']
        ... ).index
        >>> rhs = cudf.DataFrame({"a":[1, 4, 3]}).set_index('a').index
        >>> lhs.join(rhs, how='inner')
        MultiIndex(levels=[0    1
        1    3
        dtype: int64, 0    2
        1    4
        dtype: int64],
        codes=   a  b
        0  1  1
        1  0  0)
        """

        if isinstance(self, cudf.MultiIndex) and isinstance(
            other, cudf.MultiIndex
        ):
            raise TypeError(
                "Join on level between two MultiIndex objects is ambiguous"
            )

        if level is not None and not is_scalar(level):
            raise ValueError("level should be an int or a label only")

        if isinstance(other, cudf.MultiIndex):
            # Keep the MultiIndex on the left-hand side of the merge and
            # mirror the join direction accordingly.
            if how == "left":
                how = "right"
            elif how == "right":
                how = "left"
            rhs = self.copy(deep=False)
            lhs = other.copy(deep=False)
        else:
            lhs = self.copy(deep=False)
            rhs = other.copy(deep=False)

        on = level
        # In case of MultiIndex, it will be None as
        # we don't need to update name
        left_names = lhs.names
        right_names = rhs.names
        # There should be no `None` values in Joined indices,
        # so essentially it would be `left/right` or 'inner'
        # in case of MultiIndex
        if isinstance(lhs, cudf.MultiIndex):
            if level is not None and isinstance(level, int):
                on = lhs._data.get_by_index(level).names[0]
            # Only rename the right index when a level was requested.
            # ``(on,) or right_names`` always yielded ``(on,)`` (a
            # one-element tuple is truthy), which renamed the right index
            # to ``None`` and joined on ``None`` when ``level`` was omitted.
            if on is not None:
                right_names = (on,)
            on = right_names[0]
            if how == "outer":
                how = "left"
            elif how == "right":
                how = "inner"
        else:
            # Both are normal indices
            right_names = left_names
            on = right_names[0]

        lhs.names = left_names
        rhs.names = right_names

        output = lhs._merge(rhs, how=how, on=on, sort=sort)

        return output
Esempio n. 26
0
def to_datetime(
    arg,
    errors="raise",
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit="ns",
    infer_datetime_format=False,
    origin="unix",
    cache=True,
):
    """
    Convert argument to datetime.

    Parameters
    ----------
    arg : int, float, str, datetime, list, tuple, 1-d array,
        Series DataFrame/dict-like
        The object to convert to a datetime.
    errors : {'ignore', 'raise', 'coerce', 'warn'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaT.
        - If 'warn' : prints last exceptions as warnings and
            return the input.
        - If 'ignore', then invalid parsing will return the input.
    dayfirst : bool, default False
        Specify a date parse order if `arg` is str or its list-likes.
        If True, parses dates with the day first, eg 10/11/12 is parsed as
        2012-11-10.
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug, based on dateutil behavior).
    format : str, default None
        The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
        all the way up to nanoseconds.
        See strftime documentation for more information on choices:
        https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
    unit : str, default 'ns'
        The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
        integer or float number. This will be based off the
        origin(unix epoch start).
        Example, with unit='ms' and origin='unix' (the default), this
        would calculate the number of milliseconds to the unix epoch start.
    infer_datetime_format : bool, default False
        If True and no `format` is given, attempt to infer the format of the
        datetime strings, and if it can be inferred, switch to a faster
        method of parsing them. In some cases this can increase the parsing
        speed by ~5-10x.

    Returns
    -------
    datetime
        If parsing succeeded.
        Return type depends on input:
        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp

    Raises
    ------
    NotImplementedError
        If ``exact`` is False, ``origin`` is not "unix", or ``yearfirst``
        is truthy. These checks run before the ``errors`` handling.

    Examples
    --------
    Assembling a datetime from multiple columns of a DataFrame. The keys can be
    common abbreviations like ['year', 'month', 'day', 'minute', 'second',
    'ms', 'us', 'ns']) or plurals of the same

    >>> import cudf
    >>> df = cudf.DataFrame({'year': [2015, 2016],
    ...                    'month': [2, 3],
    ...                    'day': [4, 5]})
    >>> cudf.to_datetime(df)
    0   2015-02-04
    1   2016-03-05
    dtype: datetime64[ns]
    >>> cudf.to_datetime(1490195805, unit='s')
    numpy.datetime64('2017-03-22T15:16:45.000000000')
    >>> cudf.to_datetime(1490195805433502912, unit='ns')
    numpy.datetime64('1780-11-20T01:02:30.494253056')
    """
    if arg is None:
        return None

    if exact is False:
        raise NotImplementedError("exact support is not yet implemented")

    if origin != "unix":
        raise NotImplementedError("origin support is not yet implemented")

    if yearfirst:
        raise NotImplementedError("yearfirst support is not yet implemented")

    try:
        if isinstance(arg, cudf.DataFrame):
            # we require at least Ymd
            required = ["year", "month", "day"]
            req = list(set(required) - set(arg._data.names))
            if len(req):
                req = ",".join(req)
                raise ValueError(
                    f"to assemble mappings requires at least that "
                    f"[year, month, day] be specified: [{req}] "
                    f"is missing"
                )

            # replace passed column name with values in _unit_map
            # NOTE: kept under its own name so the ``unit`` parameter is
            # still a unit string when the ``errors="coerce"`` handler
            # below builds a NaT from it.
            col_units = {k: get_units(k) for k in arg._data.names}
            unit_rev = {v: k for k, v in col_units.items()}

            # keys we don't recognize
            excess = set(unit_rev.keys()) - set(_unit_map.values())
            if len(excess):
                excess = ",".join(excess)
                raise ValueError(
                    f"extra keys have been passed to the "
                    f"datetime assemblage: [{excess}]"
                )

            new_series = (
                arg[unit_rev["year"]].astype("str")
                + "-"
                + arg[unit_rev["month"]].astype("str").str.zfill(2)
                + "-"
                + arg[unit_rev["day"]].astype("str").str.zfill(2)
            )
            format = "%Y-%m-%d"
            col = new_series._column.as_datetime_column(
                "datetime64[s]", format=format
            )

            # Switch the base column to nanosecond resolution as soon as
            # one time component is float-typed, or is an object column
            # that is not entirely integer.
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    arg_col = arg._data[value]
                    if arg_col.dtype.kind == "f":
                        col = new_series._column.as_datetime_column(
                            "datetime64[ns]", format=format
                        )
                        break
                    elif arg_col.dtype.kind == "O":
                        if not cpp_is_integer(arg_col).all():
                            col = new_series._column.as_datetime_column(
                                "datetime64[ns]", format=format
                            )
                            break

            # Accumulate the time components, each scaled to the
            # resolution of ``col``.
            times_column = None
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    current_col = arg._data[value]
                    # If the arg[value] is of int or
                    # float dtype we don't want to type-cast
                    if current_col.dtype.kind == "O":
                        try:
                            current_col = current_col.astype(dtype="int64")
                        except ValueError:
                            current_col = current_col.astype(dtype="float64")

                    factor = as_device_scalar(
                        column.datetime._numpy_to_pandas_conversion[u]
                        / (
                            column.datetime._numpy_to_pandas_conversion["s"]
                            if np.datetime_data(col.dtype)[0] == "s"
                            else 1
                        )
                    )

                    if times_column is None:
                        times_column = current_col * factor
                    else:
                        times_column = times_column + (current_col * factor)
            if times_column is not None:
                col = (col.astype(dtype="int64") + times_column).astype(
                    dtype=col.dtype
                )
            return cudf.Series(col, index=arg.index)
        elif isinstance(arg, cudf.Index):
            col = arg._values
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return as_index(col, name=arg.name)
        elif isinstance(arg, cudf.Series):
            col = arg._column
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return cudf.Series(col, index=arg.index, name=arg.name)
        else:
            col = column.as_column(arg)
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )

            if is_scalar(arg):
                return col[0]
            else:
                return as_index(col)
    except Exception:
        if errors == "raise":
            raise
        elif errors == "warn":
            import traceback

            tb = traceback.format_exc()
            warnings.warn(tb)
        elif errors == "ignore":
            pass
        elif errors == "coerce":
            return np.datetime64("nat", "ns" if unit is None else unit)
        # 'warn', 'ignore' and any unrecognized mode return the input
        return arg
Esempio n. 27
0
def _align_by_and_df(obj, by, how="inner"):
    """
    Returns a pair of dataframes and a list may be containing
    combination of column names and Series  which are intersected
    as per their indices.

    Examples
    --------
    Dataframe and Series in the 'by' have different indices:

    >>> import cudf
    >>> import cudf.core.groupby.groupby as grp_by

    >>> gdf = cudf.DataFrame(
            {"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]},
            index=[1,2,3]
        )
    >>> gsr = cudf.Series([0.0, 1.0, 2.0], name='a', index=[2,3,4])
    >>> updtd_gdf, updtd_by = grp_by._align_by_and_df(gdf, ['x', gsr])
    >>> print (gdf)
        x     y
    1 	1.0   1
    2 	2.0   2
    3 	3.0   1
    >>> print(updtd_gdf)
        x     y
    2 	2.0   2
    3 	3.0   1
    >>> print(by)
    ['x', 2    0.0
          3    1.0
          4    2.0
          Name: a, dtype: float64]
    >>> print(updtd_by)
    ['x', 2    0.0
          3    1.0
          Name: a, dtype: float64]
    """
    if not isinstance(by, (list, tuple)):
        by = [by]

    series_count = 0
    join_required = False
    series = []
    for by_col in by:
        if not is_scalar(by_col) and not isinstance(by_col, cudf.Index):
            sr = by_col
            if not isinstance(by_col, cudf.Series):
                sr = cudf.Series(by_col)
            if not join_required and not obj.index.equals(sr.index):
                join_required = True
            series.append(sr)

    new_obj = None
    if join_required:
        for sr in series:
            if new_obj is None:
                new_obj = sr.to_frame(series_count)
            else:
                new_obj = new_obj.join(
                    sr.to_frame(series_count), how=how, sort="True"
                )
            series_count += 1

    series_count = 0
    new_by = []
    if new_obj is not None:
        new_obj = new_obj.join(obj, how=how, sort="True")
        columns = new_obj.columns
        for by_col in by:
            if not is_scalar(by_col) and not isinstance(by_col, cudf.Index):
                sr, sr.name = (
                    cudf.Series(new_obj[columns[series_count]]),
                    by_col.name,
                )
                new_by.append(sr)
                series_count += 1
            else:
                new_by.append(by_col)

        new_obj = new_obj[columns[series_count::]]
    else:
        new_obj = obj
        new_by = by

    return new_obj, new_by
Esempio n. 28
0
    def __setitem__(self, key, value):
        """
        Set the value of self[key] to value.

        If value and self are of different types,
        value is coerced to self.dtype

        Parameters
        ----------
        key : slice, boolean mask, or array-like
            A unit-stride slice, or anything ``column.as_column`` accepts.
            A boolean key must have the same length as this column and is
            turned into the positions it selects; any other key is used
            as the positions to scatter into.
        value : scalar, None, or array-like
            A scalar (or None) is broadcast to the number of selected
            elements; anything else must already have that length.

        Raises
        ------
        NotImplementedError
            If ``key`` is a slice with a stride other than 1.
        ValueError
            If a boolean mask does not match the column length, or if
            ``value`` does not match the number of selected elements.
        IndexError
            If the scatter reports an out-of-bounds position.
        """
        from cudf.core import column

        if isinstance(key, slice):
            key_start, key_stop, key_stride = key.indices(len(self))
            if key_stride != 1:
                raise NotImplementedError("Stride not supported in slice")
            # A slice whose stop precedes its start (e.g. ``col[5:2]``)
            # selects nothing. Clamp it so it is treated exactly like the
            # empty slice ``col[5:5]`` instead of reporting a positive
            # element count and handing copy_range a reversed range.
            key_stop = max(key_stop, key_start)
            nelem = key_stop - key_start
        else:
            key = column.as_column(key)
            if pd.api.types.is_bool_dtype(key.dtype):
                if len(key) != len(self):
                    raise ValueError(
                        "Boolean mask must be of same length as column"
                    )
                # Convert the mask into the positions it selects.
                key = column.as_column(cudautils.arange(len(self)))[key]
            nelem = len(key)

        if is_scalar(value):
            if is_categorical_dtype(self.dtype):
                from cudf.utils.cudautils import fill_value

                # Broadcast the encoded category code and rebuild a
                # categorical column with this column's categories.
                data = rmm.device_array(nelem, dtype=self.codes.dtype)
                fill_value(data, self._encode(value))
                value = build_categorical_column(
                    categories=self.dtype.categories,
                    codes=as_column(data),
                    ordered=self.dtype.ordered,
                )
            elif value is None:
                value = column.column_empty(nelem, self.dtype, masked=True)
            else:
                to_dtype = pd.api.types.pandas_dtype(self.dtype)
                value = utils.scalar_broadcast_to(value, nelem, to_dtype)

        value = column.as_column(value).astype(self.dtype)

        if len(value) != nelem:
            msg = (
                f"Size mismatch: cannot set value "
                f"of size {len(value)} to indexing result of size "
                f"{nelem}"
            )
            raise ValueError(msg)

        if is_categorical_dtype(value.dtype):
            value = value.cat().set_categories(self.categories)
            assert self.dtype == value.dtype

        if isinstance(key, slice):
            out = libcudf.copying.copy_range(
                self, value, key_start, key_stop, 0
            )
        else:
            try:
                out = libcudf.copying.scatter(value, key, self)
            except RuntimeError as e:
                if "out of bounds" in str(e):
                    # Chain the original error so the underlying cause
                    # stays visible in the traceback.
                    raise IndexError(
                        f"index out of bounds for column of size {len(self)}"
                    ) from e
                raise

        # Replace this column's contents with the updated result.
        self._mimic_inplace(out, inplace=True)
Esempio n. 29
0
    def _getitem_tuple_arg(self, arg):
        """
        Resolve a label-based lookup on ``self._df``.

        The lookup runs in four steps: select the columns, select the
        rows, fix up the index of single-row results, and finally hand
        the result to ``_can_downcast_to_series`` /
        ``_downcast_to_series`` to decide whether it should be returned
        in downcast form.

        Parameters
        ----------
        arg : tuple or other
            When a tuple, ``arg[0]`` is the row selection and ``arg[1]``
            the column selection. Non-tuple arguments select from all
            columns of ``self._df``.

        Returns
        -------
        The selected frame, or the result of ``_downcast_to_series``
        when downcasting applies. For frames with a MultiIndex the
        result of ``take`` / ``_get_row_major`` is returned directly,
        skipping steps 3 and 4.

        Raises
        ------
        KeyError
            If row labels were requested and none of them matched.
        """
        from uuid import uuid4

        from cudf import MultiIndex
        from cudf.core.column import column
        from cudf.core.dataframe import DataFrame
        from cudf.core.index import as_index

        # Step 1: Gather columns
        if isinstance(arg, tuple):
            columns_df = self._get_column_selection(arg[1])
            # Re-attach the parent frame's index to the column selection.
            columns_df._index = self._df._index
        else:
            columns_df = self._df

        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            if isinstance(arg, (MultiIndex, pd.MultiIndex)):
                # Normalize a pandas MultiIndex to the cudf equivalent
                # before resolving labels to positions.
                if isinstance(arg, pd.MultiIndex):
                    arg = MultiIndex.from_pandas(arg)

                indices = indices_from_labels(columns_df, arg)
                return columns_df.take(indices)

            else:
                # Delegate to the MultiIndex, passing only the row part
                # of a tuple argument.
                if isinstance(arg, tuple):
                    return columns_df.index._get_row_major(columns_df, arg[0])
                else:
                    return columns_df.index._get_row_major(columns_df, arg)
        else:
            if isinstance(arg[0], slice):
                # A label slice resolves either to a positional slice or
                # to a mask; apply whichever was returned.
                out = get_label_range_or_mask(
                    columns_df.index, arg[0].start, arg[0].stop, arg[0].step
                )
                if isinstance(out, slice):
                    df = columns_df._slice(out)
                else:
                    df = columns_df._apply_boolean_mask(out)
            else:
                tmp_arg = arg
                if is_scalar(arg[0]):
                    # If a scalar, there is possibility of having duplicates.
                    # Join would get all the duplicates. So, converting it to
                    # an array kind.
                    tmp_arg = ([tmp_arg[0]], tmp_arg[1])
                # An empty row selection yields an empty frame.
                if len(tmp_arg[0]) == 0:
                    return columns_df._empty_like(keep_index=True)
                tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])

                if pd.api.types.is_bool_dtype(tmp_arg[0]):
                    df = columns_df._apply_boolean_mask(tmp_arg[0])
                else:
                    # Look the labels up with an inner join: a helper
                    # frame indexed by the requested labels carries an
                    # ``arange`` column, which is used afterwards to sort
                    # the joined rows and is then dropped.
                    # NOTE(review): the uuid-based name is presumably
                    # meant to avoid clashing with a real column name.
                    tmp_col_name = str(uuid4())
                    other_df = DataFrame(
                        {tmp_col_name: column.arange(len(tmp_arg[0]))},
                        index=as_index(tmp_arg[0]),
                    )
                    df = other_df.join(columns_df, how="inner")
                    # as join is not assigning any names to index,
                    # update it over here
                    df.index.name = columns_df.index.name
                    df = df.sort_values(tmp_col_name)
                    df.drop(columns=[tmp_col_name], inplace=True)
                    # There were no indices found
                    if len(df) == 0:
                        raise KeyError(arg)

        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                # Use the slice start as the label of the single row,
                # falling back to the parent frame's first label when the
                # slice has no explicit start.
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    # Boolean selection: derive the index from the parent
                    # frame's index.
                    df.index = self._df.index.take(row_selection)
                else:
                    # Label selection: the requested label is the index.
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Esempio n. 30
0
def read_csv(
    filepath_or_buffer,
    lineterminator="\n",
    quotechar='"',
    quoting=0,
    doublequote=True,
    header="infer",
    mangle_dupe_cols=True,
    usecols=None,
    sep=",",
    delimiter=None,
    delim_whitespace=False,
    skipinitialspace=False,
    names=None,
    dtype=None,
    skipfooter=0,
    skiprows=0,
    dayfirst=False,
    compression="infer",
    thousands=None,
    decimal=".",
    true_values=None,
    false_values=None,
    nrows=None,
    byte_range=None,
    skip_blank_lines=True,
    parse_dates=None,
    comment=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    prefix=None,
    index_col=None,
    **kwargs,
):
    """{docstring}"""

    # Reject anything that does not resolve to exactly one source.
    if not ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer,
        **kwargs,
    ):
        raise NotImplementedError(
            "`read_csv` does not yet support reading multiple files")

    # Resolve the source, which may also update the compression setting.
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer,
        compression=compression,
        iotypes=(BytesIO, StringIO),
        **kwargs,
    )

    # A lone scalar NA marker is wrapped in a list before being passed on.
    if na_values is not None:
        if is_scalar(na_values):
            na_values = [na_values]

    if keep_default_na is False:
        # TODO: Remove this error once the following issue is fixed:
        # https://github.com/rapidsai/cudf/issues/6680
        raise NotImplementedError(
            "keep_default_na=False is currently not supported, please refer "
            "to: https://github.com/rapidsai/cudf/issues/6680")

    # Every remaining option is forwarded to the reader by keyword.
    reader_options = dict(
        lineterminator=lineterminator,
        quotechar=quotechar,
        quoting=quoting,
        doublequote=doublequote,
        header=header,
        mangle_dupe_cols=mangle_dupe_cols,
        usecols=usecols,
        sep=sep,
        delimiter=delimiter,
        delim_whitespace=delim_whitespace,
        skipinitialspace=skipinitialspace,
        names=names,
        dtype=dtype,
        skipfooter=skipfooter,
        skiprows=skiprows,
        dayfirst=dayfirst,
        compression=compression,
        thousands=thousands,
        decimal=decimal,
        true_values=true_values,
        false_values=false_values,
        nrows=nrows,
        byte_range=byte_range,
        skip_blank_lines=skip_blank_lines,
        parse_dates=parse_dates,
        comment=comment,
        na_values=na_values,
        keep_default_na=keep_default_na,
        na_filter=na_filter,
        prefix=prefix,
        index_col=index_col,
    )
    return libcudf.csv.read_csv(filepath_or_buffer, **reader_options)