Example #1
    def seconds(self):
        """
        Number of seconds (>= 0 and less than 1 day).

        Returns
        -------
        NumericalColumn
        """
        # This property must return the number of seconds (>= 0 and
        # less than 1 day) for each element, so we first take each value
        # modulo one day to drop the whole days and then floor-divide by
        # one second to extract the seconds.

        return (self % as_device_scalar(
            np.timedelta64(
                _numpy_to_pandas_conversion["D"], "ns"))) // as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["s"], "ns"))
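The decomposition can be checked on the CPU with plain NumPy. The sketch below only mirrors the arithmetic of the snippet above (timedeltas stored as int64 nanoseconds); it does not use the cuDF internals `as_device_scalar` or `_numpy_to_pandas_conversion`.

import numpy as np

NS_PER_DAY = 86_400 * 10**9
NS_PER_SECOND = 10**9

# 1 day, 1 hour, 1 minute, 1.000000123 seconds, stored as int64 nanoseconds
values_ns = np.array([90_061_000_000_123], dtype="int64")

# Drop the whole days with a mod, then floor-divide to count whole seconds.
seconds = (values_ns % NS_PER_DAY) // NS_PER_SECOND
print(seconds)  # [3661]  (1 hour + 1 minute + 1 second within the day)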
Example #2
    def nanoseconds(self):
        """
        Return the number of nanoseconds (n), where 0 <= n < 1 microsecond.

        Returns
        -------
        NumericalColumn
        """
        # This property must return the number of nanoseconds (>= 0 and
        # less than 1 microsecond) for each element, so we first take each
        # value modulo one microsecond to drop the whole microseconds and
        # then floor-divide by one nanosecond to extract the nanoseconds.

        return (self % as_device_scalar(
            np.timedelta64(
                _numpy_to_pandas_conversion["us"], "ns"))) // as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["ns"], "ns"))
Example #3
    def days(self):
        """
        Number of days for each element.

        Returns
        -------
        NumericalColumn
        """
        return self // as_device_scalar(
            np.timedelta64(_numpy_to_pandas_conversion["D"], "ns"))
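Examples #1-#3 back the public timedelta accessors on `Series.dt`. A minimal usage sketch, assuming a working cuDF installation (exact reprs may vary between versions):

import cudf

s = cudf.Series([90_061_000_000_123, 123], dtype="timedelta64[ns]")
print(s.dt.days)         # whole days: 1 and 0
print(s.dt.seconds)      # seconds within the day: 3661 and 0
print(s.dt.nanoseconds)  # nanoseconds within the microsecond: 123 and 123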
Example #4
    def quantile(self, q, interpolation, exact):
        result = self.as_numerical.quantile(q=q,
                                            interpolation=interpolation,
                                            exact=exact)
        if isinstance(q, Number):
            return pd.Timestamp(result, unit=self.time_unit)

        result = result * as_device_scalar(
            _numpy_to_pandas_conversion[self.time_unit])

        return result.astype("datetime64[ns]")
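The quantile is computed on the column's underlying integer representation and then reinterpreted as a datetime: a scalar `q` comes back as a `pd.Timestamp`, a list-like `q` as a datetime-typed result. A hedged usage sketch through the public Series API (output formatting may differ between cuDF versions):

import cudf

s = cudf.to_datetime(cudf.Series(["2001-01-01", "2001-01-03", "2001-01-05"]))
print(s.quantile(0.5))         # a single pandas Timestamp (here the median)
print(s.quantile([0.0, 1.0]))  # datetime-typed values for the two extremes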
Example #5
    def _fill(
        self,
        fill_value: ScalarLike,
        begin: int,
        end: int,
        inplace: bool = False,
    ) -> "column.ColumnBase":
        if end <= begin or begin >= self.size:
            return self if inplace else self.copy()

        fill_code = self._encode(fill_value)
        fill_scalar = as_device_scalar(fill_code, self.codes.dtype)

        result = self if inplace else self.copy()

        libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar)
        return result
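`_fill` encodes the fill value into its category code and then overwrites that code over the half-open range `[begin, end)` of the codes column. A CPU-side analogy with plain NumPy (the real implementation writes in place on the GPU via `libcudf.filling.fill_in_place`):

import numpy as np

categories = np.array(["a", "b", "c"])
codes = np.array([0, 2, 1, 0, 2], dtype="int8")

fill_value = "b"
fill_code = int(np.flatnonzero(categories == fill_value)[0])  # encode -> 1

begin, end = 1, 4
codes[begin:end] = fill_code  # fill the half-open range [begin, end)
print(categories[codes])      # ['a' 'b' 'b' 'b' 'c']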
Example #6
    def _binary_op_truediv(self, rhs):
        lhs, rhs = self, rhs
        if pd.api.types.is_timedelta64_dtype(rhs.dtype):
            common_dtype = determine_out_dtype(self.dtype, rhs.dtype)
            lhs = lhs.astype(common_dtype).astype("float64")

            if isinstance(rhs, (cudf.Scalar, DeviceScalar)):
                if rhs.is_valid():
                    rhs = rhs.value.astype(common_dtype).astype("float64")
                else:
                    rhs = as_device_scalar(None, "float64")
            else:
                rhs = rhs.astype(common_dtype).astype("float64")

            out_dtype = np.dtype("float64")
        elif rhs.dtype.kind in ("f", "i", "u"):
            out_dtype = self.dtype
        else:
            raise TypeError(f"Division of {self.dtype} with {rhs.dtype} "
                            f"cannot be performed.")

        return lhs, rhs, out_dtype
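The helper prepares the operands for true division: timedelta divided by timedelta is widened to float64 and yields a ratio, while timedelta divided by a number stays a timedelta. A hedged sketch through the public Series API:

import cudf

td = cudf.Series([2_000_000_000, 4_000_000_000], dtype="timedelta64[ns]")

print(td / td)  # float64 ratios: 1.0 and 1.0
print(td / 2)   # still timedelta64[ns]: 1 second and 2 seconds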
Example #7
def to_datetime(
    arg,
    errors="raise",
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit="ns",
    infer_datetime_format=False,
    origin="unix",
    cache=True,
):
    """
    Convert argument to datetime.

    Parameters
    ----------
    arg : int, float, str, datetime, list, tuple, 1-d array,
        Series, DataFrame/dict-like
        The object to convert to a datetime.
    errors : {'ignore', 'raise', 'coerce', 'warn'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaT.
        - If 'warn', then prints the last exception as a warning and
            returns the input.
        - If 'ignore', then invalid parsing will return the input.
    dayfirst : bool, default False
        Specify a date parse order if `arg` is str or its list-likes.
        If True, parses dates with the day first, e.g. 10/11/12 is parsed as
        2012-11-10.
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug, based on dateutil behavior).
    format : str, default None
        The strftime format used to parse the time, e.g. "%d/%m/%Y"; note
        that "%f" will parse all the way up to nanoseconds.
        See strftime documentation for more information on choices:
        https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
    unit : str, default 'ns'
        The unit of the arg (D, s, ms, us, ns) when `arg` is an integer
        or float number. The values are interpreted relative to the
        origin (the Unix epoch start).
        For example, with unit='ms' and origin='unix' (the default), the
        values are treated as the number of milliseconds since the Unix
        epoch start.
    infer_datetime_format : bool, default False
        If True and no `format` is given, attempt to infer the format of the
        datetime strings, and if it can be inferred, switch to a faster
        method of parsing them. In some cases this can increase the parsing
        speed by ~5-10x.

    Returns
    -------
    datetime
        If parsing succeeded.
        Return type depends on input:
        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp

    Examples
    --------
    Assembling a datetime from multiple columns of a DataFrame. The keys can be
    common abbreviations like ['year', 'month', 'day', 'minute', 'second',
    'ms', 'us', 'ns'] or plurals of the same.

    >>> import cudf
    >>> df = cudf.DataFrame({'year': [2015, 2016],
    ...                    'month': [2, 3],
    ...                    'day': [4, 5]})
    >>> cudf.to_datetime(df)
    0   2015-02-04
    1   2016-03-05
    dtype: datetime64[ns]
    >>> cudf.to_datetime(1490195805, unit='s')
    numpy.datetime64('2017-03-22T15:16:45.000000000')
    >>> cudf.to_datetime(1490195805433502912, unit='ns')
    numpy.datetime64('2017-03-22T15:16:45.433502912')
    """
    if arg is None:
        return None

    if exact is False:
        raise NotImplementedError("exact support is not yet implemented")

    if origin != "unix":
        raise NotImplementedError("origin support is not yet implemented")

    if yearfirst:
        raise NotImplementedError("yearfirst support is not yet implemented")

    try:
        if isinstance(arg, cudf.DataFrame):
            # we require at least Ymd
            required = ["year", "month", "day"]
            req = list(set(required) - set(arg._data.names))
            if len(req):
                req = ",".join(req)
                raise ValueError(
                    f"to assemble mappings requires at least that "
                    f"[year, month, day] be specified: [{req}] "
                    f"is missing"
                )

            # replace passed column name with values in _unit_map
            unit = {k: get_units(k) for k in arg._data.names}
            unit_rev = {v: k for k, v in unit.items()}

            # keys we don't recognize
            excess = set(unit_rev.keys()) - set(_unit_map.values())
            if len(excess):
                excess = ",".join(excess)
                raise ValueError(
                    f"extra keys have been passed to the "
                    f"datetime assemblage: [{excess}]"
                )

            new_series = (
                arg[unit_rev["year"]].astype("str")
                + "-"
                + arg[unit_rev["month"]].astype("str").str.zfill(2)
                + "-"
                + arg[unit_rev["day"]].astype("str").str.zfill(2)
            )
            format = "%Y-%m-%d"
            col = new_series._column.as_datetime_column(
                "datetime64[s]", format=format
            )

            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    arg_col = arg._data[value]
                    if arg_col.dtype.kind in ("f"):
                        col = new_series._column.as_datetime_column(
                            "datetime64[ns]", format=format
                        )
                        break
                    elif arg_col.dtype.kind in ("O"):
                        if not cpp_is_integer(arg_col).all():
                            col = new_series._column.as_datetime_column(
                                "datetime64[ns]", format=format
                            )
                            break

            times_column = None
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    current_col = arg._data[value]
                    # If the arg[value] is of int or
                    # float dtype we don't want to type-cast
                    if current_col.dtype.kind in ("O"):
                        try:
                            current_col = current_col.astype(dtype="int64")
                        except ValueError:
                            current_col = current_col.astype(dtype="float64")

                    factor = as_device_scalar(
                        column.datetime._numpy_to_pandas_conversion[u]
                        / (
                            column.datetime._numpy_to_pandas_conversion["s"]
                            if np.datetime_data(col.dtype)[0] == "s"
                            else 1
                        )
                    )

                    if times_column is None:
                        times_column = current_col * factor
                    else:
                        times_column = times_column + (current_col * factor)
            if times_column is not None:
                col = (col.astype(dtype="int64") + times_column).astype(
                    dtype=col.dtype
                )
            return cudf.Series(col, index=arg.index)
        elif isinstance(arg, cudf.Index):
            col = arg._values
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return as_index(col, name=arg.name)
        elif isinstance(arg, cudf.Series):
            col = arg._column
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return cudf.Series(col, index=arg.index, name=arg.name)
        else:
            col = column.as_column(arg)
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )

            if is_scalar(arg):
                return col[0]
            else:
                return as_index(col)
    except Exception as e:
        if errors == "raise":
            raise e
        elif errors == "warn":
            import traceback

            tb = traceback.format_exc()
            warnings.warn(tb)
        elif errors == "ignore":
            pass
        elif errors == "coerce":
            return np.datetime64("nat", "ns" if unit is None else unit)
        return arg
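The DataFrame branch first assembles a "%Y-%m-%d" date from the required year/month/day columns, then adds any remaining unit columns ('h', 'm', 's', ...) as offsets scaled by the factors in `_numpy_to_pandas_conversion`. A hedged usage sketch with an extra hour column (the resulting dtype and repr may differ between cuDF versions):

import cudf

df = cudf.DataFrame({
    "year": [2015, 2016],
    "month": [2, 3],
    "day": [4, 5],
    "hour": [6, 7],
})
print(cudf.to_datetime(df))
# 0   2015-02-04 06:00:00
# 1   2016-03-05 07:00:00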
Example #8
def _process_col(col, unit, dayfirst, infer_datetime_format, format):
    if col.dtype.kind == "M":
        return col
    elif col.dtype.kind == "m":
        raise TypeError(
            f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}"
        )

    if col.dtype.kind in ("f"):
        if unit not in (None, "ns"):
            factor = as_device_scalar(
                column.datetime._numpy_to_pandas_conversion[unit]
            )
            col = col * factor

        if format is not None:
            # Converting to int because,
            # pandas actually creates a datetime column
            # out of float values and then creates an
            # int column out of it to parse against `format`.
            # Instead we directly cast to int and perform
            # parsing against `format`.
            col = (
                col.astype("int")
                .astype("str")
                .as_datetime_column(
                    dtype="datetime64[us]"
                    if "%f" in format
                    else "datetime64[s]",
                    format=format,
                )
            )
        else:
            col = col.as_datetime_column(dtype="datetime64[ns]")

    if col.dtype.kind in ("i"):
        if unit in ("D", "h", "m"):
            factor = as_device_scalar(
                column.datetime._numpy_to_pandas_conversion[unit]
                / column.datetime._numpy_to_pandas_conversion["s"]
            )
            col = col * factor

        if format is not None:
            col = col.astype("str").as_datetime_column(
                dtype=_unit_dtype_map[unit], format=format
            )
        else:
            col = col.as_datetime_column(dtype=_unit_dtype_map[unit])

    elif col.dtype.kind in ("O"):
        if unit not in (None, "ns"):
            try:
                col = col.astype(dtype="int64")
            except ValueError:
                col = col.astype(dtype="float64")
            return _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
        else:
            if infer_datetime_format and format is None:
                format = column.datetime.infer_format(
                    element=col[0], dayfirst=dayfirst,
                )
            elif format is None:
                format = column.datetime.infer_format(element=col[0])
            col = col.as_datetime_column(
                dtype=_unit_dtype_map[unit], format=format,
            )
    return col
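`_process_col` rescales integer inputs with coarse units ('D', 'h', 'm') into seconds before the cast, and parses string inputs against an explicit or inferred strftime format. A hedged usage sketch through `cudf.to_datetime`:

import cudf

# Integer epoch values with a coarse unit are rescaled before the cast.
print(cudf.to_datetime([1, 2], unit="D"))  # 1970-01-02 and 1970-01-03

# Strings are parsed against the given (or inferred) strftime format.
print(cudf.to_datetime(["2001-01-02 03:04:05"], format="%Y-%m-%d %H:%M:%S"))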
Example #9
    def components(self, index=None):
        """
        Return a DataFrame of the components of the Timedeltas.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656,
        ...     3244334234], dtype='timedelta64[ms]')
        >>> s
        0      141 days 13:35:12.123
        1       14 days 06:00:31.231
        2    13000 days 10:12:48.712
        3        0 days 00:35:35.656
        4       37 days 13:12:14.234
        dtype: timedelta64[ms]
        >>> s.dt.components
            days  hours  minutes  seconds  milliseconds  microseconds  nanoseconds
        0    141     13       35       12           123             0            0
        1     14      6        0       31           231             0            0
        2  13000     10       12       48           712             0            0
        3      0      0       35       35           656             0            0
        4     37     13       12       14           234             0            0
        """  # noqa: E501

        return cudf.DataFrame(
            data={
                "days":
                self // as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["D"], "ns")),
                "hours": (self % as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["D"], "ns"))) //
                as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["h"], "ns")),
                "minutes": (self % as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["h"], "ns"))) //
                as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["m"], "ns")),
                "seconds": (self % as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["m"], "ns"))) //
                as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["s"], "ns")),
                "milliseconds": (self % as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["s"], "ns"))) //
                as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["ms"], "ns")),
                "microseconds": (self % as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["ms"], "ns")))
                // as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["us"], "ns")),
                "nanoseconds": (self % as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["us"], "ns")))
                // as_device_scalar(
                    np.timedelta64(_numpy_to_pandas_conversion["ns"], "ns")),
            },
            index=index,
        )
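The components DataFrame is produced by a cascade of mod/floor-divide operations at successively finer units. The cascade can be checked on the CPU with a single int64 value in nanoseconds (the column's underlying representation):

NS = {"D": 86_400 * 10**9, "h": 3_600 * 10**9, "m": 60 * 10**9,
      "s": 10**9, "ms": 10**6, "us": 10**3, "ns": 1}

value = 93_784_005_006_007  # 1 day, 2 h, 3 min, 4 s, 5 ms, 6 us, 7 ns

days = value // NS["D"]
hours = (value % NS["D"]) // NS["h"]
minutes = (value % NS["h"]) // NS["m"]
seconds = (value % NS["m"]) // NS["s"]
milliseconds = (value % NS["s"]) // NS["ms"]
microseconds = (value % NS["ms"]) // NS["us"]
nanoseconds = (value % NS["us"]) // NS["ns"]

print(days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds)
# 1 2 3 4 5 6 7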