Example 1
    def from_numpy_dtype(self, dt):
        """
        From Numpy dtype.

        >>> from datashape import CType
        >>> from numpy import dtype
        >>> CType.from_numpy_dtype(dtype('int32'))
        ctype("int32")
        >>> CType.from_numpy_dtype(dtype('i8'))
        ctype("int64")
        >>> CType.from_numpy_dtype(dtype('M8'))
        DateTime(None)
        >>> CType.from_numpy_dtype(dtype('U30'))
        ctype("string[30, 'U32']")
        """
        try:
            return Type.lookup_type(dt.name)
        except KeyError:
            pass
        if np.issubdtype(dt, np.datetime64):
            unit, _ = np.datetime_data(dt)
            defaults = {'D': date_, 'Y': date_, 'M': date_, 'W': date_}
            return defaults.get(unit, datetime_)
        elif np.issubdtype(dt, np.timedelta64):
            unit, _ = np.datetime_data(dt)
            return TimeDelta(unit=unit)
        elif np.issubdtype(dt, np.unicode_):
            return String(dt.itemsize // 4, 'U32')
        elif np.issubdtype(dt, np.str_) or np.issubdtype(dt, np.bytes_):
            return String(dt.itemsize, 'ascii')
        raise NotImplementedError("NumPy datatype %s not supported" % dt)
Example 2
def _validate_date_like_dtype(dtype):
    try:
        typ = np.datetime_data(dtype)[0]
    except ValueError as e:
        raise TypeError('%s' % e)
    if typ != 'generic' and typ != 'ns':
        raise ValueError('%r is too specific of a frequency, try passing %r' %
                         (dtype.name, dtype.type.__name__))
Example 3
def _datetime_metadata_str(dtype):
    # TODO: this duplicates the C append_metastr_to_string
    unit, count = np.datetime_data(dtype)
    if unit == 'generic':
        return ''
    elif count == 1:
        return '[{}]'.format(unit)
    else:
        return '[{}{}]'.format(count, unit)
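
A rough illustration of the strings the helper above produces, assuming _datetime_metadata_str is in scope:

import numpy as np

print(_datetime_metadata_str(np.dtype('M8')))       # ''       (generic, unit-less)
print(_datetime_metadata_str(np.dtype('M8[ns]')))   # '[ns]'
print(_datetime_metadata_str(np.dtype('M8[25s]')))  # '[25s]'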
Example 4
def series_to_array(s, dshape=None, **kwargs):
    dtype = dshape_to_numpy(datashape.dshape(dshape))
    sdtype = s.dtype
    values = s.values

    # don't lose precision of datetime64 more precise than microseconds
    if ((issubclass(sdtype.type, np.datetime64) and
            np.datetime_data(sdtype)[0] in higher_precision_freqs)
            or s.dtype == dtype):
        return values
    try:
        return values.astype(dtype)
    except ValueError:  # object series and record dshape, e.g., a frame row
        return values
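
A sketch of the precision guard above; higher_precision_freqs is assumed here to be a set of sub-microsecond units such as {'ns', 'ps', 'fs', 'as'}:

import numpy as np
import pandas as pd

higher_precision_freqs = {'ns', 'ps', 'fs', 'as'}  # assumed definition
s = pd.Series(pd.to_datetime(['2015-10-31']))      # dtype datetime64[ns]
unit = np.datetime_data(s.dtype)[0]
print(unit, unit in higher_precision_freqs)        # ns True -> values returned as-is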
Example 5
def _validate_date_like_dtype(dtype):
    """
    Check whether the dtype is a date-like dtype. Raises an error if invalid.

    Parameters
    ----------
    dtype : dtype, type
        The dtype to check.

    Raises
    ------
    TypeError : The dtype could not be casted to a date-like dtype.
    ValueError : The dtype is an illegal date-like dtype (e.g. the
                 the frequency provided is too specific)
    """

    try:
        typ = np.datetime_data(dtype)[0]
    except ValueError as e:
        raise TypeError('%s' % e)
    if typ != 'generic' and typ != 'ns':
        raise ValueError('%r is too specific of a frequency, try passing %r' %
                         (dtype.name, dtype.type.__name__))
Example 6
def _validate_date_like_dtype(dtype) -> None:
    """
    Check whether the dtype is a date-like dtype. Raises an error if invalid.

    Parameters
    ----------
    dtype : dtype, type
        The dtype to check.

    Raises
    ------
    TypeError : The dtype could not be casted to a date-like dtype.
    ValueError : The dtype is an illegal date-like dtype (e.g. the
                 frequency provided is too specific)
    """
    try:
        typ = np.datetime_data(dtype)[0]
    except ValueError as e:
        raise TypeError(e) from e
    if typ not in ["generic", "ns"]:
        raise ValueError(
            f"{repr(dtype.name)} is too specific of a frequency, "
            f"try passing {repr(dtype.type.__name__)}"
        )
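
A short usage sketch of the validator above: only the generic and nanosecond resolutions are accepted.

import numpy as np

_validate_date_like_dtype(np.dtype('datetime64[ns]'))  # passes silently
_validate_date_like_dtype(np.dtype('datetime64'))      # passes, unit is 'generic'
try:
    _validate_date_like_dtype(np.dtype('datetime64[D]'))
except ValueError as err:
    print(err)  # 'datetime64[D]' is too specific of a frequency, try passing 'datetime64'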
Example 7
def to_datetime(
    arg,
    errors="raise",
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit="ns",
    infer_datetime_format=False,
    origin="unix",
    cache=True,
):
    """
    Convert argument to datetime.

    Parameters
    ----------
    arg : int, float, str, datetime, list, tuple, 1-d array,
        Series, DataFrame/dict-like
        The object to convert to a datetime.
    errors : {'ignore', 'raise', 'coerce', 'warn'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaT.
        - If 'warn', then prints the last exception as a warning and
            returns the input.
        - If 'ignore', then invalid parsing will return the input.
    dayfirst : bool, default False
        Specify a date parse order if `arg` is str or its list-likes.
        If True, parses dates with the day first, eg 10/11/12 is parsed as
        2012-11-10.
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug, based on dateutil behavior).
    format : str, default None
        The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
        all the way up to nanoseconds.
        See strftime documentation for more information on choices:
        https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
    unit : str, default 'ns'
        The unit of the arg (D, s, ms, us, ns), denoting the resolution in
        which an integer or float ``arg`` is interpreted. This is based on
        the origin (the unix epoch start by default).
        Example, with unit='ms' and origin='unix' (the default), this
        would calculate the number of milliseconds to the unix epoch start.
    infer_datetime_format : bool, default False
        If True and no `format` is given, attempt to infer the format of the
        datetime strings, and if it can be inferred, switch to a faster
        method of parsing them. In some cases this can increase the parsing
        speed by ~5-10x.

    Returns
    -------
    datetime
        If parsing succeeded.
        Return type depends on input:
        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp

    Examples
    --------
    Assembling a datetime from multiple columns of a DataFrame. The keys can be
    common abbreviations like ['year', 'month', 'day', 'minute', 'second',
    'ms', 'us', 'ns'] or plurals of the same.

    >>> import cudf
    >>> df = cudf.DataFrame({'year': [2015, 2016],
    ...                    'month': [2, 3],
    ...                    'day': [4, 5]})
    >>> cudf.to_datetime(df)
    0   2015-02-04
    1   2016-03-05
    dtype: datetime64[ns]
    >>> cudf.to_datetime(1490195805, unit='s')
    numpy.datetime64('2017-03-22T15:16:45.000000000')
    >>> cudf.to_datetime(1490195805433502912, unit='ns')
    numpy.datetime64('1780-11-20T01:02:30.494253056')
    """
    if arg is None:
        return None

    if exact is False:
        raise NotImplementedError("exact support is not yet implemented")

    if origin != "unix":
        raise NotImplementedError("origin support is not yet implemented")

    if yearfirst:
        raise NotImplementedError("yearfirst support is not yet implemented")

    try:
        if isinstance(arg, cudf.DataFrame):
            # we require at least Ymd
            required = ["year", "month", "day"]
            req = list(set(required) - set(arg._data.names))
            if len(req):
                req = ",".join(req)
                raise ValueError(
                    f"to assemble mappings requires at least that "
                    f"[year, month, day] be specified: [{req}] "
                    f"is missing"
                )

            # replace passed column name with values in _unit_map
            unit = {k: get_units(k) for k in arg._data.names}
            unit_rev = {v: k for k, v in unit.items()}

            # keys we don't recognize
            excess = set(unit_rev.keys()) - set(_unit_map.values())
            if len(excess):
                excess = ",".join(excess)
                raise ValueError(
                    f"extra keys have been passed to the "
                    f"datetime assemblage: [{excess}]"
                )

            new_series = (
                arg[unit_rev["year"]].astype("str")
                + "-"
                + arg[unit_rev["month"]].astype("str").str.zfill(2)
                + "-"
                + arg[unit_rev["day"]].astype("str").str.zfill(2)
            )
            format = "%Y-%m-%d"
            col = new_series._column.as_datetime_column(
                "datetime64[s]", format=format
            )

            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    arg_col = arg._data[value]
                    if arg_col.dtype.kind in ("f"):
                        col = new_series._column.as_datetime_column(
                            "datetime64[ns]", format=format
                        )
                        break
                    elif arg_col.dtype.kind in ("O"):
                        if not cpp_is_integer(arg_col).all():
                            col = new_series._column.as_datetime_column(
                                "datetime64[ns]", format=format
                            )
                            break

            times_column = None
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    current_col = arg._data[value]
                    # If the arg[value] is of int or
                    # float dtype we don't want to type-cast
                    if current_col.dtype.kind in ("O"):
                        try:
                            current_col = current_col.astype(dtype="int64")
                        except ValueError:
                            current_col = current_col.astype(dtype="float64")

                    factor = as_device_scalar(
                        column.datetime._numpy_to_pandas_conversion[u]
                        / (
                            column.datetime._numpy_to_pandas_conversion["s"]
                            if np.datetime_data(col.dtype)[0] == "s"
                            else 1
                        )
                    )

                    if times_column is None:
                        times_column = current_col * factor
                    else:
                        times_column = times_column + (current_col * factor)
            if times_column is not None:
                col = (col.astype(dtype="int64") + times_column).astype(
                    dtype=col.dtype
                )
            return cudf.Series(col, index=arg.index)
        elif isinstance(arg, cudf.Index):
            col = arg._values
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return as_index(col, name=arg.name)
        elif isinstance(arg, cudf.Series):
            col = arg._column
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return cudf.Series(col, index=arg.index, name=arg.name)
        else:
            col = column.as_column(arg)
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )

            if is_scalar(arg):
                return col[0]
            else:
                return as_index(col)
    except Exception as e:
        if errors == "raise":
            raise e
        elif errors == "warn":
            import traceback

            tb = traceback.format_exc()
            warnings.warn(tb)
        elif errors == "ignore":
            pass
        elif errors == "coerce":
            return np.datetime64("nat", "ns" if unit is None else unit)
        return arg
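
Illustrative arithmetic for the factor computed above, assuming _numpy_to_pandas_conversion maps each unit to its length in nanoseconds (the values below are assumed):

conversion = {'s': 1_000_000_000, 'ms': 1_000_000, 'us': 1_000, 'ns': 1}  # assumed values
base_unit = 's'   # the assembled date column is datetime64[s] unless upcast to ns
u = 'ms'
factor = conversion[u] / (conversion['s'] if base_unit == 's' else 1)
print(factor)     # 0.001 -> milliseconds expressed in seconds before being added to the date column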
Example 8
    def append(self,
               table: str,
               data: Mapping[str, np.ndarray],
               schema: str = 'sys') -> None:
        """
        Directly append an array structure
        """
        self._switch()
        n_columns = len(data)
        existing_columns = list(self.get_columns(schema=schema, table=table))
        existing_names, existing_types = zip(*existing_columns)
        if set(existing_names) != set(data.keys()):
            error = f"Appended column names ({', '.join(str(i) for i in data.keys())}) " \
                    f"don't match existing column names ({', '.join(existing_names)})"
            raise exceptions.ProgrammingError(error)

        work_columns = ffi.new(f'monetdbe_column * [{n_columns}]')
        work_objs = []
        # cffi_objects keeps all in-memory native data structures alive for the duration of this call
        cffi_objects = list()
        for column_num, (column_name,
                         existing_type) in enumerate(existing_columns):
            column_values = data[column_name]
            work_column = ffi.new('monetdbe_column *')
            type_info = numpy_monetdb_map(column_values.dtype)

            # try to convert the values if types don't match
            if type_info.c_type != existing_type:
                if type_info.c_type == lib.monetdbe_timestamp and existing_type == lib.monetdbe_date and np.issubdtype(
                        column_values.dtype, np.datetime64):
                    """
                    We are going to cast to a monetdbe_date and
                    consider monetdbe_timestamp as a 'base type' to signal this.
                    """
                    type_info = timestamp_to_date()
                else:
                    precision_warning(type_info.c_type, existing_type)
                    to_numpy_type = monet_c_type_map[existing_type].numpy_type
                    try:
                        column_values = column_values.astype(to_numpy_type)
                        type_info = numpy_monetdb_map(column_values.dtype)
                    except Exception as e:
                        existing_type_string = monet_c_type_map[
                            existing_type].c_string_type
                        error = f"Can't convert '{type_info.c_string_type}' " \
                                f"to type '{existing_type_string}' for column '{column_name}': {e} "
                        raise ValueError(error)

            work_column.type = type_info.c_type
            work_column.count = column_values.shape[0]
            work_column.name = ffi.new('char[]', column_name.encode())
            if type_info.numpy_type.kind == 'M':
                t = ffi.new('monetdbe_data_timestamp[]', work_column.count)
                cffi_objects.append(t)
                unit = np.datetime_data(column_values.dtype)[0].encode()
                p = ffi.from_buffer("int64_t*", column_values)

                lib.initialize_timestamp_array_from_numpy(
                    self._monetdbe_database, t, work_column.count, p, unit,
                    existing_type)
                work_column.data = t
            elif type_info.numpy_type.kind == 'U':
                # first massage the numpy array of unicode into a matrix of null terminated rows of bytes.
                m = ffi.from_buffer(
                    "bool*", column_values.mask) if np.ma.isMaskedArray(
                        column_values) else 0  # type: ignore[attr-defined]
                cffi_objects.append(m)
                v = np.char.encode(column_values).view('b').reshape(
                    (work_column.count, -1))
                v = np.c_[v, np.zeros(work_column.count, dtype=np.int8)]
                stride_length = v.shape[1]
                cffi_objects.append(v)
                t = ffi.new('char*[]', work_column.count)
                cffi_objects.append(t)
                p = ffi.from_buffer("char*", v)
                cffi_objects.append(p)
                lib.initialize_string_array_from_numpy(t, work_column.count, p,
                                                       stride_length,
                                                       ffi.cast("bool*", m))
                work_column.data = t
            else:
                p = ffi.from_buffer(f"{type_info.c_string_type}*",
                                    column_values)
                cffi_objects.append(p)
                work_column.data = p
            work_columns[column_num] = work_column
            work_objs.append(work_column)
        check_error(
            lib.monetdbe_append(self._monetdbe_database, schema.encode(),
                                table.encode(), work_columns, n_columns))
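
A small standalone sketch of the datetime branch above: the column's unit is read from its dtype and handed to the C helper as a byte string.

import numpy as np

column_values = np.array(['2015-10-31T12:00'], dtype='datetime64[ms]')
unit = np.datetime_data(column_values.dtype)[0].encode()
print(unit)  # b'ms'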
Example 9
np.can_cast(AR_f8, 1)  # E: incompatible type

np.vdot(AR_M, AR_M)  # E: incompatible type

np.copyto(AR_LIKE_f, AR_f8)  # E: incompatible type

np.putmask(AR_LIKE_f, [True, True, False], 1.5)  # E: incompatible type

np.packbits(AR_f8)  # E: incompatible type
np.packbits(AR_u1, bitorder=">")  # E: incompatible type

np.unpackbits(AR_i8)  # E: incompatible type
np.unpackbits(AR_u1, bitorder=">")  # E: incompatible type

np.shares_memory(1, 1, max_work=i8)  # E: incompatible type
np.may_share_memory(1, 1, max_work=i8)  # E: incompatible type

np.arange(M)  # E: No overload variant
np.arange(stop=10)  # E: No overload variant

np.datetime_data(int)  # E: incompatible type

np.busday_offset("2012", 10)  # E: incompatible type

np.datetime_as_string("2012")  # E: incompatible type

np.compare_chararrays("a", b"a", "==", False)  # E: No overload variant

np.add_docstring(func, None)  # E: incompatible type
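
For reference, the flagged np.datetime_data call also fails at runtime, since an integer dtype carries no datetime metadata (the exact exception and message may vary by NumPy version):

import numpy as np

try:
    np.datetime_data(np.dtype('int64'))
except (TypeError, ValueError) as err:
    print(err)  # e.g. "cannot get datetime metadata from non-datetime type"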
Example 10
reveal_type(np.shares_memory(1, 2))  # E: bool
reveal_type(np.shares_memory(AR_f8, AR_f8, max_work=1))  # E: bool

reveal_type(np.may_share_memory(1, 2))  # E: bool
reveal_type(np.may_share_memory(AR_f8, AR_f8, max_work=1))  # E: bool

reveal_type(np.geterrobj())  # E: list[Any]

reveal_type(np.seterrobj([8192, 521, None]))  # E: None

reveal_type(np.promote_types(np.int32, np.int64))  # E: numpy.dtype[Any]
reveal_type(np.promote_types("f4", float))  # E: numpy.dtype[Any]

reveal_type(np.frompyfunc(func, 1, 1, identity=None))  # E: numpy.ufunc

reveal_type(np.datetime_data("m8[D]"))  # E: Tuple[builtins.str, builtins.int]
reveal_type(np.datetime_data(
    np.datetime64))  # E: Tuple[builtins.str, builtins.int]
reveal_type(np.datetime_data(np.dtype(
    np.timedelta64)))  # E: Tuple[builtins.str, builtins.int]

reveal_type(np.busday_count("2011-01", "2011-02"))  # E: {int_}
reveal_type(np.busday_count(
    ["2011-01"], "2011-02"))  # E: numpy.ndarray[Any, numpy.dtype[{int_}]]

reveal_type(np.busday_offset(M, m))  # E: numpy.datetime64
reveal_type(np.busday_offset(M, 5))  # E: numpy.datetime64
reveal_type(np.busday_offset(
    AR_M, m))  # E: numpy.ndarray[Any, numpy.dtype[numpy.datetime64]]
reveal_type(np.busday_offset("2011-01", "2011-02",
                             roll="forward"))  # E: numpy.datetime64
Example 11
# In[47]:

import numpy as np
import datetime as dt  # needed below for nd.astype(dt.datetime)

# In[48]:

nd = np.datetime64('2015-10-31')
nd

# In[49]:

np.datetime_as_string(nd)

# In[50]:

np.datetime_data(nd)

# In[51]:

# d: a datetime object defined in an earlier cell of the notebook,
# e.g. d = dt.datetime(2015, 10, 31)
d

# In[52]:

nd = np.datetime64(d)
nd

# In[53]:

nd.astype(dt.datetime)

# In[54]:
Example 12
def h5_velocity_by_intervals_gen(
        cfg: Mapping[str, Any],
        cfg_out: Mapping[str, Any]) -> Iterator[Tuple[str, Tuple[Any, ...]]]:
    """
    Loads data and calculates velocity: many intervals from many of hdf5 tables sequentially.
    :param cfg: dict with fields:
        ['proc']['dt_interval'] - numpy.timedelta64 time interval of loading data
        one group of fields:
            1.  'split_period', pandas interval str, as required by intervals_from_period() to cover all data by it
                'overlap'

            2.  'time_intervals_start' - manually specified starts of intercals

    :param cfg_out: fields must be provided:
        - see h5_names_gen(cfg_in, cfg_out) requirements
    :return:
    """
    # Prepare cycle
    if cfg_out.get('split_period'):

        def gen_loaded(tbl):
            """
            Variant 1. Generate regular intervals (possibly with overlap)
            :param tbl:
            :return:
            """
            cfg['in']['table'] = tbl
            # To obtain ``t_intervals_start`` used in query inside gen_data_on_intervals(cfg_out, cfg)
            # we copy its content here:
            t_prev_interval_start, t_intervals_start = intervals_from_period(
                **cfg['in'], period=cfg_out['split_period'])
            if cfg['proc']['overlap']:
                dt_shifts = np.arange(
                    0, 1,
                    (1 - cfg['proc']['overlap'])) * pd_period_to_timedelta(
                        cfg_out['split_period'])
                t_intervals_start = (t_intervals_start.to_numpy(
                    dtype="datetime64[ns]")[np.newaxis].T +
                                     dt_shifts).flatten()
                if cfg['in']['max_date']:
                    idel = t_intervals_start.searchsorted(
                        np.datetime64(
                            cfg['in']['max_date'] -
                            pd_period_to_timedelta(cfg_out['split_period'])))
                    t_intervals_start = t_intervals_start[:idel]
                cfg['in'][
                    'time_intervals_start'] = t_intervals_start  # to save queried time - see main()
            cfg_filter = None
            cfg_in_columns_saved = cfg['in']['columns']
            for start_end in h5q_starts2coord(
                    cfg['in']['db_path'],
                    cfg['in']['table'],
                    t_intervals_start,
                    dt_interval=cfg['proc']['dt_interval']):
                a = h5_load_range_by_coord(**cfg['in'],
                                           range_coordinates=start_end)
                if cfg_filter is None:  # only 1 time
                    # correct columns if they are not an exact match, to speed up h5_load_range_by_coord() next time
                    cfg['in']['columns'] = a.columns  # temporary
                    # and exclude absent fields to avoid "no such column" filter warnings in filt_data_dd()
                    detect_filt = f"m(ax|in)_({'|'.join(cfg['in']['columns'])})"
                    cfg_filter = {
                        k: v
                        for k, v in cfg['filter'].items()
                        if re.match(detect_filt, k)
                    }
                d, i_burst = filt_data_dd(a, cfg['in']['dt_between_bursts'],
                                          cfg['in']['dt_hole_warning'],
                                          cfg_filter)

                n_bursts = len(i_burst)
                if n_bursts > 1:  # 1st is always 0
                    l.info('gaps found: (%s)! at %s', n_bursts - 1,
                           i_burst[1:] - 1)
                df0 = d.compute()
                if not len(df0):
                    continue
                start_end = df0.index[[0, -1]].values
                yield df0, start_end
            cfg['in'][
                'columns'] = cfg_in_columns_saved  # restore so as not to affect the next file

    else:
        query_range_pattern = "index>=Timestamp('{}') & index<=Timestamp('{}')"

        def gen_loaded(tbl):
            """
            Variant 2. Generate intervals at specified start values with same width cfg['proc']['dt_interval']
            :param tbl:
            :return:
            """
            for start_end in zip(
                    cfg['in']['time_intervals_start'],
                    cfg['in']['time_intervals_start'] +
                    cfg['proc']['dt_interval']):
                query_range_lims = pd.to_datetime(start_end)
                qstr = query_range_pattern.format(*query_range_lims)
                l.info('query:\n%s... ', qstr)
                df0 = store.select(tbl, where=qstr, columns=None)
                yield df0, start_end

    dt_interval_in_its_units = cfg['proc']['dt_interval'].astype(int)
    dt_interval_units = np.datetime_data(cfg['proc']['dt_interval'])[0]
    data_name_suffix = f'{dt_interval_in_its_units}{dt_interval_units}'

    # Cycle
    with pd.HDFStore(cfg['in']['db_path'], mode='r') as store:
        for (tbl, coefs) in h5_names_gen(cfg['in'], cfg_out):
            # Get data in ranges
            for df0, start_end in gen_loaded(tbl):
                if cfg['in']['db_path'].stem.endswith('proc_noAvg'):
                    df = df0
                else:  # loading source data needed to be processed to calc velocity
                    df0 = filter_local(df0, cfg['filter'])
                    df = incl_calc_velocity_nodask(df0,
                                                   **coefs,
                                                   cfg_filter=cfg['in'],
                                                   cfg_proc=cfg['proc'])

                data_name = f'{tbl}/PSD_{start_end[0]}{data_name_suffix}'
                yield (df, tbl, data_name)
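
A standalone sketch of the data_name_suffix built above from a numpy.timedelta64 interval:

import numpy as np

dt_interval = np.timedelta64(600, 's')
suffix = f'{dt_interval.astype(int)}{np.datetime_data(dt_interval.dtype)[0]}'
print(suffix)  # 600s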
Example 13
    def loc_to_iloc(
        cls,
        *,
        label_to_pos: tp.Dict[tp.Hashable, int],
        labels: np.ndarray,
        positions: np.ndarray,
        key: GetItemKeyType,
        offset: tp.Optional[int] = None,
        partial_selection: bool = False,
    ) -> GetItemKeyType:
        '''
        Note: all SF objects (Series, Index) need to be converted to basic types before being passed as `key` to this function.

        Args:
            offset: in the context of an IndexHierarchical, the iloc positions returned from this function need to be shifted.
            partial_selection: if True and key is an iterable of labels that includes labels not in the mapping, available matches will be returned rather than raising.
        Returns:
            An integer mapped slice, or GetItemKey type that is based on integers, compatible with TypeBlocks
        '''
        # NOTE: ILoc is handled prior to this call, in the Index._loc_to_iloc method
        offset_apply = offset is not None

        if key.__class__ is slice:
            if key == NULL_SLICE:
                if offset_apply:
                    # when offset is defined (even if it is zero), null slice is not sufficiently specific; need to convert to an explicit slice relative to the offset
                    return slice(offset,
                                 len(positions) + offset)  #type: ignore
                else:
                    return NULL_SLICE
            try:
                return slice(*cls.map_slice_args(
                    label_to_pos.get,  #type: ignore
                    key,
                    labels,
                    offset))
            except LocEmpty:
                return EMPTY_SLICE

        labels_is_dt64 = labels.dtype.kind == DTYPE_DATETIME_KIND

        if key.__class__ is np.datetime64:
            # if we have a single dt64, convert this to the key's unit and do a Boolean selection if the key is a less-granular unit
            if (labels.dtype == DTYPE_OBJECT and np.datetime_data(key.dtype)[0]
                    in DTYPE_OBJECTABLE_DT64_UNITS):  #type: ignore
                key = key.astype(DTYPE_OBJECT)  #type: ignore
            elif labels_is_dt64 and key.dtype < labels.dtype:  #type: ignore
                key = labels.astype(key.dtype) == key  #type: ignore
            # if not different type, keep it the same so as to do a direct, single element selection

        is_array = key.__class__ is np.ndarray
        is_list = isinstance(key, list)

        # can be an iterable of labels (keys) or an iterable of Booleans
        if is_array or is_list:
            if is_array and key.dtype.kind == DTYPE_DATETIME_KIND:  #type: ignore
                if (labels.dtype == DTYPE_OBJECT
                        and np.datetime_data(key.dtype)[0]
                        in DTYPE_OBJECTABLE_DT64_UNITS):  #type: ignore
                    # if key is dt64 and labels are object, then for objectable units we can convert key to object to permit matching in the AutoMap
                    # NOTE: tolist() is expected to be faster than astype object for smaller collections
                    key = key.tolist()  #type: ignore
                    is_array = False
                    is_list = True
                elif labels_is_dt64 and key.dtype < labels.dtype:  #type: ignore
                    # change the labels to the dt64 dtype, i.e., if the key is years, recast the labels as years, and do a Boolean selection of everything that matches each key
                    labels_ref = labels.astype(key.dtype)  # type: ignore
                    # NOTE: this is only correct if both key and labels are dt64, and key is a less granular unit, as the order in the key will not be used
                    # let Boolean key advance to next branch
                    key = reduce(OPERATORS['__or__'],
                                 (labels_ref == k
                                  for k in key))  # type: ignore

            if is_array and key.dtype == DTYPE_BOOL:  #type: ignore
                if offset_apply:
                    return positions[key] + offset
                return positions[key]

            # map labels to integer positions, return a list of integer positions
            # NOTE: we may miss the opportunity to identify contiguous keys and extract a slice
            # NOTE: we do more branching here to optimize performance
            if partial_selection:
                if offset_apply:
                    return [
                        label_to_pos[k] + offset for k in key
                        if k in label_to_pos
                    ]  #type: ignore
                return [label_to_pos[k] for k in key
                        if k in label_to_pos]  # type: ignore
            if offset_apply:
                return [label_to_pos[k] + offset for k in key]  #type: ignore
            return [label_to_pos[k] for k in key]  # type: ignore

        # if a single element (an integer, string, or date), we just get the integer out of the map
        if offset_apply:
            return label_to_pos[key] + offset  #type: ignore
        return label_to_pos[key]  #type: ignore
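
A minimal sketch of the dt64 matching logic above: a coarser key (here a year) selects all labels that fall within it via a Boolean comparison.

import numpy as np

labels = np.array(['2020-01-15', '2020-06-30', '2021-03-01'], dtype='datetime64[D]')
key = np.datetime64('2020', 'Y')
print(key.dtype < labels.dtype)         # True: 'Y' is less granular than 'D'
print(labels.astype(key.dtype) == key)  # [ True  True False]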
Example 14
    def _preprocess_host_value(self, value, dtype):
        valid = not cudf._lib.scalar._is_null_host_scalar(value)

        if isinstance(value, list):
            if dtype is not None:
                raise TypeError("Lists may not be cast to a different dtype")
            else:
                dtype = ListDtype.from_arrow(
                    pa.infer_type([value], from_pandas=True)
                )
                return value, dtype
        elif isinstance(dtype, ListDtype):
            if value not in {None, NA}:
                raise ValueError(f"Can not coerce {value} to ListDtype")
            else:
                return NA, dtype

        if isinstance(value, dict):
            if dtype is None:
                dtype = StructDtype.from_arrow(
                    pa.infer_type([value], from_pandas=True)
                )
            return value, dtype
        elif isinstance(dtype, StructDtype):
            if value not in {None, NA}:
                raise ValueError(f"Can not coerce {value} to StructDType")
            else:
                return NA, dtype

        if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
            value = pa.scalar(
                value, type=pa.decimal128(dtype.precision, dtype.scale)
            ).as_py()
        if isinstance(value, decimal.Decimal) and dtype is None:
            dtype = cudf.Decimal128Dtype._from_decimal(value)

        value = to_cudf_compatible_scalar(value, dtype=dtype)

        if dtype is None:
            if not valid:
                if isinstance(value, (np.datetime64, np.timedelta64)):
                    unit, _ = np.datetime_data(value)
                    if unit == "generic":
                        raise TypeError(
                            "Cant convert generic NaT to null scalar"
                        )
                    else:
                        dtype = value.dtype
                else:
                    raise TypeError(
                        "dtype required when constructing a null scalar"
                    )
            else:
                dtype = value.dtype

        if not isinstance(dtype, cudf.core.dtypes.DecimalDtype):
            dtype = cudf.dtype(dtype)

        if not valid:
            value = NA

        return value, dtype
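
A small sketch of the NaT handling above: a unit-less NaT reports 'generic' metadata and therefore cannot supply a dtype.

import numpy as np

print(np.datetime_data(np.datetime64('NaT').dtype))        # ('generic', 1) -> rejected
print(np.datetime_data(np.datetime64('NaT', 'ns').dtype))  # ('ns', 1)      -> usable dtype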
Example 15
def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
    """Create a Column from an arbitrary object

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from. See *Notes*.
    nan_as_null : bool, optional, default None
        If None (default), treats NaN values in arbitrary as null if there is
        no mask passed along with it. If True, combines the mask and NaNs to
        form a new validity mask. If False, leaves NaN values as is.
    dtype : optional
        Optionally typecast the constructed Column to the given
        dtype.
    length : int, optional
        If `arbitrary` is a scalar, broadcast into a Column of
        the given length.

    Returns
    -------
    A Column of the appropriate type and size.

    Notes
    -----
    Currently supported inputs are:

    * ``Column``
    * ``Series``
    * ``Index``
    * Scalars (can be broadcasted to a specified `length`)
    * Objects exposing ``__cuda_array_interface__`` (e.g., numba device arrays)
    * Objects exposing ``__array_interface__`` (e.g., numpy arrays)
    * pyarrow array
    * pandas.Categorical objects
    """

    from cudf.core.column import numerical, categorical, datetime, string
    from cudf.core.series import Series
    from cudf.core.index import Index

    if isinstance(arbitrary, ColumnBase):
        if dtype is not None:
            return arbitrary.astype(dtype)
        else:
            return arbitrary

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    # TODO: Remove nvstrings here when nvstrings is fully removed
    elif isinstance(arbitrary, nvstrings.nvstrings):
        byte_count = arbitrary.byte_count()
        if byte_count > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError("Cannot construct string columns "
                              "containing > {} bytes. "
                              "Consider using dask_cudf to partition "
                              "your data.".format(
                                  libcudfxx.MAX_STRING_COLUMN_BYTES_STR))
        sbuf = Buffer.empty(arbitrary.byte_count())
        obuf = Buffer.empty(
            (arbitrary.size() + 1) * np.dtype("int32").itemsize)

        nbuf = None
        if arbitrary.null_count() > 0:
            nbuf = create_null_mask(arbitrary.size(),
                                    state=MaskState.UNINITIALIZED)
            arbitrary.set_null_bitmask(nbuf.ptr, bdevmem=True)
        arbitrary.to_offsets(sbuf.ptr, obuf.ptr, None, bdevmem=True)
        children = (
            build_column(obuf, dtype="int32"),
            build_column(sbuf, dtype="int8"),
        )
        data = build_column(data=None,
                            dtype="object",
                            mask=nbuf,
                            children=children)
        data._nvstrings = arbitrary

    elif isinstance(arbitrary, Buffer):
        if dtype is None:
            raise TypeError(f"dtype cannot be None if 'arbitrary' is a Buffer")
        data = build_column(arbitrary, dtype=dtype)

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        desc = arbitrary.__cuda_array_interface__
        dtype = np.dtype(desc["typestr"])
        data = _data_from_cuda_array_interface_desc(arbitrary)
        mask = _mask_from_cuda_array_interface_desc(arbitrary)
        col = build_column(data, dtype=dtype, mask=mask)
        if np.issubdtype(col.dtype, np.floating):
            if nan_as_null or (mask is None and nan_as_null is None):
                mask = libcudfxx.transform.nans_to_nulls(col.fillna(np.nan))
                col = col.set_mask(mask)
        elif np.issubdtype(col.dtype, np.datetime64):
            if nan_as_null or (mask is None and nan_as_null is None):
                col = utils.time_col_replace_nulls(col)
        return col

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(
                arbitrary)
            children = (
                build_column(data=obuf, dtype="int32"),
                build_column(data=sbuf, dtype="int8"),
            )

            data = string.StringColumn(mask=nbuf,
                                       children=children,
                                       size=pa_size,
                                       offset=pa_offset)

        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = pd.api.types.pandas_dtype(dtype)
            if (type(dtype) == str and dtype == "empty") or dtype is None:
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype())

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            codes = as_column(arbitrary.indices)
            if isinstance(arbitrary.dictionary, pa.NullArray):
                categories = as_column([], dtype="object")
            else:
                categories = as_column(arbitrary.dictionary)
            dtype = CategoricalDtype(categories=categories,
                                     ordered=arbitrary.type.ordered)
            data = categorical.CategoricalColumn(
                dtype=dtype,
                mask=codes.base_mask,
                children=(codes, ),
                size=codes.size,
                offset=codes.offset,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype)

            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            raise NotImplementedError
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype="M8[ms]")
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=np.dtype("M8[ms]"),
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())

            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        else:
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
                mask=pamask,
                size=pa_size,
                offset=pa_offset,
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = ColumnBase._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(
                pa.array(np.asarray(arbitrary), from_pandas=True),
                dtype=arbitrary.dtype,
            )
        else:
            data = as_column(
                pa.array(arbitrary, from_pandas=nan_as_null),
                dtype=arbitrary.dtype,
            )

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        length = length or 1
        data = as_column(
            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype))
        if not nan_as_null:
            if np.issubdtype(data.dtype, np.floating):
                data = data.fillna(np.nan)
            elif np.issubdtype(data.dtype, np.datetime64):
                data = data.fillna(np.datetime64("NaT"))

    elif hasattr(arbitrary, "__array_interface__"):
        # CUDF assumes values are always contiguous
        desc = arbitrary.__array_interface__
        shape = desc["shape"]
        arb_dtype = np.dtype(desc["typestr"])
        if len(shape) > 1:
            raise ValueError("Data must be 1-dimensional")

        arbitrary = np.asarray(arbitrary)
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arb_dtype.kind == "M":

            time_unit, _ = np.datetime_data(arbitrary.dtype)
            cast_dtype = time_unit in ("D", "W", "M", "Y")

            if cast_dtype:
                arbitrary = arbitrary.astype(np.dtype("datetime64[s]"))

            buffer = Buffer(arbitrary)
            mask = None
            if nan_as_null:
                data = as_column(buffer,
                                 dtype=arbitrary.dtype,
                                 nan_as_null=nan_as_null)
                data = utils.time_col_replace_nulls(data)
                mask = data.mask

            data = datetime.DatetimeColumn(data=buffer,
                                           mask=mask,
                                           dtype=arbitrary.dtype)
        elif arb_dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary),
                             dtype=arbitrary.dtype)
        else:
            data = as_column(cupy.asarray(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.asarray(arbitrary),
                         dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        try:
            data = as_column(memoryview(arbitrary),
                             dtype=dtype,
                             nan_as_null=nan_as_null)
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(
                        arbitrary,
                        type=pa_type,
                        from_pandas=True
                        if nan_as_null is None else nan_as_null,
                    ),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    sr = pd.Series(arbitrary, dtype="category")
                    data = as_column(sr, nan_as_null=nan_as_null)
                elif np_type == np.str_:
                    sr = pd.Series(arbitrary, dtype="str")
                    data = as_column(sr, nan_as_null=nan_as_null)
                else:
                    data = as_column(
                        np.asarray(arbitrary, dtype=np.dtype(dtype)),
                        nan_as_null=nan_as_null,
                    )
    return data
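
A sketch of the datetime branch above: coarse calendar units are recast to second resolution before the column is built.

import numpy as np

arr = np.array(['2015-10', '2016-03'], dtype='datetime64[M]')
time_unit, _ = np.datetime_data(arr.dtype)
if time_unit in ('D', 'W', 'M', 'Y'):
    arr = arr.astype(np.dtype('datetime64[s]'))
print(arr.dtype)  # datetime64[s]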
Example 16
    def test_datetime(self):
        dt = np.datetime64('2000-01', ('M', 2))
        assert np.datetime_data(dt) == ('M', 2)

        with pytest.raises(TypeError):
            np.datetime64('2000', garbage=True)
Example 17
    def test_datetime(self):
        dt = np.datetime64("2000-01", ("M", 2))
        assert np.datetime_data(dt) == ("M", 2)

        with pytest.raises(TypeError):
            np.datetime64("2000", garbage=True)