def from_numpy_dtype(self, dt):
    """
    Build the datashape type that corresponds to a NumPy dtype.

    The dtype's name is first looked up with ``Type.lookup_type``.  When that
    lookup raises ``KeyError`` the dtype is classified by kind instead:

    * ``datetime64`` with unit ``D``, ``Y``, ``M`` or ``W`` gives ``date_``;
      any other unit gives ``datetime_``.
    * ``timedelta64`` gives ``TimeDelta`` carrying the dtype's unit.
    * unicode strings give ``String(n, 'U32')`` where ``n`` is the item size
      divided by 4 (NumPy stores 4 bytes per code point).
    * byte strings give ``String(itemsize, 'ascii')``.

    >>> from datashape import CType
    >>> from numpy import dtype
    >>> CType.from_numpy_dtype(dtype('int32'))
    ctype("int32")
    >>> CType.from_numpy_dtype(dtype('i8'))
    ctype("int64")
    >>> CType.from_numpy_dtype(dtype('M8'))
    DateTime(None)
    >>> CType.from_numpy_dtype(dtype('U30'))
    ctype("string[30, 'U32']")

    Raises
    ------
    NotImplementedError
        If the dtype matches none of the cases above.
    """
    try:
        return Type.lookup_type(dt.name)
    except KeyError:
        pass

    # ``np.unicode_`` was removed in NumPy 2.0; fall back to ``np.str_`` when
    # the alias is missing so the unicode branch keeps working everywhere.
    unicode_type = getattr(np, 'unicode_', np.str_)

    if np.issubdtype(dt, np.datetime64):
        unit, _ = np.datetime_data(dt)
        defaults = {'D': date_, 'Y': date_, 'M': date_, 'W': date_}
        return defaults.get(unit, datetime_)
    elif np.issubdtype(dt, np.timedelta64):
        unit, _ = np.datetime_data(dt)
        return TimeDelta(unit=unit)
    elif np.issubdtype(dt, unicode_type):
        return String(dt.itemsize // 4, 'U32')
    elif np.issubdtype(dt, np.str_) or np.issubdtype(dt, np.bytes_):
        return String(dt.itemsize, 'ascii')
    raise NotImplementedError("NumPy datatype %s not supported" % dt)
def _validate_date_like_dtype(dtype): try: typ = np.datetime_data(dtype)[0] except ValueError as e: raise TypeError('%s' % e) if typ != 'generic' and typ != 'ns': raise ValueError('%r is too specific of a frequency, try passing %r' % (dtype.name, dtype.type.__name__))
def _datetime_metadata_str(dtype): # TODO: this duplicates the C append_metastr_to_string unit, count = np.datetime_data(dtype) if unit == 'generic': return '' elif count == 1: return '[{}]'.format(unit) else: return '[{}{}]'.format(count, unit)
def series_to_array(s, dshape=None, **kwargs):
    """
    Return the values of series ``s`` cast to the NumPy dtype of ``dshape``.

    The values are returned uncast when the series already has the target
    dtype, when it is a datetime64 series whose unit is listed in
    ``higher_precision_freqs``, or when the cast raises ``ValueError``.
    Extra keyword arguments are accepted and ignored.
    """
    target_dtype = dshape_to_numpy(datashape.dshape(dshape))
    source_dtype = s.dtype
    raw = s.values

    # don't lose precision of datetime64 more precise than microseconds
    keeps_precision = (
        issubclass(source_dtype.type, np.datetime64)
        and np.datetime_data(source_dtype)[0] in higher_precision_freqs
    )
    if keeps_precision or s.dtype == target_dtype:
        return raw

    try:
        converted = raw.astype(target_dtype)
    except ValueError:
        # object series and record dshape, e.g., a frame row
        return raw
    return converted
def _validate_date_like_dtype(dtype): """ Check whether the dtype is a date-like dtype. Raises an error if invalid. Parameters ---------- dtype : dtype, type The dtype to check. Raises ------ TypeError : The dtype could not be casted to a date-like dtype. ValueError : The dtype is an illegal date-like dtype (e.g. the the frequency provided is too specific) """ try: typ = np.datetime_data(dtype)[0] except ValueError as e: raise TypeError('%s' % e) if typ != 'generic' and typ != 'ns': raise ValueError('%r is too specific of a frequency, try passing %r' % (dtype.name, dtype.type.__name__))
def _validate_date_like_dtype(dtype) -> None: """ Check whether the dtype is a date-like dtype. Raises an error if invalid. Parameters ---------- dtype : dtype, type The dtype to check. Raises ------ TypeError : The dtype could not be casted to a date-like dtype. ValueError : The dtype is an illegal date-like dtype (e.g. the frequency provided is too specific) """ try: typ = np.datetime_data(dtype)[0] except ValueError as e: raise TypeError(e) from e if typ not in ["generic", "ns"]: raise ValueError( f"{repr(dtype.name)} is too specific of a frequency, " f"try passing {repr(dtype.type.__name__)}" )
def to_datetime(
    arg,
    errors="raise",
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit="ns",
    infer_datetime_format=False,
    origin="unix",
    cache=True,
):
    """
    Convert argument to datetime.

    Parameters
    ----------
    arg : int, float, str, datetime, list, tuple, 1-d array,
        Series DataFrame/dict-like
        The object to convert to a datetime.
    errors : {'ignore', 'raise', 'coerce', 'warn'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaT.
        - If 'warn' : prints last exceptions as warnings and
            return the input.
        - If 'ignore', then invalid parsing will return the input.
    dayfirst : bool, default False
        Specify a date parse order if `arg` is str or its list-likes.
        If True, parses dates with the day first, eg 10/11/12 is parsed as
        2012-11-10.
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug, based on dateutil behavior).
    yearfirst : bool, default False
        Not supported: any truthy value raises ``NotImplementedError``.
    utc : bool, default None
        Accepted for signature compatibility; not used by this
        implementation.
    format : str, default None
        The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
        all the way up to nanoseconds.
        See strftime documentation for more information on choices:
        https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
    exact : bool, default True
        Only the default is supported: ``exact=False`` raises
        ``NotImplementedError``.
    unit : str, default 'ns'
        The unit of the arg (D,s,ms,us,ns) denote the unit, which is an
        integer or float number. This will be based off the
        origin(unix epoch start).
        Example, with unit='ms' and origin='unix' (the default), this
        would calculate the number of milliseconds to the unix epoch start.
    infer_datetime_format : bool, default False
        If True and no `format` is given, attempt to infer the format of the
        datetime strings, and if it can be inferred, switch to a faster
        method of parsing them. In some cases this can increase the parsing
        speed by ~5-10x.
    origin : str, default 'unix'
        Only ``'unix'`` is supported: any other value raises
        ``NotImplementedError``.
    cache : bool, default True
        Accepted for signature compatibility; not used by this
        implementation.

    Returns
    -------
    datetime
        If parsing succeeded.
        Return type depends on input:
        - list-like: DatetimeIndex
        - Series: Series of datetime64 dtype
        - scalar: Timestamp
        ``None`` is returned unchanged when `arg` is ``None``.

    Raises
    ------
    NotImplementedError
        For ``exact=False``, ``origin != "unix"`` or a truthy ``yearfirst``.
        These checks run before, and independently of, `errors`.

    Examples
    --------
    Assembling a datetime from multiple columns of a DataFrame. The keys can be
    common abbreviations like ['year', 'month', 'day', 'minute', 'second',
    'ms', 'us', 'ns']) or plurals of the same

    >>> import cudf
    >>> df = cudf.DataFrame({'year': [2015, 2016],
    ...                    'month': [2, 3],
    ...                    'day': [4, 5]})
    >>> cudf.to_datetime(df)
    0   2015-02-04
    1   2016-03-05
    dtype: datetime64[ns]
    >>> cudf.to_datetime(1490195805, unit='s')
    numpy.datetime64('2017-03-22T15:16:45.000000000')
    >>> cudf.to_datetime(1490195805433502912, unit='ns')
    numpy.datetime64('1780-11-20T01:02:30.494253056')
    """
    if arg is None:
        return None

    if exact is False:
        raise NotImplementedError("exact support is not yet implemented")

    if origin != "unix":
        raise NotImplementedError("origin support is not yet implemented")

    if yearfirst:
        raise NotImplementedError("yearfirst support is not yet implemented")

    try:
        if isinstance(arg, cudf.DataFrame):
            # we require at least Ymd
            required = ["year", "month", "day"]
            req = list(set(required) - set(arg._data.names))
            if len(req):
                req = ",".join(req)
                raise ValueError(
                    f"to assemble mappings requires at least that "
                    f"[year, month, day] be specified: [{req}] "
                    f"is missing"
                )

            # replace passed column name with values in _unit_map
            # NOTE: kept in its own name so the ``unit`` parameter is not
            # shadowed; the ``errors="coerce"`` handler below still needs the
            # caller's unit string to build the NaT it returns.
            unit_names = {k: get_units(k) for k in arg._data.names}
            unit_rev = {v: k for k, v in unit_names.items()}

            # keys we don't recognize
            excess = set(unit_rev.keys()) - set(_unit_map.values())
            if len(excess):
                excess = ",".join(excess)
                raise ValueError(
                    f"extra keys have been passed to the "
                    f"datetime assemblage: [{excess}]"
                )

            # Build "Y-mm-dd" strings and parse them with a fixed format.
            new_series = (
                arg[unit_rev["year"]].astype("str")
                + "-"
                + arg[unit_rev["month"]].astype("str").str.zfill(2)
                + "-"
                + arg[unit_rev["day"]].astype("str").str.zfill(2)
            )
            format = "%Y-%m-%d"
            col = new_series._column.as_datetime_column(
                "datetime64[s]", format=format
            )

            # Switch to nanosecond resolution as soon as any time component
            # is a float column or an object column with non-integer values.
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    arg_col = arg._data[value]
                    if arg_col.dtype.kind in ("f"):
                        col = new_series._column.as_datetime_column(
                            "datetime64[ns]", format=format
                        )
                        break
                    elif arg_col.dtype.kind in ("O"):
                        if not cpp_is_integer(arg_col).all():
                            col = new_series._column.as_datetime_column(
                                "datetime64[ns]", format=format
                            )
                            break

            # Accumulate the time-of-day components, each scaled to the
            # resolution chosen for ``col``.
            times_column = None
            for u in ["h", "m", "s", "ms", "us", "ns"]:
                value = unit_rev.get(u)
                if value is not None and value in arg:
                    current_col = arg._data[value]
                    # If the arg[value] is of int or
                    # float dtype we don't want to type-cast
                    if current_col.dtype.kind in ("O"):
                        try:
                            current_col = current_col.astype(dtype="int64")
                        except ValueError:
                            current_col = current_col.astype(dtype="float64")

                    factor = as_device_scalar(
                        column.datetime._numpy_to_pandas_conversion[u]
                        / (
                            column.datetime._numpy_to_pandas_conversion["s"]
                            if np.datetime_data(col.dtype)[0] == "s"
                            else 1
                        )
                    )

                    if times_column is None:
                        times_column = current_col * factor
                    else:
                        times_column = times_column + (current_col * factor)
            if times_column is not None:
                col = (col.astype(dtype="int64") + times_column).astype(
                    dtype=col.dtype
                )
            return cudf.Series(col, index=arg.index)
        elif isinstance(arg, cudf.Index):
            col = arg._values
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return as_index(col, name=arg.name)
        elif isinstance(arg, cudf.Series):
            col = arg._column
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )
            return cudf.Series(col, index=arg.index, name=arg.name)
        else:
            col = column.as_column(arg)
            col = _process_col(
                col=col,
                unit=unit,
                dayfirst=dayfirst,
                infer_datetime_format=infer_datetime_format,
                format=format,
            )

            if is_scalar(arg):
                return col[0]
            else:
                return as_index(col)
    except Exception:
        # Deliberate top-level boundary: ``errors`` selects what happens to
        # any failure raised while converting.
        if errors == "raise":
            raise
        elif errors == "warn":
            import traceback

            tb = traceback.format_exc()
            warnings.warn(tb)
        elif errors == "ignore":
            pass
        elif errors == "coerce":
            return np.datetime64("nat", "ns" if unit is None else unit)
        return arg
def append(self, table: str, data: Mapping[str, np.ndarray], schema: str = 'sys') -> None:
    """
    Directly append an array structure

    :param table: name of the table to append to
    :param data: mapping from column name to the values to append; its keys
        must be exactly the table's existing column names
    :param schema: schema holding the table
    :raises exceptions.ProgrammingError: when the keys of ``data`` do not
        match the existing column names
    :raises ValueError: when a column cannot be converted to the numpy type
        registered for the existing column type
    """
    self._switch()
    n_columns = len(data)
    existing_columns = list(self.get_columns(schema=schema, table=table))
    existing_names, existing_types = zip(*existing_columns)
    if not set(existing_names) == set(data.keys()):
        error = f"Appended column names ({', '.join(str(i) for i in data.keys())}) " \
                f"don't match existing column names ({', '.join(existing_names)})"
        raise exceptions.ProgrammingError(error)

    work_columns = ffi.new(f'monetdbe_column * [{n_columns}]')
    work_objs = []
    # cffi_objects assists to keep all in-memory native data structure alive during the execution of this call
    cffi_objects = list()
    for column_num, (column_name, existing_type) in enumerate(existing_columns):
        column_values = data[column_name]
        work_column = ffi.new('monetdbe_column *')
        type_info = numpy_monetdb_map(column_values.dtype)

        # try to convert the values if types don't match
        if type_info.c_type != existing_type:
            if type_info.c_type == lib.monetdbe_timestamp and existing_type == lib.monetdbe_date and np.issubdtype(
                    column_values.dtype, np.datetime64):
                # We are going to cast to a monetdbe_date and
                # consider monetdbe_timestamp as a 'base type' to signal this.
                type_info = timestamp_to_date()
            else:
                precision_warning(type_info.c_type, existing_type)
                to_numpy_type = monet_c_type_map[existing_type].numpy_type
                try:
                    column_values = column_values.astype(to_numpy_type)
                    type_info = numpy_monetdb_map(column_values.dtype)
                except Exception as e:
                    existing_type_string = monet_c_type_map[existing_type].c_string_type
                    error = f"Can't convert '{type_info.c_string_type}' " \
                            f"to type '{existing_type_string}' for column '{column_name}': {e} "
                    raise ValueError(error) from e

        work_column.type = type_info.c_type
        work_column.count = column_values.shape[0]
        # Storing an ffi.new() result in a struct field does NOT keep the
        # allocation alive: the buffer is freed as soon as the owning cdata
        # object is garbage collected.  Hold a reference until the append
        # call below has completed.
        name_buffer = ffi.new('char[]', column_name.encode())
        cffi_objects.append(name_buffer)
        work_column.name = name_buffer
        if type_info.numpy_type.kind == 'M':
            t = ffi.new('monetdbe_data_timestamp[]', work_column.count)
            cffi_objects.append(t)
            unit = np.datetime_data(column_values.dtype)[0].encode()
            p = ffi.from_buffer("int64_t*", column_values)

            lib.initialize_timestamp_array_from_numpy(
                self._monetdbe_database, t, work_column.count, p, unit, existing_type)
            work_column.data = t
        elif type_info.numpy_type.kind == 'U':
            # first massage the numpy array of unicode into a matrix of null terminated rows of bytes.
            m = ffi.from_buffer(
                "bool*", column_values.mask) if np.ma.isMaskedArray(
                column_values) else 0  # type: ignore[attr-defined]
            cffi_objects.append(m)
            v = np.char.encode(column_values).view('b').reshape(
                (work_column.count, -1))
            v = np.c_[v, np.zeros(work_column.count, dtype=np.int8)]
            stride_length = v.shape[1]
            cffi_objects.append(v)
            t = ffi.new('char*[]', work_column.count)
            cffi_objects.append(t)
            p = ffi.from_buffer("char*", v)
            cffi_objects.append(p)
            lib.initialize_string_array_from_numpy(t, work_column.count, p, stride_length, ffi.cast("bool*", m))
            work_column.data = t
        else:
            p = ffi.from_buffer(f"{type_info.c_string_type}*", column_values)
            cffi_objects.append(p)
            work_column.data = p
        work_columns[column_num] = work_column
        work_objs.append(work_column)
    check_error(
        lib.monetdbe_append(self._monetdbe_database, schema.encode(), table.encode(), work_columns, n_columns))
np.can_cast(AR_f8, 1) # E: incompatible type np.vdot(AR_M, AR_M) # E: incompatible type np.copyto(AR_LIKE_f, AR_f8) # E: incompatible type np.putmask(AR_LIKE_f, [True, True, False], 1.5) # E: incompatible type np.packbits(AR_f8) # E: incompatible type np.packbits(AR_u1, bitorder=">") # E: incompatible type np.unpackbits(AR_i8) # E: incompatible type np.unpackbits(AR_u1, bitorder=">") # E: incompatible type np.shares_memory(1, 1, max_work=i8) # E: incompatible type np.may_share_memory(1, 1, max_work=i8) # E: incompatible type np.arange(M) # E: No overload variant np.arange(stop=10) # E: No overload variant np.datetime_data(int) # E: incompatible type np.busday_offset("2012", 10) # E: incompatible type np.datetime_as_string("2012") # E: incompatible type np.compare_chararrays("a", b"a", "==", False) # E: No overload variant np.add_docstring(func, None) # E: incompatible type
reveal_type(np.shares_memory(1, 2)) # E: bool reveal_type(np.shares_memory(AR_f8, AR_f8, max_work=1)) # E: bool reveal_type(np.may_share_memory(1, 2)) # E: bool reveal_type(np.may_share_memory(AR_f8, AR_f8, max_work=1)) # E: bool reveal_type(np.geterrobj()) # E: list[Any] reveal_type(np.seterrobj([8192, 521, None])) # E: None reveal_type(np.promote_types(np.int32, np.int64)) # E: numpy.dtype[Any] reveal_type(np.promote_types("f4", float)) # E: numpy.dtype[Any] reveal_type(np.frompyfunc(func, 1, 1, identity=None)) # numpy.ufunc reveal_type(np.datetime_data("m8[D]")) # E: Tuple[builtins.str, builtins.int] reveal_type(np.datetime_data( np.datetime64)) # E: Tuple[builtins.str, builtins.int] reveal_type(np.datetime_data(np.dtype( np.timedelta64))) # E: Tuple[builtins.str, builtins.int] reveal_type(np.busday_count("2011-01", "2011-02")) # E: {int_} reveal_type(np.busday_count( ["2011-01"], "2011-02")) # E: numpy.ndarray[Any, numpy.dtype[{int_}]] reveal_type(np.busday_offset(M, m)) # E: numpy.datetime64 reveal_type(np.busday_offset(M, 5)) # E: numpy.datetime64 reveal_type(np.busday_offset( AR_M, m)) # E: numpy.ndarray[Any, numpy.dtype[numpy.datetime64]] reveal_type(np.busday_offset("2011-01", "2011-02", roll="forward")) # E: numpy.datetime64
# In[47]: import numpy as np # In[48]: nd = np.datetime64('2015-10-31') nd # In[49]: np.datetime_as_string(nd) # In[50]: np.datetime_data(nd) # In[51]: d # In[52]: nd = np.datetime64(d) nd # In[53]: nd.astype(dt.datetime) # In[54]:
def h5_velocity_by_intervals_gen( cfg: Mapping[str, Any], cfg_out: Mapping[str, Any]) -> Iterator[Tuple[str, Tuple[Any, ...]]]: """ Loads data and calculates velocity: many intervals from many of hdf5 tables sequentially. :param cfg: dict with fields: ['proc']['dt_interval'] - numpy.timedelta64 time interval of loading data one group of fields: 1. 'split_period', pandas interval str, as required by intervals_from_period() to cover all data by it 'overlap' 2. 'time_intervals_start' - manually specified starts of intercals :param cfg_out: fields must be provided: - see h5_names_gen(cfg_in, cfg_out) requirements :return: """ # Prepare cycle if cfg_out.get('split_period'): def gen_loaded(tbl): """ Variant 1. Generate regular intervals (may be with overlap) :param tbl: :return: """ cfg['in']['table'] = tbl # To obtain ``t_intervals_start`` used in query inside gen_data_on_intervals(cfg_out, cfg) # we copy its content here: t_prev_interval_start, t_intervals_start = intervals_from_period( **cfg['in'], period=cfg_out['split_period']) if cfg['proc']['overlap']: dt_shifts = np.arange( 0, 1, (1 - cfg['proc']['overlap'])) * pd_period_to_timedelta( cfg_out['split_period']) t_intervals_start = (t_intervals_start.to_numpy( dtype="datetime64[ns]")[np.newaxis].T + dt_shifts).flatten() if cfg['in']['max_date']: idel = t_intervals_start.searchsorted( np.datetime64( cfg['in']['max_date'] - pd_period_to_timedelta(cfg_out['split_period']))) t_intervals_start = t_intervals_start[:idel] cfg['in'][ 'time_intervals_start'] = t_intervals_start # to save queried time - see main() cfg_filter = None cfg_in_columns_saved = cfg['in']['columns'] for start_end in h5q_starts2coord( cfg['in']['db_path'], cfg['in']['table'], t_intervals_start, dt_interval=cfg['proc']['dt_interval']): a = h5_load_range_by_coord(**cfg['in'], range_coordinates=start_end) if cfg_filter is None: # only 1 time # corrects columns if they are not exact mutch to faster h5_load_range_by_coord() next time cfg['in']['columns'] = 
a.columns # temporary # and exclude absent fields to not filter warning of no such column in filt_data_dd() detect_filt = f"m(ax|in)_({'|'.join(cfg['in']['columns'])})" cfg_filter = { k: v for k, v in cfg['filter'].items() if re.match(detect_filt, k) } d, i_burst = filt_data_dd(a, cfg['in']['dt_between_bursts'], cfg['in']['dt_hole_warning'], cfg_filter) n_bursts = len(i_burst) if n_bursts > 1: # 1st is always 0 l.info('gaps found: (%s)! at %s', n_bursts - 1, i_burst[1:] - 1) df0 = d.compute() if not len(df0): continue start_end = df0.index[[0, -1]].values yield df0, start_end cfg['in'][ 'columns'] = cfg_in_columns_saved # recover to not affect next file else: query_range_pattern = "index>=Timestamp('{}') & index<=Timestamp('{}')" def gen_loaded(tbl): """ Variant 2. Generate intervals at specified start values with same width cfg['proc']['dt_interval'] :param tbl: :return: """ for start_end in zip( cfg['in']['time_intervals_start'], cfg['in']['time_intervals_start'] + cfg['proc']['dt_interval']): query_range_lims = pd.to_datetime(start_end) qstr = query_range_pattern.format(*query_range_lims) l.info(f'query:\n%s... ', qstr) df0 = store.select(tbl, where=qstr, columns=None) yield df0, start_end dt_interval_in_its_units = cfg['proc']['dt_interval'].astype(int) dt_interval_units = np.datetime_data(cfg['proc']['dt_interval'])[0] data_name_suffix = f'{dt_interval_in_its_units}{dt_interval_units}' # Cycle with pd.HDFStore(cfg['in']['db_path'], mode='r') as store: for (tbl, coefs) in h5_names_gen(cfg['in'], cfg_out): # Get data in ranges for df0, start_end in gen_loaded(tbl): if cfg['in']['db_path'].stem.endswith('proc_noAvg'): df = df0 else: # loading source data needed to be processed to calc velocity df0 = filter_local(df0, cfg['filter']) df = incl_calc_velocity_nodask(df0, **coefs, cfg_filter=cfg['in'], cfg_proc=cfg['proc']) data_name = f'{tbl}/PSD_{start_end[0]}{data_name_suffix}' yield (df, tbl, data_name)
def loc_to_iloc(
        cls,
        *,
        label_to_pos: tp.Dict[tp.Hashable, int],
        labels: np.ndarray,
        positions: np.ndarray,
        key: GetItemKeyType,
        offset: tp.Optional[int] = None,
        partial_selection: bool = False,
        ) -> GetItemKeyType:
    '''
    Translate a label-based selection ``key`` into an integer-based selection.

    Note: all SF objects (Series, Index) need to be converted to basic types before being passed as `key` to this function.

    Args:
        label_to_pos: mapping from label to integer position.
        labels: array of labels; its dtype decides how datetime64 keys are matched.
        positions: array of integer positions, indexed with Boolean keys and used for the length of a null slice.
        key: a slice, a single label, a single np.datetime64, or a list / array of labels or Booleans.
        offset: in the context of an IndexHierarchical, the iloc positions returned from this function need to be shifted.
        partial_selection: if True and key is an iterable of labels that includes labels not in the mapping, available matches will be returned rather than raising.

    Returns:
        An integer mapped slice, or GetItemKey type that is based on integers, compatible with TypeBlocks

    Raises:
        KeyError: presumably, when a label is missing from ``label_to_pos`` and ``partial_selection`` is False (plain ``label_to_pos[...]`` lookups are used).
    '''
    # NOTE: ILoc is handled prior to this call, in the Index._loc_to_iloc method
    offset_apply = not offset is None

    # slices: translate the bounds; a LocEmpty from the translation yields an empty slice
    if key.__class__ is slice:
        if key == NULL_SLICE:
            if offset_apply:
                # when offset is defined (even if it is zero), null slice is not sufficiently specific; need to convert to an explicit slice relative to the offset
                return slice(offset, len(positions) + offset) #type: ignore
            else:
                return NULL_SLICE
        try:
            return slice(*cls.map_slice_args(
                    label_to_pos.get, #type: ignore
                    key,
                    labels,
                    offset))
        except LocEmpty:
            return EMPTY_SLICE

    labels_is_dt64 = labels.dtype.kind == DTYPE_DATETIME_KIND

    if key.__class__ is np.datetime64:
        # if we have a single dt64, convert this to the key's unit and do a Boolean selection if the key is a less-granular unit
        if (labels.dtype == DTYPE_OBJECT
                and np.datetime_data(key.dtype)[0] in DTYPE_OBJECTABLE_DT64_UNITS): #type: ignore
            key = key.astype(DTYPE_OBJECT) #type: ignore
        elif labels_is_dt64 and key.dtype < labels.dtype: #type: ignore
            # key becomes a Boolean array here and is handled by the array branch below
            key = labels.astype(key.dtype) == key #type: ignore
        # if not different type, keep it the same so as to do a direct, single element selection

    # evaluated after the dt64 handling above, which may have replaced key with an array
    is_array = key.__class__ is np.ndarray
    is_list = isinstance(key, list)

    # can be an iterable of labels (keys) or an iterable of Booleans
    if is_array or is_list:
        if is_array and key.dtype.kind == DTYPE_DATETIME_KIND: #type: ignore
            if (labels.dtype == DTYPE_OBJECT
                    and np.datetime_data(key.dtype)[0] in DTYPE_OBJECTABLE_DT64_UNITS): #type: ignore
                # if key is dt64 and labels are object, then for objectable units we can convert key to object to permit matching in the AutoMap
                # NOTE: tolist() is expected to be faster than astype object for smaller collections
                key = key.tolist() #type: ignore
                is_array = False
                is_list = True
            elif labels_is_dt64 and key.dtype < labels.dtype: #type: ignore
                # change the labels to the dt64 dtype, i.e., if the key is years, recast the labels as years, and do a Boolean selection of everything that matches each key
                labels_ref = labels.astype(key.dtype) # type: ignore
                # NOTE: this is only correct if both key and labels are dt64, and key is a less granular unit, as the order in the key and will not be used
                # let Boolean key advance to next branch
                key = reduce(OPERATORS['__or__'], (labels_ref == k for k in key)) # type: ignore

        # Boolean array: select positions directly
        if is_array and key.dtype == DTYPE_BOOL: #type: ignore
            if offset_apply:
                return positions[key] + offset
            return positions[key]

        # map labels to integer positions, return a list of integer positions
        # NOTE: we may miss the opportunity to identify contiguous keys and extract a slice
        # NOTE: we do more branching here to optimize performance
        if partial_selection:
            if offset_apply:
                return [label_to_pos[k] + offset for k in key if k in label_to_pos] #type: ignore
            return [label_to_pos[k] for k in key if k in label_to_pos] # type: ignore
        if offset_apply:
            return [label_to_pos[k] + offset for k in key] #type: ignore
        return [label_to_pos[k] for k in key] # type: ignore

    # if a single element (an integer, string, or date, we just get the integer out of the map
    if offset_apply:
        return label_to_pos[key] + offset #type: ignore
    return label_to_pos[key] #type: ignore
def _preprocess_host_value(self, value, dtype):
    """
    Normalise a host scalar and its dtype into a ``(value, dtype)`` pair.

    Lists and dicts get their dtype inferred through pyarrow when none is
    given; a list combined with an explicit dtype is rejected.  A list or
    struct dtype accepts only ``None``/``NA`` as a non-container value and
    yields ``NA``.  Every other value is converted with
    ``to_cudf_compatible_scalar`` and, when no dtype was given, takes the
    dtype of the converted value.  Null values are returned as ``NA``.

    Raises
    ------
    TypeError
        For a list with an explicit dtype, for a null value without a dtype,
        or for a NaT with the generic unit and no dtype.
    ValueError
        When a non-null value is paired with a list or struct dtype.
    """
    is_valid = not cudf._lib.scalar._is_null_host_scalar(value)

    # --- list values / list dtypes -------------------------------------
    if isinstance(value, list):
        if dtype is not None:
            raise TypeError("Lists may not be cast to a different dtype")
        list_dtype = ListDtype.from_arrow(
            pa.infer_type([value], from_pandas=True)
        )
        return value, list_dtype
    if isinstance(dtype, ListDtype):
        if value not in {None, NA}:
            raise ValueError(f"Can not coerce {value} to ListDtype")
        return NA, dtype

    # --- dict values / struct dtypes -----------------------------------
    if isinstance(value, dict):
        if dtype is None:
            dtype = StructDtype.from_arrow(
                pa.infer_type([value], from_pandas=True)
            )
        return value, dtype
    if isinstance(dtype, StructDtype):
        if value not in {None, NA}:
            raise ValueError(f"Can not coerce {value} to StructDType")
        return NA, dtype

    # --- decimals -------------------------------------------------------
    if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
        arrow_type = pa.decimal128(dtype.precision, dtype.scale)
        value = pa.scalar(value, type=arrow_type).as_py()
    if isinstance(value, decimal.Decimal) and dtype is None:
        dtype = cudf.Decimal128Dtype._from_decimal(value)

    value = to_cudf_compatible_scalar(value, dtype=dtype)

    # --- infer a missing dtype from the converted value -----------------
    if dtype is None:
        if is_valid:
            dtype = value.dtype
        elif isinstance(value, (np.datetime64, np.timedelta64)):
            unit, _ = np.datetime_data(value)
            if unit == "generic":
                raise TypeError("Cant convert generic NaT to null scalar")
            dtype = value.dtype
        else:
            raise TypeError("dtype required when constructing a null scalar")

    if not isinstance(dtype, cudf.core.dtypes.DecimalDtype):
        dtype = cudf.dtype(dtype)

    return (value if is_valid else NA), dtype
def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
    """Create a Column from an arbitrary object

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from. See *Notes*.
    nan_as_null : bool, optional, default None
        If None (default), treats NaN values in arbitrary as null if there is
        no mask passed along with it. If True, combines the mask and NaNs to
        form a new validity mask. If False, leaves NaN values as is.
    dtype : optional
        Optionally typecast the constructed Column to the given
        dtype.
    length : int, optional
        If `arbitrary` is a scalar, broadcast into a Column of
        the given length.

    Returns
    -------
    A Column of the appropriate type and size.

    Raises
    ------
    MemoryError
        If an nvstrings input holds more bytes than a string column may.
    TypeError
        If `arbitrary` is a Buffer and no `dtype` is given.
    ValueError
        If an ``__array_interface__`` input has more than one dimension.
    NotImplementedError
        If `arbitrary` is a ``pyarrow.Date64Array``.

    Notes
    -----
    Currently supported inputs are:

    * ``Column``
    * ``Series``
    * ``Index``
    * Scalars (can be broadcasted to a specified `length`)
    * Objects exposing ``__cuda_array_interface__`` (e.g., numba device arrays)
    * Objects exposing ``__array_interface__``(e.g., numpy arrays)
    * pyarrow array
    * pandas.Categorical objects
    """

    from cudf.core.column import numerical, categorical, datetime, string
    from cudf.core.series import Series
    from cudf.core.index import Index

    if isinstance(arbitrary, ColumnBase):
        if dtype is not None:
            return arbitrary.astype(dtype)
        else:
            return arbitrary

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)

    # TODO: Remove nvstrings here when nvstrings is fully removed
    elif isinstance(arbitrary, nvstrings.nvstrings):
        byte_count = arbitrary.byte_count()
        if byte_count > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError("Cannot construct string columns "
                              "containing > {} bytes. "
                              "Consider using dask_cudf to partition "
                              "your data.".format(
                                  libcudfxx.MAX_STRING_COLUMN_BYTES_STR))
        sbuf = Buffer.empty(arbitrary.byte_count())
        obuf = Buffer.empty(
            (arbitrary.size() + 1) * np.dtype("int32").itemsize)

        nbuf = None
        if arbitrary.null_count() > 0:
            nbuf = create_null_mask(arbitrary.size(),
                                    state=MaskState.UNINITIALIZED)
            arbitrary.set_null_bitmask(nbuf.ptr, bdevmem=True)
        arbitrary.to_offsets(sbuf.ptr, obuf.ptr, None, bdevmem=True)
        children = (
            build_column(obuf, dtype="int32"),
            build_column(sbuf, dtype="int8"),
        )
        data = build_column(data=None,
                            dtype="object",
                            mask=nbuf,
                            children=children)
        data._nvstrings = arbitrary

    elif isinstance(arbitrary, Buffer):
        if dtype is None:
            raise TypeError("dtype cannot be None if 'arbitrary' is a Buffer")
        data = build_column(arbitrary, dtype=dtype)

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        desc = arbitrary.__cuda_array_interface__
        dtype = np.dtype(desc["typestr"])
        data = _data_from_cuda_array_interface_desc(arbitrary)
        mask = _mask_from_cuda_array_interface_desc(arbitrary)
        col = build_column(data, dtype=dtype, mask=mask)
        if np.issubdtype(col.dtype, np.floating):
            if nan_as_null or (mask is None and nan_as_null is None):
                mask = libcudfxx.transform.nans_to_nulls(col.fillna(np.nan))
                col = col.set_mask(mask)
        elif np.issubdtype(col.dtype, np.datetime64):
            if nan_as_null or (mask is None and nan_as_null is None):
                col = utils.time_col_replace_nulls(col)
        return col

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(
                arbitrary)
            children = (
                build_column(data=obuf, dtype="int32"),
                build_column(data=sbuf, dtype="int8"),
            )

            data = string.StringColumn(mask=nbuf,
                                       children=children,
                                       size=pa_size,
                                       offset=pa_offset)

        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = pd.api.types.pandas_dtype(dtype)
            if (type(dtype) == str and dtype == "empty") or dtype is None:
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype())

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            codes = as_column(arbitrary.indices)
            if isinstance(arbitrary.dictionary, pa.NullArray):
                categories = as_column([], dtype="object")
            else:
                categories = as_column(arbitrary.dictionary)
            dtype = CategoricalDtype(categories=categories,
                                     ordered=arbitrary.type.ordered)
            data = categorical.CategoricalColumn(
                dtype=dtype,
                mask=codes.base_mask,
                children=(codes, ),
                size=codes.size,
                offset=codes.offset,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype)

            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            # Date64 input is not supported; the conversion code that used to
            # follow this raise was unreachable and has been dropped.
            raise NotImplementedError
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            # ``np.bool`` was removed in NumPy 1.24; ``np.bool_`` yields the
            # same dtype on every NumPy version.
            dtype = np.dtype(np.bool_)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())

            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        else:
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
                mask=pamask,
                size=pa_size,
                offset=pa_offset,
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = ColumnBase._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool_:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(
                pa.array(np.asarray(arbitrary), from_pandas=True),
                dtype=arbitrary.dtype,
            )
        else:
            data = as_column(
                pa.array(arbitrary, from_pandas=nan_as_null),
                dtype=arbitrary.dtype,
            )

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        length = length or 1
        data = as_column(
            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype))
        if not nan_as_null:
            if np.issubdtype(data.dtype, np.floating):
                data = data.fillna(np.nan)
            elif np.issubdtype(data.dtype, np.datetime64):
                data = data.fillna(np.datetime64("NaT"))

    elif hasattr(arbitrary, "__array_interface__"):
        # CUDF assumes values are always contiguous
        desc = arbitrary.__array_interface__
        shape = desc["shape"]
        arb_dtype = np.dtype(desc["typestr"])
        # CUDF assumes values are always contiguous
        if len(shape) > 1:
            raise ValueError("Data must be 1-dimensional")

        arbitrary = np.asarray(arbitrary)
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arb_dtype.kind == "M":
            # Units coarser than a second are widened to seconds.
            time_unit, _ = np.datetime_data(arbitrary.dtype)
            cast_dtype = time_unit in ("D", "W", "M", "Y")

            if cast_dtype:
                arbitrary = arbitrary.astype(np.dtype("datetime64[s]"))

            buffer = Buffer(arbitrary)
            mask = None
            if nan_as_null:
                data = as_column(buffer,
                                 dtype=arbitrary.dtype,
                                 nan_as_null=nan_as_null)
                data = utils.time_col_replace_nulls(data)
                mask = data.mask

            data = datetime.DatetimeColumn(data=buffer,
                                           mask=mask,
                                           dtype=arbitrary.dtype)
        elif arb_dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary),
                             dtype=arbitrary.dtype)
        else:
            data = as_column(cupy.asarray(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.asarray(arbitrary),
                         dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        # Last resort: try the buffer protocol, then pyarrow, then
        # pandas/numpy conversion.
        try:
            data = as_column(memoryview(arbitrary),
                             dtype=dtype,
                             nan_as_null=nan_as_null)
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        # handled by the ``except`` clause below
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(
                        arbitrary,
                        type=pa_type,
                        from_pandas=True
                        if nan_as_null is None else nan_as_null,
                    ),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    sr = pd.Series(arbitrary, dtype="category")
                    data = as_column(sr, nan_as_null=nan_as_null)
                elif np_type == np.str_:
                    sr = pd.Series(arbitrary, dtype="str")
                    data = as_column(sr, nan_as_null=nan_as_null)
                else:
                    data = as_column(
                        np.asarray(arbitrary, dtype=np.dtype(dtype)),
                        nan_as_null=nan_as_null,
                    )
    return data
def test_datetime(self):
    """datetime64 keeps its (unit, count) metadata and rejects unknown keywords."""
    metadata = ('M', 2)
    stamp = np.datetime64('2000-01', metadata)
    assert np.datetime_data(stamp) == metadata

    # an unexpected keyword argument must be refused
    with pytest.raises(TypeError):
        np.datetime64('2000', garbage=True)
def test_datetime(self):
    """Check datetime64 metadata round-tripping and keyword validation."""
    two_month_step = np.datetime64("2000-01", ("M", 2))
    unit, count = np.datetime_data(two_month_step)
    assert (unit, count) == ("M", 2)

    # constructing with an unknown keyword must raise TypeError
    with pytest.raises(TypeError):
        np.datetime64("2000", garbage=True)