def _simple_new(cls, values, freq=None, tz=None, **kwargs):
    """
    We require that we have a dtype compat for the values; if we are passed
    a non-dtype compat, then coerce using the constructor.
    """
    if getattr(values, 'dtype', None) is None:
        # empty, but with dtype compat
        if values is None:
            values = np.empty(0, dtype=_NS_DTYPE)
            return cls(values, freq=freq, tz=tz, **kwargs)
        values = np.array(values, copy=False)

    if not is_datetime64_dtype(values):
        values = ensure_int64(values).view(_NS_DTYPE)

    result = object.__new__(cls)
    result._data = values
    result._freq = freq
    tz = timezones.maybe_get_tz(tz)
    result._tz = timezones.tz_standardize(tz)
    return result
def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs):
    nv.validate_take(tuple(), kwargs)
    indices = ensure_int64(indices)

    maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
    if isinstance(maybe_slice, slice):
        return self[maybe_slice]

    taken = self._assert_take_fillable(self.asi8, indices,
                                       allow_fill=allow_fill,
                                       fill_value=fill_value,
                                       na_value=iNaT)

    # keep freq in PeriodArray/Index, reset otherwise
    freq = self.freq if is_period_dtype(self) else None
    return self._shallow_copy(taken, freq=freq)
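# Usage sketch (not from the source): the public Index.take path exercised by
# the method above. With allow_fill=True and an explicit fill_value, -1
# entries in the indexer become missing values (NaT for datetimelike data).
import pandas as pd

idx = pd.date_range("2020-01-01", periods=3, freq="D")
taken = idx.take([0, -1, 2], allow_fill=True, fill_value=pd.NaT)
print(taken)
# DatetimeIndex(['2020-01-01', 'NaT', '2020-01-03'],
#               dtype='datetime64[ns]', freq=None)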
def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull: bool):
    """
    Reconstruct labels from observed group ids.

    Parameters
    ----------
    comp_ids : np.ndarray[np.intp]
    obs_ids : np.ndarray[np.intp]
    shape : tuple[int, ...]
    labels : list of np.ndarray
    xnull : bool
        If nulls are excluded; i.e. -1 labels are passed through.
    """
    if not xnull:
        lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8")
        shape = np.asarray(shape, dtype="i8") + lift

    if not is_int64_overflow_possible(shape):
        # obs ids are deconstructable! take the fast route!
        out = decons_group_index(obs_ids, shape)
        return out if xnull or not lift.any() else [
            x - y for x, y in zip(out, lift)
        ]

    # TODO: unique_label_indices only used here, should take ndarray[np.intp]
    i = unique_label_indices(ensure_int64(comp_ids))
    i8copy = lambda a: a.astype("i8", subok=False, copy=True)
    return [i8copy(lab[i]) for lab in labels]
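# Pure-numpy sketch (illustrative, not the pandas implementation) of the
# "fast route" above: when the shape product fits in int64, a flat group id
# decomposes back into per-level labels by integer div/mod.
import numpy as np

shape = (2, 3)                                  # 2 levels x 3 levels
obs_ids = np.array([0, 4, 5], dtype=np.int64)   # flat ids into the 2x3 product
labels = [obs_ids // shape[1], obs_ids % shape[1]]
print(labels)  # [array([0, 1, 1]), array([0, 1, 2])]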
def get_group_index_sorter(group_index, ngroups):
    """
    algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where

        ngroups = prod(shape)
        shape = map(len, keys)

    that is, linear in the number of combinations (cartesian product) of
    unique values of groupby keys. This can be huge when doing multi-key
    groupby.  np.argsort(kind='mergesort') is O(count x log(count)) where
    count is the length of the data-frame; Both algorithms are `stable` sort
    and that is necessary for correctness of groupby operations.
    e.g. consider:
        df.groupby(key)[col].transform('first')
    """
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0   # some room for fine-tuning these parameters
    do_groupsort = count > 0 and (
        (alpha + beta * ngroups) < (count * np.log(count))
    )
    if do_groupsort:
        sorter, _ = algos.groupsort_indexer(ensure_int64(group_index),
                                            ngroups)
        return ensure_platform_int(sorter)
    else:
        return group_index.argsort(kind="mergesort")
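# Illustrative numpy-only sketch of the cost heuristic above: counting sort
# (groupsort_indexer) wins when ngroups is small relative to count*log(count);
# mergesort is the stable fallback. Stability keeps equal keys in input order.
import numpy as np

group_index = np.array([2, 0, 1, 0, 2], dtype=np.int64)
ngroups, count = 3, len(group_index)
do_groupsort = count > 0 and ngroups < count * np.log(count)
print(do_groupsort)                           # True -> counting sort is cheaper
print(group_index.argsort(kind="mergesort"))  # [1 3 2 0 4], stable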
def _take_preprocess_indexer_and_fill_value(
    arr: np.ndarray,
    indexer: Optional[np.ndarray],
    axis: int,
    out: Optional[np.ndarray],
    fill_value,
    allow_fill: bool,
):
    mask_info = None

    if indexer is None:
        indexer = np.arange(arr.shape[axis], dtype=np.int64)
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        indexer = ensure_int64(indexer, copy=False)
        if not allow_fill:
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False
        else:
            # check for promotion based on types only (do this first because
            # it's faster than computing a mask)
            dtype, fill_value = maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                mask = indexer == -1
                needs_masking = mask.any()
                mask_info = mask, needs_masking
                if needs_masking:
                    if out is not None and out.dtype != dtype:
                        raise TypeError("Incompatible type for fill_value")
                else:
                    # if not, then depromote, set fill_value to dummy
                    # (it won't be used but we don't want the cython code
                    # to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()

    return indexer, dtype, fill_value, mask_info
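# Minimal numpy sketch (assumptions: plain int array, np.nan fill) of the
# promotion logic above: a -1 in the indexer forces a dtype able to hold the
# fill value, so int promotes to float before the take.
import numpy as np

arr = np.array([10, 20, 30])
indexer = np.array([0, -1, 2], dtype=np.int64)
mask = indexer == -1
dtype = np.promote_types(arr.dtype, np.float64) if mask.any() else arr.dtype
out = arr.astype(dtype)[indexer]
out[mask] = np.nan
print(out)  # [10. nan 30.]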
def group_info(self):
    comp_ids, obs_group_ids = self._get_compressed_labels()

    ngroups = len(obs_group_ids)
    comp_ids = ensure_int64(comp_ids)
    return comp_ids, obs_group_ids, ngroups
def get_group_index(
    labels, shape: Shape, sort: bool, xnull: bool
) -> npt.NDArray[np.int64]:
    """
    For the particular label_list, gets the offsets into the hypothetical list
    representing the totally ordered cartesian product of all possible label
    combinations, *as long as* this space fits within int64 bounds; otherwise,
    though group indices identify unique combinations of labels, they cannot
    be deconstructed.
    - If `sort`, rank of returned ids preserve lexical ranks of labels.
      i.e. returned id's can be used to do lexical sort on labels;
    - If `xnull` nulls (-1 labels) are passed through.

    Parameters
    ----------
    labels : sequence of arrays
        Integers identifying levels at each location
    shape : tuple[int, ...]
        Number of unique levels at each location
    sort : bool
        If the ranks of returned ids should match lexical ranks of labels
    xnull : bool
        If true nulls are excluded; i.e. -1 values in the labels are
        passed through.

    Returns
    -------
    An array of type int64 where two elements are equal if their corresponding
    labels are equal at all locations.

    Notes
    -----
    The length of `labels` and `shape` must be identical.
    """

    def _int64_cut_off(shape) -> int:
        acc = 1
        for i, mul in enumerate(shape):
            acc *= int(mul)
            if not acc < lib.i8max:
                return i
        return len(shape)

    def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
        # promote nan values (assigned -1 label in lab array)
        # so that all output values are non-negative
        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

    labels = [ensure_int64(x) for x in labels]
    lshape = list(shape)
    if not xnull:
        for i, (lab, size) in enumerate(zip(labels, shape)):
            lab, size = maybe_lift(lab, size)
            labels[i] = lab
            lshape[i] = size

    labels = list(labels)

    # Iteratively process all the labels in chunks sized so less
    # than lib.i8max unique int ids will be required for each chunk
    while True:
        # how many levels can be done without overflow:
        nlev = _int64_cut_off(lshape)

        # compute flat ids for the first `nlev` levels
        stride = np.prod(lshape[1:nlev], dtype="i8")
        out = stride * labels[0].astype("i8", subok=False, copy=False)

        for i in range(1, nlev):
            if lshape[i] == 0:
                stride = np.int64(0)
            else:
                stride //= lshape[i]
            out += labels[i] * stride

        if xnull:  # exclude nulls
            mask = labels[0] == -1
            for lab in labels[1:nlev]:
                mask |= lab == -1
            out[mask] = -1

        if nlev == len(lshape):  # all levels done!
            break

        # compress what has been done so far in order to avoid overflow
        # to retain lexical ranks, obs_ids should be sorted
        comp_ids, obs_ids = compress_group_index(out, sort=sort)

        labels = [comp_ids] + labels[nlev:]
        lshape = [len(obs_ids)] + lshape[nlev:]

    return out
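# Usage sketch against pandas' private pandas.core.sorting module (internal
# API; the import path may differ across versions). Each returned id is the
# offset into the 2x2 cartesian product: label0 * 2 + label1.
import numpy as np
from pandas.core.sorting import get_group_index

labels = [np.array([0, 1, 1, 0], dtype=np.int64),
          np.array([0, 0, 1, 1], dtype=np.int64)]
print(get_group_index(labels, shape=(2, 2), sort=True, xnull=True))
# [0 2 3 1]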
def _bins_to_cuts(
    x,
    bins,
    right: bool = True,
    labels=None,
    precision: int = 3,
    include_lowest: bool = False,
    dtype=None,
    duplicates: str = "raise",
):
    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, "
            "valid options are: raise, drop"
        )

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = Categorical.from_codes(ids, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == "raise":
            raise ValueError(
                f"Bin edges must be unique: {repr(bins)}.\n"
                f"You can drop duplicate edges by setting the 'duplicates' kwarg"
            )
        else:
            bins = unique_bins

    side = "left" if right else "right"
    ids = ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument"
            )
        elif labels is None:
            labels = _format_labels(
                bins, precision, right=right,
                include_lowest=include_lowest, dtype=dtype
            )
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )
        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)
    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
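# Usage sketch (not from the source): pd.cut is the public wrapper that routes
# through _bins_to_cuts. With right=True the bins are closed on the right.
import pandas as pd

out = pd.cut([1, 4, 7, 9], bins=[0, 3, 6, 9], right=True)
print(out)
# [(0, 3], (3, 6], (6, 9], (6, 9]]
# Categories (3, interval[int64, right]): [(0, 3] < (3, 6] < (6, 9]]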
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
    """
    Parameters
    ----------
    data : list-like
    copy : bool, default False
    unit : str, default "ns"
        The timedelta unit to treat integers as multiples of.
    errors : {"raise", "coerce", "ignore"}, default "raise"
        How to handle elements that cannot be converted to timedelta64[ns].
        See ``pandas.to_timedelta`` for details.

    Returns
    -------
    converted : numpy.ndarray
        The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
    inferred_freq : Tick or None
        The inferred frequency of the sequence.

    Raises
    ------
    ValueError : Data cannot be converted to timedelta64[ns].

    Notes
    -----
    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` will not cause
    errors to be ignored; they are caught and subsequently ignored at a
    higher level.
    """
    inferred_freq = None
    unit = parse_timedelta_unit(unit)

    # Unwrap whatever we have into a np.ndarray
    if not hasattr(data, 'dtype'):
        # e.g. list, tuple
        if np.ndim(data) == 0:
            # i.e. generator
            data = list(data)
        data = np.array(data, copy=False)
    elif isinstance(data, ABCSeries):
        data = data._values
    elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)):
        inferred_freq = data.freq
        data = data._data

    # Convert whatever we have into timedelta64[ns] dtype
    if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
        # no need to make a copy, need to convert if string-dtyped
        data = objects_to_td64ns(data, unit=unit, errors=errors)
        copy = False

    elif is_integer_dtype(data.dtype):
        # treat as multiples of the given unit
        data, copy_made = ints_to_td64ns(data, unit=unit)
        copy = copy and not copy_made

    elif is_float_dtype(data.dtype):
        # treat as multiples of the given unit.  If after converting to nanos,
        # there are fractional components left, these are truncated
        # (i.e. NOT rounded)
        mask = np.isnan(data)
        coeff = np.timedelta64(1, unit) / np.timedelta64(1, 'ns')
        data = (coeff * data).astype(np.int64).view('timedelta64[ns]')
        data[mask] = iNaT
        copy = False

    elif is_timedelta64_dtype(data.dtype):
        if data.dtype != _TD_DTYPE:
            # non-nano unit
            # TODO: watch out for overflows
            data = data.astype(_TD_DTYPE)
            copy = False

    elif is_datetime64_dtype(data):
        # GH#23539
        warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is "
                      "deprecated, will raise a TypeError in a future "
                      "version",
                      FutureWarning, stacklevel=4)
        data = ensure_int64(data).view(_TD_DTYPE)

    else:
        raise TypeError("dtype {dtype} cannot be converted to "
                        "timedelta64[ns]".format(dtype=data.dtype))

    data = np.array(data, copy=copy)
    assert data.dtype == 'm8[ns]', data
    return data, inferred_freq
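# Usage sketch (not from the source): pd.to_timedelta is the public entry
# point that funnels into this converter; integers are multiples of `unit`.
import pandas as pd

print(pd.to_timedelta([1, 2], unit="s"))
# TimedeltaIndex(['0 days 00:00:01', '0 days 00:00:02'],
#                dtype='timedelta64[ns]', freq=None)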
def _reindex_indexer(
    self: T,
    new_axis,
    indexer,
    axis: int,
    fill_value=None,
    allow_dups: bool = False,
    copy: bool = True,
) -> T:
    """
    Parameters
    ----------
    new_axis : Index
    indexer : ndarray of int64 or None
    axis : int
    fill_value : object, default None
    allow_dups : bool, default False
    copy : bool, default True

    pandas-indexer with -1's only.
    """
    if indexer is None:
        if new_axis is self._axes[axis] and not copy:
            return self

        result = self.copy(deep=copy)
        result._axes = list(self._axes)
        result._axes[axis] = new_axis
        return result

    # some axes don't allow reindexing with dups
    if not allow_dups:
        self._axes[axis]._validate_can_reindex(indexer)

    if axis >= self.ndim:
        raise IndexError("Requested axis not found in manager")

    if axis == 1:
        new_arrays = []
        for i in indexer:
            if i == -1:
                arr = self._make_na_array(fill_value=fill_value)
            else:
                arr = self.arrays[i]
            new_arrays.append(arr)
    else:
        validate_indices(indexer, len(self._axes[0]))
        indexer = ensure_int64(indexer)
        if (indexer == -1).any():
            allow_fill = True
        else:
            allow_fill = False
        new_arrays = [
            take_1d(
                arr,
                indexer,
                allow_fill=allow_fill,
                fill_value=fill_value,
                # if fill_value is not None else blk.fill_value
            )
            for arr in self.arrays
        ]

    new_axes = list(self._axes)
    new_axes[axis] = new_axis

    return type(self)(new_arrays, new_axes, verify_integrity=False)
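# Usage sketch (not from the source): DataFrame.reindex builds an indexer with
# -1 for labels absent from the old axis and fills those rows, which is the
# allow_fill branch above.
import pandas as pd

df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])
print(df.reindex(["x", "z"], fill_value=0))
#    a
# x  1
# z  0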
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
    """
    Parameters
    ----------
    data : list-like
    copy : bool, default False
    unit : str, default "ns"
        The timedelta unit to treat integers as multiples of.
    errors : {"raise", "coerce", "ignore"}, default "raise"
        How to handle elements that cannot be converted to timedelta64[ns].
        See ``pandas.to_timedelta`` for details.

    Returns
    -------
    converted : numpy.ndarray
        The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
    inferred_freq : Tick or None
        The inferred frequency of the sequence.

    Raises
    ------
    ValueError : Data cannot be converted to timedelta64[ns].

    Notes
    -----
    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` will not cause
    errors to be ignored; they are caught and subsequently ignored at a
    higher level.
    """
    inferred_freq = None
    unit = parse_timedelta_unit(unit)

    # Unwrap whatever we have into a np.ndarray
    if not hasattr(data, "dtype"):
        # e.g. list, tuple
        if np.ndim(data) == 0:
            # i.e. generator
            data = list(data)
        data = np.array(data, copy=False)
    elif isinstance(data, ABCSeries):
        data = data._values
    elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)):
        inferred_freq = data.freq
        data = data._data

    # Convert whatever we have into timedelta64[ns] dtype
    if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
        # no need to make a copy, need to convert if string-dtyped
        data = objects_to_td64ns(data, unit=unit, errors=errors)
        copy = False

    elif is_integer_dtype(data.dtype):
        # treat as multiples of the given unit
        data, copy_made = ints_to_td64ns(data, unit=unit)
        copy = copy and not copy_made

    elif is_float_dtype(data.dtype):
        # cast the unit, multiply base/frac separately
        # to avoid precision issues from float -> int
        mask = np.isnan(data)
        m, p = precision_from_unit(unit)
        base = data.astype(np.int64)
        frac = data - base
        if p:
            frac = np.round(frac, p)
        data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
        data[mask] = iNaT
        copy = False

    elif is_timedelta64_dtype(data.dtype):
        if data.dtype != _TD_DTYPE:
            # non-nano unit
            # TODO: watch out for overflows
            data = data.astype(_TD_DTYPE)
            copy = False

    elif is_datetime64_dtype(data):
        # GH#23539
        warnings.warn(
            "Passing datetime64-dtype data to TimedeltaIndex is "
            "deprecated, will raise a TypeError in a future "
            "version",
            FutureWarning,
            stacklevel=4,
        )
        data = ensure_int64(data).view(_TD_DTYPE)

    else:
        raise TypeError("dtype {dtype} cannot be converted to "
                        "timedelta64[ns]".format(dtype=data.dtype))

    data = np.array(data, copy=copy)
    if data.ndim != 1:
        raise ValueError("Only 1-dimensional input arrays are supported.")

    assert data.dtype == "m8[ns]", data
    return data, inferred_freq
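# Numpy-only sketch of the base/frac split above, assuming m=10**9, p=9 as
# precision_from_unit would plausibly return for unit="s" (an assumption, not
# taken from the source). Scaling the integer base and the fraction separately
# avoids float precision loss when the values are large.
import numpy as np

data = np.array([1.5, 2.000000001])
m, p = 10 ** 9, 9
base = data.astype(np.int64)
frac = np.round(data - base, p)
ns = base * m + (frac * m).astype(np.int64)
print(ns)  # [1500000000 2000000001]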
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None, duplicates='raise'):

    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = algos.take_nd(bins, ids)
        result = Categorical(result, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {bins!r}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(bins=bins))
        else:
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            labels = _format_labels(bins, precision, right=right,
                                    include_lowest=include_lowest,
                                    dtype=dtype)
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)
    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
def _generate_range(cls, start, end, periods, freq, tz=None,
                    normalize=False, ambiguous='raise', closed=None):
    if com.count_not_none(start, end, periods, freq) != 3:
        raise ValueError('Of the four parameters: start, end, periods, '
                         'and freq, exactly three must be specified')

    freq = to_offset(freq)

    if start is not None:
        start = Timestamp(start)

    if end is not None:
        end = Timestamp(end)

    if start is None and end is None:
        if closed is not None:
            raise ValueError("Closed has to be None if not both of start "
                             "and end are defined")

    left_closed, right_closed = dtl.validate_endpoints(closed)

    start, end, _normalized = _maybe_normalize_endpoints(start, end,
                                                         normalize)

    tz, _ = _infer_tz_from_endpoints(start, end, tz)

    if tz is not None:
        # Localize the start and end arguments
        start = _maybe_localize_point(
            start, getattr(start, 'tz', None), start, freq, tz
        )
        end = _maybe_localize_point(
            end, getattr(end, 'tz', None), end, freq, tz
        )

    if start and end:
        # Make sure start and end have the same tz
        start = _maybe_localize_point(
            start, start.tz, end.tz, freq, tz
        )
        end = _maybe_localize_point(
            end, end.tz, start.tz, freq, tz
        )

    if freq is not None:
        if cls._use_cached_range(freq, _normalized, start, end):
            # Currently always False; never hit
            # Should be reimplemented as a part of GH 17914
            index = cls._cached_range(start, end, periods=periods,
                                      freq=freq)
        else:
            index = _generate_regular_range(cls, start, end, periods, freq)

        if tz is not None and getattr(index, 'tz', None) is None:
            arr = conversion.tz_localize_to_utc(
                ensure_int64(index.values), tz, ambiguous=ambiguous)

            index = cls(arr)

            # index is localized datetime64 array -> have to convert
            # start/end as well to compare
            if start is not None:
                start = start.tz_localize(tz).asm8
            if end is not None:
                end = end.tz_localize(tz).asm8
    else:
        # Create a linearly spaced date_range in local time
        arr = np.linspace(start.value, end.value, periods)
        index = cls._simple_new(
            arr.astype('M8[ns]', copy=False), freq=None, tz=tz
        )

    if not left_closed and len(index) and index[0] == start:
        index = index[1:]
    if not right_closed and len(index) and index[-1] == end:
        index = index[:-1]

    return cls._simple_new(index.values, freq=freq, tz=tz)
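# Usage sketch (not from the source): pd.date_range wraps _generate_range;
# exactly three of start/end/periods/freq must be given.
import pandas as pd

print(pd.date_range(start="2020-01-01", periods=3, freq="D"))
# DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'],
#               dtype='datetime64[ns]', freq='D')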
def __new__(cls, data=None, unit=None, freq=None, start=None, end=None,
            periods=None, closed=None, dtype=None, copy=False,
            name=None, verify_integrity=True):

    if isinstance(data, TimedeltaIndex) and freq is None and name is None:
        if copy:
            return data.copy()
        else:
            return data._shallow_copy()

    freq, freq_infer = dtl.maybe_infer_freq(freq)

    if data is None:
        # TODO: Remove this block and associated kwargs; GH#20535
        result = cls._generate_range(start, end, periods, freq,
                                     closed=closed)
        result.name = name
        return result

    if unit is not None:
        data = to_timedelta(data, unit=unit, box=False)

    if is_scalar(data):
        raise ValueError('TimedeltaIndex() must be called with a '
                         'collection of some kind, {data} was passed'
                         .format(data=repr(data)))

    # convert if not already
    if getattr(data, 'dtype', None) != _TD_DTYPE:
        data = to_timedelta(data, unit=unit, box=False)
    elif copy:
        data = np.array(data, copy=True)

    data = np.array(data, copy=False)
    if data.dtype == np.object_:
        data = array_to_timedelta64(data)
    if data.dtype != _TD_DTYPE:
        if is_timedelta64_dtype(data):
            # non-nano unit
            # TODO: watch out for overflows
            data = data.astype(_TD_DTYPE)
        else:
            data = ensure_int64(data).view(_TD_DTYPE)

    assert data.dtype == 'm8[ns]', data.dtype

    subarr = cls._simple_new(data, name=name, freq=freq)

    # check that we are matching freqs
    if verify_integrity and len(subarr) > 0:
        if freq is not None and not freq_infer:
            cls._validate_frequency(subarr, freq)

    if freq_infer:
        inferred = subarr.inferred_freq
        if inferred:
            subarr.freq = to_offset(inferred)

    return subarr
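# Usage sketch (not from the source): constructing a TimedeltaIndex from
# integers with a unit, as handled by the `unit is not None` branch above.
# (Newer pandas deprecates the `unit` kwarg here in favor of pd.to_timedelta.)
import pandas as pd

print(pd.TimedeltaIndex([1, 2, 3], unit="D"))
# TimedeltaIndex(['1 days', '2 days', '3 days'],
#                dtype='timedelta64[ns]', freq=None)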
def __init__(self, data, labels, ngroups, axis=0):
    self.data = data
    self.labels = ensure_int64(labels)
    self.ngroups = ngroups

    self.axis = axis
def _generate_range(cls, start, end, periods, freq, tz=None, normalize=False, ambiguous='raise', closed=None): if com.count_not_none(start, end, periods, freq) != 3: raise ValueError('Of the four parameters: start, end, periods, ' 'and freq, exactly three must be specified') freq = to_offset(freq) if start is not None: start = Timestamp(start) if end is not None: end = Timestamp(end) if start is None and end is None: if closed is not None: raise ValueError("Closed has to be None if not both of start" "and end are defined") left_closed, right_closed = dtl.validate_endpoints(closed) start, end, _normalized = _maybe_normalize_endpoints( start, end, normalize) tz, inferred_tz = _infer_tz_from_endpoints(start, end, tz) if hasattr(freq, 'delta') and freq != Day(): # sub-Day Tick if inferred_tz is None and tz is not None: # naive dates if start is not None and start.tz is None: start = start.tz_localize(tz, ambiguous=False) if end is not None and end.tz is None: end = end.tz_localize(tz, ambiguous=False) if start and end: if start.tz is None and end.tz is not None: start = start.tz_localize(end.tz, ambiguous=False) if end.tz is None and start.tz is not None: end = end.tz_localize(start.tz, ambiguous=False) if cls._use_cached_range(freq, _normalized, start, end): index = cls._cached_range(start, end, periods=periods, freq=freq) else: index = _generate_regular_range(cls, start, end, periods, freq) else: if tz is not None: # naive dates if start is not None and start.tz is not None: start = start.replace(tzinfo=None) if end is not None and end.tz is not None: end = end.replace(tzinfo=None) if start and end: if start.tz is None and end.tz is not None: end = end.replace(tzinfo=None) if end.tz is None and start.tz is not None: start = start.replace(tzinfo=None) if freq is not None: if cls._use_cached_range(freq, _normalized, start, end): index = cls._cached_range(start, end, periods=periods, freq=freq) else: index = _generate_regular_range(cls, start, end, periods, freq) if tz is not None and getattr(index, 'tz', None) is None: arr = conversion.tz_localize_to_utc(ensure_int64( index.values), tz, ambiguous=ambiguous) index = cls(arr) # index is localized datetime64 array -> have to convert # start/end as well to compare if start is not None: start = start.tz_localize(tz).asm8 if end is not None: end = end.tz_localize(tz).asm8 else: # Create a linearly spaced date_range in local time start = start.tz_localize(tz) end = end.tz_localize(tz) arr = np.linspace(start.value, end.value, periods) index = cls._simple_new(arr.astype('M8[ns]'), freq=None, tz=tz) if not left_closed and len(index) and index[0] == start: index = index[1:] if not right_closed and len(index) and index[-1] == end: index = index[:-1] return cls._simple_new(index.values, freq=freq, tz=tz)
def func2(arr, indexer, out, fill_value=np.nan):
    # NOTE: `axis` and `mask_info` are not parameters here; this appears to
    # be a closure that captures them from the enclosing scope (the take_nd
    # dispatch machinery).
    indexer = ensure_int64(indexer)
    _take_nd_object(
        arr, indexer, out, axis=axis, fill_value=fill_value,
        mask_info=mask_info
    )