def get_group_index_sorter(group_index, ngroups):
    """
    algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of
    unique values of groupby keys. This can be huge when doing multi-key
    groupby. np.argsort(kind='mergesort') is O(count x log(count)) where
    count is the length of the data-frame.
    Both algorithms are `stable` sorts, which is necessary for correctness
    of groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')
    """
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0   # some room for fine-tuning these parameters
    do_groupsort = (count > 0 and ((alpha + beta * ngroups) <
                                   (count * np.log(count))))
    if do_groupsort:
        sorter, _ = algos.groupsort_indexer(_ensure_int64(group_index),
                                            ngroups)
        return _ensure_platform_int(sorter)
    else:
        return group_index.argsort(kind='mergesort')
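# The cost-model switch above is easiest to see in a standalone sketch.
# This is a hedged illustration, not pandas' implementation (the real
# groupsort_indexer is a Cython counting sort): bucket row positions by
# group id, which is stable and roughly O(ngroups + count).
import numpy as np

def group_sort_indexer_sketch(group_index, ngroups):
    count = len(group_index)
    if count > 0 and 0 < ngroups < count * np.log(count):
        # counting sort: compute each group's start offset, then emit
        # row positions in order of group id
        counts = np.bincount(group_index, minlength=ngroups)
        starts = np.zeros(ngroups, dtype=np.intp)
        starts[1:] = np.cumsum(counts[:-1])
        sorter = np.empty(count, dtype=np.intp)
        pos = starts.copy()
        for i, g in enumerate(group_index):
            sorter[pos[g]] = i   # stable: equal groups keep input order
            pos[g] += 1
        return sorter
    # when ngroups is large relative to count * log(count), fall back to
    # a stable comparison sort, as the real function does
    return np.asarray(group_index).argsort(kind='mergesort')

# group_sort_indexer_sketch(np.array([1, 0, 1, 0]), 2) -> array([1, 3, 0, 2])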
def _make_selectors(self):
    new_levels = self.new_index_levels

    # make the mask
    remaining_labels = self.sorted_labels[:-1]
    level_sizes = [len(x) for x in new_levels]

    comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
    ngroups = len(obs_ids)

    comp_index = _ensure_platform_int(comp_index)
    stride = self.index.levshape[self.level] + self.lift
    self.full_shape = ngroups, stride

    selector = self.sorted_labels[-1] + stride * comp_index + self.lift
    mask = np.zeros(np.prod(self.full_shape), dtype=bool)
    mask.put(selector, True)

    if mask.sum() < len(self.index):
        raise ValueError("Index contains duplicate entries, "
                         "cannot reshape")

    self.group_index = comp_index
    self.mask = mask
    self.unique_groups = obs_ids
    self.compressor = comp_index.searchsorted(np.arange(ngroups))
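# A hedged miniature of the selector arithmetic above (the values here are
# illustrative, not pandas internals): each (group, unstacked-label) pair
# maps to one flat cell of the ngroups x stride result, and any collision
# means duplicate index entries.
import numpy as np

comp_index = np.array([0, 0, 1])    # compressed group ids
labels = np.array([0, 2, 1])        # labels of the level being unstacked
stride, lift = 3, 0                 # width of the unstacked level
selector = labels + stride * comp_index + lift   # -> array([0, 2, 4])
mask = np.zeros(2 * stride, dtype=bool)
mask.put(selector, True)
assert mask.sum() == len(selector)  # a lower sum would signal duplicates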
def get_indexer(self, target, method=None, limit=None, tolerance=None):
    method = missing.clean_reindex_fill_method(method)
    target = ibase._ensure_index(target)

    if self.equals(target):
        return np.arange(len(self), dtype='intp')

    if method == 'pad' or method == 'backfill':
        raise NotImplementedError("method='pad' and method='backfill' not "
                                  "implemented yet for CategoricalIndex")
    elif method == 'nearest':
        raise NotImplementedError("method='nearest' not implemented yet "
                                  "for CategoricalIndex")

    if (isinstance(target, CategoricalIndex) and
            self.values.is_dtype_equal(target)):
        # we have the same codes
        codes = target.codes
    else:
        if isinstance(target, CategoricalIndex):
            target = target.categories
        codes = self.categories.get_indexer(target)

    indexer, _ = self._engine.get_indexer_non_unique(codes)
    return _ensure_platform_int(indexer)
def take(self, indices, axis=0, allow_fill=True,
         fill_value=None, **kwargs):
    nv.validate_take(tuple(), kwargs)
    indices = _ensure_platform_int(indices)
    left, right = self.left, self.right

    if fill_value is None:
        fill_value = self._na_value
    mask = indices == -1

    if not mask.any():
        # nothing to fill, so we won't change the dtype here
        allow_fill = False

    taker = lambda x: x.take(indices, allow_fill=allow_fill,
                             fill_value=fill_value)

    try:
        new_left = taker(left)
        new_right = taker(right)
    except ValueError:
        # we need to coerce; might have NA's in an
        # integer dtype
        new_left = taker(left.astype(float))
        new_right = taker(right.astype(float))

    return self._shallow_copy(new_left, new_right)
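# A hedged usage sketch of the coercion fallback above: taking with a fill
# value from integer-backed intervals forces the int edges to float so the
# NA can be represented. Assumes an IntervalIndex-era pandas where take
# accepts fill_value, per the signature above.
import numpy as np
import pandas as pd

ii = pd.IntervalIndex.from_breaks([0, 1, 2])  # (0, 1], (1, 2] with int edges
ii.take([0, -1], fill_value=np.nan)           # expected: [(0.0, 1.0], NaN]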
def get_indexer(self, target, method=None, limit=None, tolerance=None):
    self._check_method(method)
    target = _ensure_index(target)
    target = self._maybe_cast_indexed(target)

    if self.equals(target):
        return np.arange(len(self), dtype='intp')

    if self.is_non_overlapping_monotonic:
        start, stop = self._find_non_overlapping_monotonic_bounds(target)

        start_plus_one = start + 1
        if not ((start_plus_one < stop).any()):
            return np.where(start_plus_one == stop, start, -1)

    if not self.is_unique:
        raise ValueError("cannot handle non-unique indices")

    # IntervalIndex
    if isinstance(target, IntervalIndex):
        indexer = self._get_reindexer(target)

    # non IntervalIndex
    else:
        indexer = np.concatenate([self.get_loc(i) for i in target])

    return _ensure_platform_int(indexer)
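# Hedged usage of the lookup above with scalar (non-Interval) targets: in a
# unique, non-overlapping index each target hits at most one interval, and
# misses map to -1.
import pandas as pd

ii = pd.IntervalIndex.from_breaks([0, 1, 2, 3])  # (0,1], (1,2], (2,3]
ii.get_indexer([0.5, 1.5, 5.0])                  # expected: array([0, 1, -1])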
def get_group_index_sorter(group_index, ngroups):
    """
    _algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of
    unique values of groupby keys. This can be huge when doing multi-key
    groupby. np.argsort(kind='mergesort') is O(count x log(count)) where
    count is the length of the data-frame.
    Both algorithms are `stable` sorts, which is necessary for correctness
    of groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')
    """
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0   # some room for fine-tuning these parameters
    do_groupsort = (count > 0 and ((alpha + beta * ngroups) <
                                   (count * np.log(count))))
    if do_groupsort:
        sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index),
                                             ngroups)
        return _ensure_platform_int(sorter)
    else:
        return group_index.argsort(kind='mergesort')
def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                   limit=None, takeable=False):
    if level is not None:
        raise TypeError('Reindex by level not supported for sparse')

    if self.index.equals(index):
        if copy:
            return self.copy()
        else:
            return self

    if len(self.index) == 0:
        return SparseDataFrame(index=index, columns=self.columns)

    indexer = self.index.get_indexer(index, method, limit=limit)
    indexer = _ensure_platform_int(indexer)
    mask = indexer == -1
    need_mask = mask.any()

    new_series = {}
    for col, series in self.iteritems():
        if mask.all():
            continue

        values = series.values
        # .take returns SparseArray
        new = values.take(indexer)
        if need_mask:
            new = new.values
            np.putmask(new, mask, fill_value)

        new_series[col] = new

    return SparseDataFrame(new_series, index=index, columns=self.columns,
                           default_fill_value=self._default_fill_value)
def take(self, indices, axis=0, allow_fill=True,
         fill_value=None, **kwargs):
    nv.validate_take(tuple(), kwargs)
    indices = _ensure_platform_int(indices)
    taken = self._assert_take_fillable(self.codes, indices,
                                       allow_fill=allow_fill,
                                       fill_value=fill_value,
                                       na_value=-1)
    return self._create_from_codes(taken)
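# Hedged usage sketch: with allow_fill=True and an explicit fill_value,
# a -1 index marks a missing slot, which _assert_take_fillable encodes as
# code -1, i.e. NaN in the resulting CategoricalIndex.
import numpy as np
import pandas as pd

ci = pd.CategoricalIndex(['a', 'b', 'c'])
ci.take([0, -1], fill_value=np.nan)  # expected: CategoricalIndex(['a', nan], ...)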
def take(self, indices, axis=0, allow_fill=True,
         fill_value=None, **kwargs):
    """
    Sparse-compatible version of ndarray.take

    Returns
    -------
    taken : ndarray
    """
    nv.validate_take(tuple(), kwargs)

    if axis:
        raise ValueError("axis must be 0, input was {0}".format(axis))

    if is_integer(indices):
        # return scalar
        return self[indices]

    indices = _ensure_platform_int(indices)
    n = len(self)
    if allow_fill and fill_value is not None:
        # allow -1 to indicate self.fill_value,
        # self.fill_value may not be NaN
        if (indices < -1).any():
            msg = ('When allow_fill=True and fill_value is not None, '
                   'all indices must be >= -1')
            raise ValueError(msg)
        elif (n <= indices).any():
            msg = 'index is out of bounds for size {0}'
            raise IndexError(msg.format(n))
    else:
        if ((indices < -n) | (n <= indices)).any():
            msg = 'index is out of bounds for size {0}'
            raise IndexError(msg.format(n))

    indices = indices.astype(np.int32)
    if not (allow_fill and fill_value is not None):
        indices = indices.copy()
        indices[indices < 0] += n

    locs = self.sp_index.lookup_array(indices)
    indexer = np.arange(len(locs), dtype=np.int32)
    mask = locs != -1
    if mask.any():
        indexer = indexer[mask]
        new_values = self.sp_values.take(locs[mask])
    else:
        indexer = np.empty(shape=(0, ), dtype=np.int32)
        new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)

    sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
    return self._simple_new(new_values, sp_index, self.fill_value)
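# Hedged usage sketch of the two -1 conventions above: with fill_value=None,
# negative indices follow numpy semantics (count from the end); with an
# explicit fill_value, -1 marks a slot to fill.
import numpy as np
import pandas as pd

arr = pd.SparseArray([1.0, np.nan, 2.0])
arr.take(np.array([0, -1]))                     # expected: [1.0, 2.0]
arr.take(np.array([0, -1]), fill_value=np.nan)  # expected: [1.0, nan]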
def test_transform_fast(self):
    df = DataFrame({'id': np.arange(100000) / 3,
                    'val': np.random.randn(100000)})

    grp = df.groupby('id')['val']

    values = np.repeat(grp.mean().values,
                       _ensure_platform_int(grp.count().values))
    expected = pd.Series(values, index=df.index, name='val')

    result = grp.transform(np.mean)
    assert_series_equal(result, expected)

    result = grp.transform('mean')
    assert_series_equal(result, expected)

    # GH 12737
    df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
                       'd': pd.date_range('2014-1-1', '2014-1-4'),
                       'i': [1, 2, 3, 4]},
                      columns=['grouping', 'f', 'i', 'd'])
    result = df.groupby('grouping').transform('first')

    dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
             pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
    expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
                             'd': dates,
                             'i': [1, 2, 2, 4]},
                            columns=['f', 'i', 'd'])
    assert_frame_equal(result, expected)

    # selection
    result = df.groupby('grouping')[['f', 'i']].transform('first')
    expected = expected[['f', 'i']]
    assert_frame_equal(result, expected)

    # dup columns
    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
    result = df.groupby('g').transform('first')
    expected = df.drop('g', axis=1)
    assert_frame_equal(result, expected)
def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info):
    if mask_info is not None:
        mask, needs_masking = mask_info
    else:
        mask = indexer == -1
        needs_masking = mask.any()
    if arr.dtype != out.dtype:
        arr = arr.astype(out.dtype)
    if arr.shape[axis] > 0:
        arr.take(_ensure_platform_int(indexer), axis=axis, out=out)
    if needs_masking:
        outindexer = [slice(None)] * arr.ndim
        outindexer[axis] = mask
        out[tuple(outindexer)] = fill_value
def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info):
    if mask_info is not None:
        mask, needs_masking = mask_info
    else:
        mask = indexer == -1
        needs_masking = mask.any()
    if arr.dtype != out.dtype:
        arr = arr.astype(out.dtype)
    if arr.shape[axis] > 0:
        arr.take(_ensure_platform_int(indexer), axis=axis, out=out)
    if needs_masking:
        outindexer = [slice(None)] * arr.ndim
        outindexer[axis] = mask
        out[tuple(outindexer)] = fill_value
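# Both helpers above share one pattern: let ndarray.take write into `out`,
# then overwrite the -1 positions with the fill value. A hedged standalone
# demonstration:
import numpy as np

arr = np.array(['x', 'y', 'z'], dtype=object)
indexer = np.array([0, -1, 2])
out = np.empty(3, dtype=object)
arr.take(indexer, axis=0, out=out)  # -1 wraps to the last element for now
out[indexer == -1] = np.nan         # the mask then replaces it with the fill
# out -> array(['x', nan, 'z'], dtype=object)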
def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                   limit=None, takeable=False):
    if level is not None:
        raise TypeError('Reindex by level not supported for sparse')

    if self.index.equals(index):
        if copy:
            return self.copy()
        else:
            return self

    if len(self.index) == 0:
        return self._constructor(
            index=index, columns=self.columns).__finalize__(self)

    indexer = self.index.get_indexer(index, method, limit=limit)
    indexer = _ensure_platform_int(indexer)
    mask = indexer == -1
    need_mask = mask.any()

    new_series = {}
    for col, series in self.iteritems():
        if mask.all():
            continue

        values = series.values
        # .take returns SparseArray
        new = values.take(indexer)
        if need_mask:
            new = new.values

            # convert integer to float if necessary. need to do a lot
            # more than that, handle boolean etc also
            new, fill_value = maybe_upcast(new, fill_value=fill_value)
            np.putmask(new, mask, fill_value)

        new_series[col] = new

    return self._constructor(
        new_series, index=index, columns=self.columns,
        default_fill_value=self._default_fill_value).__finalize__(self)
def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                   limit=None, takeable=False):
    if level is not None:
        raise TypeError('Reindex by level not supported for sparse')

    if self.index.equals(index):
        if copy:
            return self.copy()
        else:
            return self

    if len(self.index) == 0:
        return self._constructor(
            index=index, columns=self.columns).__finalize__(self)

    indexer = self.index.get_indexer(index, method, limit=limit)
    indexer = _ensure_platform_int(indexer)
    mask = indexer == -1
    need_mask = mask.any()

    new_series = {}
    for col, series in self.iteritems():
        if mask.all():
            continue

        values = series.values
        # .take returns SparseArray
        new = values.take(indexer)
        if need_mask:
            new = new.values

            # convert integer to float if necessary. need to do a lot
            # more than that, handle boolean etc also
            new, fill_value = _maybe_upcast(new, fill_value=fill_value)
            np.putmask(new, mask, fill_value)

        new_series[col] = new

    return self._constructor(
        new_series, index=index, columns=self.columns,
        default_fill_value=self._default_fill_value).__finalize__(self)
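# Hedged usage sketch of the reindex path above: rows absent from the old
# index come back as the fill value after the putmask step. Assumes the
# SparseDataFrame-era pandas these snippets come from.
import pandas as pd

sdf = pd.SparseDataFrame({'a': [1.0, 2.0]}, index=[0, 1])
sdf.reindex([0, 1, 2])  # expected: row 2 is NaN (its indexer entry was -1)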
def _make_sorted_values_labels(self):
    v = self.level

    labs = list(self.index.labels)
    levs = list(self.index.levels)
    to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
    sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

    comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
    ngroups = len(obs_ids)

    indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
    indexer = _ensure_platform_int(indexer)

    self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
    self.sorted_labels = [l.take(indexer) for l in to_sort]
def get_indexer(self, target, method=None, limit=None, tolerance=None):
    """
    Compute indexer and mask for new index given the current index. The
    indexer should be then used as an input to ndarray.take to align the
    current data to the new index. The mask determines whether labels are
    found or not in the current index

    Parameters
    ----------
    target : MultiIndex or Index (of tuples)
    method : {'pad', 'ffill', 'backfill', 'bfill'}
        pad / ffill: propagate LAST valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap

    Notes
    -----
    This is a low-level method and probably should be used at your own risk

    Examples
    --------
    >>> indexer, mask = index.get_indexer(new_index)
    >>> new_values = cur_values.take(indexer)
    >>> new_values[-mask] = np.nan

    Returns
    -------
    (indexer, mask) : (ndarray, ndarray)
    """
    method = missing.clean_reindex_fill_method(method)
    target = ibase._ensure_index(target)

    if isinstance(target, CategoricalIndex):
        target = target.categories

    if method == 'pad' or method == 'backfill':
        raise NotImplementedError("method='pad' and method='backfill' not "
                                  "implemented yet for CategoricalIndex")
    elif method == 'nearest':
        raise NotImplementedError("method='nearest' not implemented yet "
                                  "for CategoricalIndex")
    else:
        codes = self.categories.get_indexer(target)

    indexer, _ = self._engine.get_indexer_non_unique(codes)
    return _ensure_platform_int(indexer)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
    method = missing.clean_reindex_fill_method(method)
    target = ibase._ensure_index(target)

    if isinstance(target, CategoricalIndex):
        target = target.categories

    if method == 'pad' or method == 'backfill':
        raise NotImplementedError("method='pad' and method='backfill' not "
                                  "implemented yet for CategoricalIndex")
    elif method == 'nearest':
        raise NotImplementedError("method='nearest' not implemented yet "
                                  "for CategoricalIndex")
    else:
        codes = self.categories.get_indexer(target)

    indexer, _ = self._engine.get_indexer_non_unique(codes)
    return _ensure_platform_int(indexer)
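# Hedged usage: targets are coded against the categories, so anything not
# in the categories comes back as -1.
import pandas as pd

ci = pd.CategoricalIndex(['a', 'b', 'c'])
ci.get_indexer(['b', 'x', 'a'])  # expected: array([1, -1, 0])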
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list_like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError("Only list-like objects are allowed to be passed "
                        "to safe_sort as values")
    values = np.array(values, copy=False)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return _ensure_object(np.concatenate([nums, strs]))

    sorter = None
    if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError("Only list-like objects or None are allowed to be "
                        "passed to safe_sort as labels")
    labels = _ensure_platform_int(np.asarray(labels))

    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types
        (hash_klass, _), values = _get_data_algo(values, _hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = _ensure_platform_int(t.lookup(ordered))

    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = (labels < -len(values)) | (labels >= len(values)) | \
        (labels == na_sentinel)

    # (Out of bound indices will be masked with `na_sentinel` next, so we
    # may deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, _ensure_platform_int(new_labels)
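# Hedged usage sketch, assuming safe_sort is importable from
# pandas.core.algorithms as in the pandas version these snippets target:
# ints sort before strings, and labels are remapped to the new order.
import numpy as np
from pandas.core.algorithms import safe_sort

values = np.array([3, 1, 'b', 'a'], dtype=object)
labels = [0, 1, 2, 3, 0, -1]
ordered, new_labels = safe_sort(values, labels)
# ordered    -> array([1, 3, 'a', 'b'], dtype=object)
# new_labels -> array([1, 0, 3, 2, 1, -1])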
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always
    sorted PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex

    vals = np.asarray(values)

    # localize to UTC
    is_datetimetz_type = is_datetimetz(values)
    if is_datetimetz_type:
        values = DatetimeIndex(values)
        vals = values.tz_localize(None)

    is_datetime = is_datetime64_dtype(vals)
    is_timedelta = is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

    labels = _ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except:
            # unorderable in py3 if mixed str/int
            t = hash_klass(len(uniques))
            t.map_locations(_ensure_object(uniques))

            # order ints before strings
            ordered = np.concatenate([
                np.sort(np.array([e for i, e in enumerate(uniques) if f(e)],
                                 dtype=object))
                for f in [lambda x: not isinstance(x, string_types),
                          lambda x: isinstance(x, string_types)]])
            sorter = _ensure_platform_int(t.lookup(_ensure_object(ordered)))

        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)

    if is_datetimetz_type:
        # reset tz
        uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize(
            values.tz)
    elif is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always
    sorted PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex, PeriodIndex

    # handling two possibilities here
    # - for a numpy datetimelike simply view as i8 then cast back
    # - for an extension datetimelike view as i8 then
    #   reconstruct from boxed values to transfer metadata
    dtype = None
    if needs_i8_conversion(values):
        if is_period_dtype(values):
            values = PeriodIndex(values)
            vals = values.asi8
        elif is_datetimetz(values):
            values = DatetimeIndex(values)
            vals = values.asi8
        else:
            # numpy dtype
            dtype = values.dtype
            vals = values.view(np.int64)
    else:
        vals = np.asarray(values)

    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

    labels = _ensure_platform_int(labels)
    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
                                    assume_unique=True)

    if dtype is not None:
        uniques = uniques.astype(dtype)

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always
    sorted PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex

    vals = np.asarray(values)

    # localize to UTC
    is_datetimetz_type = is_datetimetz(values)
    if is_datetimetz_type:
        values = DatetimeIndex(values)
        vals = values.asi8

    is_datetime = is_datetime64_dtype(vals)
    is_timedelta = is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

    labels = _ensure_platform_int(labels)
    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
                                    assume_unique=True)

    if is_datetimetz_type:
        # reset tz
        uniques = values._shallow_copy(uniques)
    elif is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
def pivot_annual(series, freq=None):
    """
    Deprecated. Use ``pivot_table`` instead.

    Group a series by years, taking leap years into account.

    The output has as many rows as distinct years in the original series,
    and as many columns as the length of a leap year in the units
    corresponding to the original frequency (366 for daily frequency,
    366*24 for hourly...).

    The first column of the output corresponds to Jan. 1st, 00:00:00,
    while the last column corresponds to Dec. 31st, 23:59:59. Entries
    corresponding to Feb. 29th are masked for non-leap years.

    For example, if the initial series has a daily frequency, the 59th column
    of the output always corresponds to Feb. 28th, the 61st column to
    Mar. 1st, and the 60th column is masked for non-leap years. With an
    hourly initial frequency, the (59*24)th column of the output always
    corresponds to Feb. 28th 23:00, the (61*24)th column to Mar. 1st, 00:00,
    and the 24 columns between (59*24) and (61*24) are masked.

    If the original frequency is less than daily, the output is equivalent to
    ``series.convert('A', func=None)``.

    Parameters
    ----------
    series : Series
    freq : string or None, default None

    Returns
    -------
    annual : DataFrame
    """
    msg = "pivot_annual is deprecated. Use pivot_table instead"
    warnings.warn(msg, FutureWarning)

    index = series.index
    year = index.year
    years = nanops.unique1d(year)

    if freq is not None:
        freq = freq.upper()
    else:
        freq = series.index.freq

    if freq == 'D':
        width = 366
        offset = index.dayofyear - 1

        # adjust for leap year
        offset[(~isleapyear(year)) & (offset >= 59)] += 1

        columns = lrange(1, 367)
        # todo: strings like 1/1, 1/25, etc.?
    elif freq in ('M', 'BM'):
        width = 12
        offset = index.month - 1
        columns = lrange(1, 13)
    elif freq == 'H':
        width = 8784
        grouped = series.groupby(series.index.year)
        defaulted = grouped.apply(lambda x: x.reset_index(drop=True))
        defaulted.index = defaulted.index.droplevel(0)
        offset = np.asarray(defaulted.index)
        offset[~isleapyear(year) & (offset >= 1416)] += 24
        columns = lrange(1, 8785)
    else:
        raise NotImplementedError(freq)

    flat_index = (year - years.min()) * width + offset
    flat_index = _ensure_platform_int(flat_index)

    values = np.empty((len(years), width))
    values.fill(np.nan)
    values.put(flat_index, series.values)

    return DataFrame(values, index=years, columns=columns)
def pivot_annual(series, freq=None):
    """
    Deprecated. Use ``pivot_table`` instead.

    Group a series by years, taking leap years into account.

    The output has as many rows as distinct years in the original series,
    and as many columns as the length of a leap year in the units
    corresponding to the original frequency (366 for daily frequency,
    366*24 for hourly...).

    The first column of the output corresponds to Jan. 1st, 00:00:00,
    while the last column corresponds to Dec. 31st, 23:59:59. Entries
    corresponding to Feb. 29th are masked for non-leap years.

    For example, if the initial series has a daily frequency, the 59th column
    of the output always corresponds to Feb. 28th, the 61st column to
    Mar. 1st, and the 60th column is masked for non-leap years. With an
    hourly initial frequency, the (59*24)th column of the output always
    corresponds to Feb. 28th 23:00, the (61*24)th column to Mar. 1st, 00:00,
    and the 24 columns between (59*24) and (61*24) are masked.

    If the original frequency is less than daily, the output is equivalent to
    ``series.convert('A', func=None)``.

    Parameters
    ----------
    series : Series
    freq : string or None, default None

    Returns
    -------
    annual : DataFrame
    """
    msg = "pivot_annual is deprecated. Use pivot_table instead"
    warnings.warn(msg, FutureWarning)

    index = series.index
    year = index.year
    years = algorithms.unique1d(year)

    if freq is not None:
        freq = freq.upper()
    else:
        freq = series.index.freq

    if freq == 'D':
        width = 366
        offset = np.asarray(index.dayofyear) - 1

        # adjust for leap year
        offset[(~isleapyear(year)) & (offset >= 59)] += 1

        columns = lrange(1, 367)
        # todo: strings like 1/1, 1/25, etc.?
    elif freq in ('M', 'BM'):
        width = 12
        offset = np.asarray(index.month) - 1
        columns = lrange(1, 13)
    elif freq == 'H':
        width = 8784
        grouped = series.groupby(series.index.year)
        defaulted = grouped.apply(lambda x: x.reset_index(drop=True))
        defaulted.index = defaulted.index.droplevel(0)
        offset = np.asarray(defaulted.index)
        offset[~isleapyear(year) & (offset >= 1416)] += 24
        columns = lrange(1, 8785)
    else:
        raise NotImplementedError(freq)

    flat_index = (year - years.min()) * width + offset
    flat_index = _ensure_platform_int(flat_index)

    values = np.empty((len(years), width))
    values.fill(np.nan)
    values.put(flat_index, series.values)

    return DataFrame(values, index=years, columns=columns)
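# pivot_annual is deprecated above in favor of pivot_table. A hedged sketch
# of the suggested replacement (note: unlike pivot_annual, this naive
# version does no Feb-29 alignment across leap years):
import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(730),
              index=pd.date_range('2015-01-01', periods=730, freq='D'))
annual = pd.pivot_table(pd.DataFrame({'val': s.values,
                                      'year': s.index.year,
                                      'doy': s.index.dayofyear}),
                        index='year', columns='doy', values='val')
# annual has one row per year and one column per day-of-year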
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always
    sorted PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex, PeriodIndex

    # handling two possibilities here
    # - for a numpy datetimelike simply view as i8 then cast back
    # - for an extension datetimelike view as i8 then
    #   reconstruct from boxed values to transfer metadata
    dtype = None
    if needs_i8_conversion(values):
        if is_period_dtype(values):
            values = PeriodIndex(values)
            vals = values.asi8
        elif is_datetimetz(values):
            values = DatetimeIndex(values)
            vals = values.asi8
        else:
            # numpy dtype
            dtype = values.dtype
            vals = values.view(np.int64)
    else:
        vals = np.asarray(values)

    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    check_nulls = not is_integer_dtype(values)
    labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls)

    labels = _ensure_platform_int(labels)
    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
                                    assume_unique=True)

    if dtype is not None:
        uniques = uniques.astype(dtype)

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
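# The public entry point for all of the factorize variants above is
# pd.factorize. A brief usage example with sort=True: codes index into the
# sorted uniques, and missing values map to na_sentinel (-1).
import numpy as np
import pandas as pd

codes, uniques = pd.factorize(np.array(['b', 'a', 'b', np.nan, 'a'],
                                       dtype=object), sort=True)
# codes   -> array([1, 0, 1, -1, 0])
# uniques -> array(['a', 'b'], dtype=object)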