def get_indexer_non_unique(self, target): target = ibase._ensure_index(target) if isinstance(target, CategoricalIndex): # Indexing on codes is more efficient if categories are the same: if target.categories is self.categories: target = target.codes indexer, missing = self._engine.get_indexer_non_unique(target) return _ensure_platform_int(indexer), missing target = target.values codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) return _ensure_platform_int(indexer), missing
def get_indexer_non_unique(self, target): target = ibase._ensure_index(target) if isinstance(target, CategoricalIndex): # Indexing on codes is more efficient if categories are the same: if target.categories is self.categories: target = target.codes indexer, missing = self._engine.get_indexer_non_unique(target) return _ensure_platform_int(indexer), missing target = target.values codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) return _ensure_platform_int(indexer), missing
def _make_selectors(self): new_levels = self.new_index_levels # make the mask remaining_labels = self.sorted_labels[:-1] level_sizes = [len(x) for x in new_levels] comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) ngroups = len(obs_ids) comp_index = _ensure_platform_int(comp_index) stride = self.index.levshape[self.level] + self.lift self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index + self.lift mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) if mask.sum() < len(self.index): raise ValueError('Index contains duplicate entries, ' 'cannot reshape') self.group_index = comp_index self.mask = mask self.unique_groups = obs_ids self.compressor = comp_index.searchsorted(np.arange(ngroups))
def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = _ensure_platform_int(indices) left, right = self.left, self.right if fill_value is None: fill_value = self._na_value mask = indices == -1 if not mask.any(): # we won't change dtype here in this case # if we don't need allow_fill = False taker = lambda x: x.take(indices, allow_fill=allow_fill, fill_value=fill_value) try: new_left = taker(left) new_right = taker(right) except ValueError: # we need to coerce; migth have NA's in an # integer dtype new_left = taker(left.astype(float)) new_right = taker(right.astype(float)) return self._shallow_copy(new_left, new_right)
def get_group_index_sorter(group_index, ngroups): """ algos.groupsort_indexer implements `counting sort` and it is at least O(ngroups), where ngroups = prod(shape) shape = map(len, keys) that is, linear in the number of combinations (cartesian product) of unique values of groupby keys. This can be huge when doing multi-key groupby. np.argsort(kind='mergesort') is O(count x log(count)) where count is the length of the data-frame; Both algorithms are `stable` sort and that is necessary for correctness of groupby operations. e.g. consider: df.groupby(key)[col].transform('first') """ count = len(group_index) alpha = 0.0 # taking complexities literally; there may be beta = 1.0 # some room for fine-tuning these parameters do_groupsort = (count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))) if do_groupsort: sorter, _ = algos.groupsort_indexer(_ensure_int64(group_index), ngroups) return _ensure_platform_int(sorter) else: return group_index.argsort(kind='mergesort')
def _make_selectors(self): new_levels = self.new_index_levels # make the mask remaining_labels = self.sorted_labels[:-1] level_sizes = [len(x) for x in new_levels] comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) ngroups = len(obs_ids) comp_index = _ensure_platform_int(comp_index) stride = self.index.levshape[self.level] + self.lift self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index + self.lift mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) if mask.sum() < len(self.index): raise ValueError('Index contains duplicate entries, ' 'cannot reshape') self.group_index = comp_index self.mask = mask self.unique_groups = obs_ids self.compressor = comp_index.searchsorted(np.arange(ngroups))
def get_indexer(self, target, method=None, limit=None, tolerance=None): self._check_method(method) target = _ensure_index(target) target = self._maybe_cast_indexed(target) if self.equals(target): return np.arange(len(self), dtype='intp') if self.is_non_overlapping_monotonic: start, stop = self._find_non_overlapping_monotonic_bounds(target) start_plus_one = start + 1 if not ((start_plus_one < stop).any()): return np.where(start_plus_one == stop, start, -1) if not self.is_unique: raise ValueError("cannot handle non-unique indices") # IntervalIndex if isinstance(target, IntervalIndex): indexer = self._get_reindexer(target) # non IntervalIndex else: indexer = np.concatenate([self.get_loc(i) for i in target]) return _ensure_platform_int(indexer)
def get_indexer(self, target, method=None, limit=None, tolerance=None): self._check_method(method) target = _ensure_index(target) target = self._maybe_cast_indexed(target) if self.equals(target): return np.arange(len(self), dtype='intp') if self.is_non_overlapping_monotonic: start, stop = self._find_non_overlapping_monotonic_bounds(target) start_plus_one = start + 1 if not ((start_plus_one < stop).any()): return np.where(start_plus_one == stop, start, -1) if not self.is_unique: raise ValueError("cannot handle non-unique indices") # IntervalIndex if isinstance(target, IntervalIndex): indexer = self._get_reindexer(target) # non IntervalIndex else: indexer = np.concatenate([self.get_loc(i) for i in target]) return _ensure_platform_int(indexer)
def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = _ensure_platform_int(indices) left, right = self.left, self.right if fill_value is None: fill_value = self._na_value mask = indices == -1 if not mask.any(): # we won't change dtype here in this case # if we don't need allow_fill = False taker = lambda x: x.take(indices, allow_fill=allow_fill, fill_value=fill_value) try: new_left = taker(left) new_right = taker(right) except ValueError: # we need to coerce; migth have NA's in an # integer dtype new_left = taker(left.astype(float)) new_right = taker(right.astype(float)) return self._shallow_copy(new_left, new_right)
def get_group_index_sorter(group_index, ngroups): """ algos.groupsort_indexer implements `counting sort` and it is at least O(ngroups), where ngroups = prod(shape) shape = map(len, keys) that is, linear in the number of combinations (cartesian product) of unique values of groupby keys. This can be huge when doing multi-key groupby. np.argsort(kind='mergesort') is O(count x log(count)) where count is the length of the data-frame; Both algorithms are `stable` sort and that is necessary for correctness of groupby operations. e.g. consider: df.groupby(key)[col].transform('first') """ count = len(group_index) alpha = 0.0 # taking complexities literally; there may be beta = 1.0 # some room for fine-tuning these parameters do_groupsort = (count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))) if do_groupsort: sorter, _ = algos.groupsort_indexer(_ensure_int64(group_index), ngroups) return _ensure_platform_int(sorter) else: return group_index.argsort(kind='mergesort')
def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) if self.is_unique and self.equals(target): return np.arange(len(self), dtype='intp') if method == 'pad' or method == 'backfill': raise NotImplementedError("method='pad' and method='backfill' not " "implemented yet for CategoricalIndex") elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for CategoricalIndex') if (isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target)): # we have the same codes codes = target.codes else: if isinstance(target, CategoricalIndex): code_indexer = self.categories.get_indexer(target.categories) codes = take_1d(code_indexer, target.codes, fill_value=-1) else: codes = self.categories.get_indexer(target) indexer, _ = self._engine.get_indexer_non_unique(codes) return _ensure_platform_int(indexer)
def get_indexer(self, target, method=None, limit=None, tolerance=None): from pandas.core.arrays.categorical import _recode_for_categories method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) if self.is_unique and self.equals(target): return np.arange(len(self), dtype='intp') if method == 'pad' or method == 'backfill': raise NotImplementedError("method='pad' and method='backfill' not " "implemented yet for CategoricalIndex") elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for CategoricalIndex') if (isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target)): if self.values.equals(target.values): # we have the same codes codes = target.codes else: codes = _recode_for_categories(target.codes, target.categories, self.values.categories) else: if isinstance(target, CategoricalIndex): code_indexer = self.categories.get_indexer(target.categories) codes = take_1d(code_indexer, target.codes, fill_value=-1) else: codes = self.categories.get_indexer(target) indexer, _ = self._engine.get_indexer_non_unique(codes) return _ensure_platform_int(indexer)
def take(self, indexer, allow_fill=True, fill_value=None): mask = indexer == -1 indexer = _ensure_platform_int(indexer) out = self.values.take(indexer) out[mask] = self._na_value return type(self)(out)
def get_indexer_non_unique(self, target): target = ibase._ensure_index(target) if isinstance(target, CategoricalIndex): target = target.categories codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) return _ensure_platform_int(indexer), missing
def get_indexer_non_unique(self, target): target = ibase._ensure_index(target) if isinstance(target, CategoricalIndex): target = target.categories codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) return _ensure_platform_int(indexer), missing
def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = _ensure_platform_int(indices) taken = self._assert_take_fillable(self.codes, indices, allow_fill=allow_fill, fill_value=fill_value, na_value=-1) return self._create_from_codes(taken)
def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): """ Sparse-compatible version of ndarray.take Returns ------- taken : ndarray """ nv.validate_take(tuple(), kwargs) if axis: raise ValueError( "axis must be 0, input was {axis}".format(axis=axis)) if is_integer(indices): # return scalar return self[indices] indices = _ensure_platform_int(indices) n = len(self) if allow_fill and fill_value is not None: # allow -1 to indicate self.fill_value, # self.fill_value may not be NaN if (indices < -1).any(): msg = ('When allow_fill=True and fill_value is not None, ' 'all indices must be >= -1') raise ValueError(msg) elif (n <= indices).any(): msg = 'index is out of bounds for size {size}'.format(size=n) raise IndexError(msg) else: if ((indices < -n) | (n <= indices)).any(): msg = 'index is out of bounds for size {size}'.format(size=n) raise IndexError(msg) indices = indices.astype(np.int32) if not (allow_fill and fill_value is not None): indices = indices.copy() indices[indices < 0] += n locs = self.sp_index.lookup_array(indices) indexer = np.arange(len(locs), dtype=np.int32) mask = locs != -1 if mask.any(): indexer = indexer[mask] new_values = self.sp_values.take(locs[mask]) else: indexer = np.empty(shape=(0, ), dtype=np.int32) new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype) sp_index = _make_index(len(indices), indexer, kind=self.sp_index) return self._simple_new(new_values, sp_index, self.fill_value)
def test_transform_fast(self): df = DataFrame({ 'id': np.arange(100000) / 3, 'val': np.random.randn(100000) }) grp = df.groupby('id')['val'] values = np.repeat(grp.mean().values, _ensure_platform_int(grp.count().values)) expected = pd.Series(values, index=df.index, name='val') result = grp.transform(np.mean) assert_series_equal(result, expected) result = grp.transform('mean') assert_series_equal(result, expected) # GH 12737 df = pd.DataFrame( { 'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5], 'd': pd.date_range('2014-1-1', '2014-1-4'), 'i': [1, 2, 3, 4] }, columns=['grouping', 'f', 'i', 'd']) result = df.groupby('grouping').transform('first') dates = [ pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4') ] expected = pd.DataFrame( { 'f': [1.1, 2.1, 2.1, 4.5], 'd': dates, 'i': [1, 2, 2, 4] }, columns=['f', 'i', 'd']) assert_frame_equal(result, expected) # selection result = df.groupby('grouping')[['f', 'i']].transform('first') expected = expected[['f', 'i']] assert_frame_equal(result, expected) # dup columns df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a']) result = df.groupby('g').transform('first') expected = df.drop('g', axis=1) assert_frame_equal(result, expected)
def get_group_levels(self): if not self.compressed and len(self.groupings) == 1: return [self.groupings[0].result_index] name_list = [] for ping, labels in zip(self.groupings, self.recons_labels): labels = _ensure_platform_int(labels) levels = ping.result_index.take(labels) name_list.append(levels) return name_list
def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): """ Sparse-compatible version of ndarray.take Returns ------- taken : ndarray """ nv.validate_take(tuple(), kwargs) if axis: raise ValueError("axis must be 0, input was {axis}" .format(axis=axis)) if is_integer(indices): # return scalar return self[indices] indices = _ensure_platform_int(indices) n = len(self) if allow_fill and fill_value is not None: # allow -1 to indicate self.fill_value, # self.fill_value may not be NaN if (indices < -1).any(): msg = ('When allow_fill=True and fill_value is not None, ' 'all indices must be >= -1') raise ValueError(msg) elif (n <= indices).any(): msg = 'index is out of bounds for size {size}'.format(size=n) raise IndexError(msg) else: if ((indices < -n) | (n <= indices)).any(): msg = 'index is out of bounds for size {size}'.format(size=n) raise IndexError(msg) indices = indices.astype(np.int32) if not (allow_fill and fill_value is not None): indices = indices.copy() indices[indices < 0] += n locs = self.sp_index.lookup_array(indices) indexer = np.arange(len(locs), dtype=np.int32) mask = locs != -1 if mask.any(): indexer = indexer[mask] new_values = self.sp_values.take(locs[mask]) else: indexer = np.empty(shape=(0, ), dtype=np.int32) new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype) sp_index = _make_index(len(indices), indexer, kind=self.sp_index) return self._simple_new(new_values, sp_index, self.fill_value)
def take(self, indexer, allow_fill=True, fill_value=None): indexer = np.asarray(indexer) mask = indexer == -1 # take on empty array not handled as desired by numpy in case of -1 if not len(self) and mask.all(): return type(self)([self._na_value] * len(indexer)) indexer = _ensure_platform_int(indexer) out = self.values.take(indexer) out[mask] = self._na_value return type(self)(out)
def group_info(self): ngroups = self.ngroups obs_group_ids = np.arange(ngroups) rep = np.diff(np.r_[0, self.bins]) rep = _ensure_platform_int(rep) if ngroups == len(self.bins): comp_ids = np.repeat(np.arange(ngroups), rep) else: comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) return comp_ids.astype('int64', copy=False), \ obs_group_ids.astype('int64', copy=False), ngroups
def size(self): """ Compute group sizes """ ids, _, ngroup = self.group_info ids = _ensure_platform_int(ids) if ngroup: out = np.bincount(ids[ids != -1], minlength=ngroup) else: out = ids return Series(out, index=self.result_index, dtype='int64')
def _reindex_index(self, index, method, copy, level, fill_value=np.nan, limit=None, takeable=False): if level is not None: raise TypeError('Reindex by level not supported for sparse') if self.index.equals(index): if copy: return self.copy() else: return self if len(self.index) == 0: return self._constructor(index=index, columns=self.columns).__finalize__(self) indexer = self.index.get_indexer(index, method, limit=limit) indexer = _ensure_platform_int(indexer) mask = indexer == -1 need_mask = mask.any() new_series = {} for col, series in self.iteritems(): if mask.all(): continue values = series.values # .take returns SparseArray new = values.take(indexer) if need_mask: new = new.values # convert integer to float if necessary. need to do a lot # more than that, handle boolean etc also new, fill_value = maybe_upcast(new, fill_value=fill_value) np.putmask(new, mask, fill_value) new_series[col] = new return self._constructor( new_series, index=index, columns=self.columns, default_fill_value=self._default_fill_value).__finalize__(self)
def _make_sorted_values_labels(self): v = self.level labs = list(self.index.labels) levs = list(self.index.levels) to_sort = labs[:v] + labs[v + 1:] + [labs[v]] sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]] comp_index, obs_ids = get_compressed_ids(to_sort, sizes) ngroups = len(obs_ids) indexer = _algos.groupsort_indexer(comp_index, ngroups)[0] indexer = _ensure_platform_int(indexer) self.sorted_values = algos.take_nd(self.values, indexer, axis=0) self.sorted_labels = [l.take(indexer) for l in to_sort]
def _make_sorted_values_labels(self): v = self.level labs = list(self.index.labels) levs = list(self.index.levels) to_sort = labs[:v] + labs[v + 1:] + [labs[v]] sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]] comp_index, obs_ids = get_compressed_ids(to_sort, sizes) ngroups = len(obs_ids) indexer = _algos.groupsort_indexer(comp_index, ngroups)[0] indexer = _ensure_platform_int(indexer) self.sorted_values = algos.take_nd(self.values, indexer, axis=0) self.sorted_labels = [l.take(indexer) for l in to_sort]
def test_transform_fast(self): df = DataFrame({'id': np.arange(100000) / 3, 'val': np.random.randn(100000)}) grp = df.groupby('id')['val'] values = np.repeat(grp.mean().values, _ensure_platform_int(grp.count().values)) expected = pd.Series(values, index=df.index, name='val') result = grp.transform(np.mean) assert_series_equal(result, expected) result = grp.transform('mean') assert_series_equal(result, expected) # GH 12737 df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5], 'd': pd.date_range('2014-1-1', '2014-1-4'), 'i': [1, 2, 3, 4]}, columns=['grouping', 'f', 'i', 'd']) result = df.groupby('grouping').transform('first') dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')] expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5], 'd': dates, 'i': [1, 2, 2, 4]}, columns=['f', 'i', 'd']) assert_frame_equal(result, expected) # selection result = df.groupby('grouping')[['f', 'i']].transform('first') expected = expected[['f', 'i']] assert_frame_equal(result, expected) # dup columns df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a']) result = df.groupby('g').transform('first') expected = df.drop('g', axis=1) assert_frame_equal(result, expected)
def _reindex_index(self, index, method, copy, level, fill_value=np.nan, limit=None, takeable=False): if level is not None: raise TypeError('Reindex by level not supported for sparse') if self.index.equals(index): if copy: return self.copy() else: return self if len(self.index) == 0: return self._constructor( index=index, columns=self.columns).__finalize__(self) indexer = self.index.get_indexer(index, method, limit=limit) indexer = _ensure_platform_int(indexer) mask = indexer == -1 need_mask = mask.any() new_series = {} for col, series in self.iteritems(): if mask.all(): continue values = series.values # .take returns SparseArray new = values.take(indexer) if need_mask: new = new.values # convert integer to float if necessary. need to do a lot # more than that, handle boolean etc also new, fill_value = maybe_upcast(new, fill_value=fill_value) np.putmask(new, mask, fill_value) new_series[col] = new return self._constructor( new_series, index=index, columns=self.columns, default_fill_value=self._default_fill_value).__finalize__(self)
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. Safe for use with mixed types (int, str), orders ints before strs. .. versionadded:: 0.19.0 Parameters ---------- values : list-like Sequence; must be unique if ``labels`` is not None. labels : list_like Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. na_sentinel : int, default -1 Value in ``labels`` to mark "not found". Ignored when ``labels`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up the calculation. Ignored when ``labels`` is None. Returns ------- ordered : ndarray Sorted ``values`` new_labels : ndarray Reordered ``labels``; returned when ``labels`` is not None. Raises ------ TypeError * If ``values`` is not list-like or if ``labels`` is neither None nor list-like * If ``values`` cannot be sorted ValueError * If ``labels`` is not None and ``values`` contain duplicates. """ if not is_list_like(values): raise TypeError("Only list-like objects are allowed to be passed to" "safe_sort as values") if not isinstance(values, np.ndarray): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) def sort_mixed(values): # order ints before strings, safe in py3 str_pos = np.array([isinstance(x, string_types) for x in values], dtype=bool) nums = np.sort(values[~str_pos]) strs = np.sort(values[str_pos]) return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None if PY3 and lib.infer_dtype(values) == 'mixed-integer': # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: # try this anyway ordered = sort_mixed(values) # labels: if labels is None: return ordered if not is_list_like(labels): raise TypeError("Only list-like objects or None are allowed to be" "passed to safe_sort as labels") labels = _ensure_platform_int(np.asarray(labels)) from pandas import Index if not assume_unique and not Index(values).is_unique: raise ValueError("values should be unique if labels is not None") if sorter is None: # mixed types (hash_klass, _), values = algorithms._get_data_algo( values, algorithms._hashtables) t = hash_klass(len(values)) t.map_locations(values) sorter = _ensure_platform_int(t.lookup(ordered)) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = (labels < -len(values)) | (labels >= len(values)) | \ (labels == na_sentinel) # (Out of bound indices will be masked with `na_sentinel` next, so we may # deal with them here without performance loss using `mode='wrap'`.) new_labels = reverse_indexer.take(labels, mode='wrap') np.putmask(new_labels, mask, na_sentinel) return ordered, _ensure_platform_int(new_labels)
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. Safe for use with mixed types (int, str), orders ints before strs. .. versionadded:: 0.19.0 Parameters ---------- values : list-like Sequence; must be unique if ``labels`` is not None. labels : list_like Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. na_sentinel : int, default -1 Value in ``labels`` to mark "not found". Ignored when ``labels`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up the calculation. Ignored when ``labels`` is None. Returns ------- ordered : ndarray Sorted ``values`` new_labels : ndarray Reordered ``labels``; returned when ``labels`` is not None. Raises ------ TypeError * If ``values`` is not list-like or if ``labels`` is neither None nor list-like * If ``values`` cannot be sorted ValueError * If ``labels`` is not None and ``values`` contain duplicates. """ if not is_list_like(values): raise TypeError("Only list-like objects are allowed to be passed to" "safe_sort as values") if not isinstance(values, np.ndarray): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) def sort_mixed(values): # order ints before strings, safe in py3 str_pos = np.array([isinstance(x, string_types) for x in values], dtype=bool) nums = np.sort(values[~str_pos]) strs = np.sort(values[str_pos]) return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None if PY3 and lib.infer_dtype(values) == 'mixed-integer': # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: # try this anyway ordered = sort_mixed(values) # labels: if labels is None: return ordered if not is_list_like(labels): raise TypeError("Only list-like objects or None are allowed to be" "passed to safe_sort as labels") labels = _ensure_platform_int(np.asarray(labels)) from pandas import Index if not assume_unique and not Index(values).is_unique: raise ValueError("values should be unique if labels is not None") if sorter is None: # mixed types (hash_klass, _), values = algorithms._get_data_algo(values, algorithms._hashtables) t = hash_klass(len(values)) t.map_locations(values) sorter = _ensure_platform_int(t.lookup(ordered)) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = (labels < -len(values)) | (labels >= len(values)) | \ (labels == na_sentinel) # (Out of bound indices will be masked with `na_sentinel` next, so we may # deal with them here without performance loss using `mode='wrap'`.) new_labels = reverse_indexer.take(labels, mode='wrap') np.putmask(new_labels, mask, na_sentinel) return ordered, _ensure_platform_int(new_labels)
def pivot_annual(series, freq=None): """ Deprecated. Use ``pivot_table`` instead. Group a series by years, taking leap years into account. The output has as many rows as distinct years in the original series, and as many columns as the length of a leap year in the units corresponding to the original frequency (366 for daily frequency, 366*24 for hourly...). The fist column of the output corresponds to Jan. 1st, 00:00:00, while the last column corresponds to Dec, 31st, 23:59:59. Entries corresponding to Feb. 29th are masked for non-leap years. For example, if the initial series has a daily frequency, the 59th column of the output always corresponds to Feb. 28th, the 61st column to Mar. 1st, and the 60th column is masked for non-leap years. With a hourly initial frequency, the (59*24)th column of the output always correspond to Feb. 28th 23:00, the (61*24)th column to Mar. 1st, 00:00, and the 24 columns between (59*24) and (61*24) are masked. If the original frequency is less than daily, the output is equivalent to ``series.convert('A', func=None)``. Parameters ---------- series : Series freq : string or None, default None Returns ------- annual : DataFrame """ msg = "pivot_annual is deprecated. Use pivot_table instead" warnings.warn(msg, FutureWarning) index = series.index year = index.year years = algorithms.unique1d(year) if freq is not None: freq = freq.upper() else: freq = series.index.freq if freq == 'D': width = 366 offset = np.asarray(index.dayofyear) - 1 # adjust for leap year offset[(~isleapyear(year)) & (offset >= 59)] += 1 columns = lrange(1, 367) # todo: strings like 1/1, 1/25, etc.? elif freq in ('M', 'BM'): width = 12 offset = np.asarray(index.month) - 1 columns = lrange(1, 13) elif freq == 'H': width = 8784 grouped = series.groupby(series.index.year) defaulted = grouped.apply(lambda x: x.reset_index(drop=True)) defaulted.index = defaulted.index.droplevel(0) offset = np.asarray(defaulted.index) offset[~isleapyear(year) & (offset >= 1416)] += 24 columns = lrange(1, 8785) else: raise NotImplementedError(freq) flat_index = (year - years.min()) * width + offset flat_index = _ensure_platform_int(flat_index) values = np.empty((len(years), width)) values.fill(np.nan) values.put(flat_index, series.values) return DataFrame(values, index=years, columns=columns)