def test_1d_bool(self): arr = np.array([0, 1, 0], dtype=bool) result = algos.take_1d(arr, [0, 2, 2, 1]) expected = arr.take([0, 2, 2, 1]) tm.assert_numpy_array_equal(result, expected) result = algos.take_1d(arr, [0, 2, -1]) assert result.dtype == np.object_
def wrapper(left, right, name=name, na_op=na_op): if isinstance(right, pd.DataFrame): return NotImplemented time_converted = _TimeOp.maybe_convert_for_time_op(left, right, name, na_op) if time_converted is None: lvalues, rvalues = left, right dtype = None wrap_results = lambda x: x elif time_converted is NotImplemented: return NotImplemented else: left, right = time_converted.left, time_converted.right lvalues, rvalues = time_converted.lvalues, time_converted.rvalues dtype = time_converted.dtype wrap_results = time_converted.wrap_results na_op = time_converted.na_op if isinstance(rvalues, ABCSeries): rindex = getattr(rvalues, 'index', rvalues) name = _maybe_match_name(left, rvalues) lvalues = getattr(lvalues, 'values', lvalues) rvalues = getattr(rvalues, 'values', rvalues) if left.index.equals(rindex): index = left.index else: index, lidx, ridx = left.index.join(rindex, how='outer', return_indexers=True) if lidx is not None: lvalues = algos.take_1d(lvalues, lidx) if ridx is not None: rvalues = algos.take_1d(rvalues, ridx) arr = na_op(lvalues, rvalues) return left._constructor(wrap_results(arr), index=index, name=name, dtype=dtype) else: # scalars if (hasattr(lvalues, 'values') and not isinstance(lvalues, pd.DatetimeIndex)): lvalues = lvalues.values return left._constructor(wrap_results(na_op(lvalues, rvalues)), index=left.index, name=left.name, dtype=dtype)
def _maybe_add_join_keys(self, result, left_indexer, right_indexer): # insert group keys keys = zip(self.join_names, self.left_on, self.right_on) for i, (name, lname, rname) in enumerate(keys): if not _should_fill(lname, rname): continue if name in result: key_indexer = result.columns.get_loc(name) if left_indexer is not None and right_indexer is not None: if name in self.left: if len(self.left) == 0: continue na_indexer = (left_indexer == -1).nonzero()[0] if len(na_indexer) == 0: continue right_na_indexer = right_indexer.take(na_indexer) result.iloc[na_indexer, key_indexer] = ( algos.take_1d(self.right_join_keys[i], right_na_indexer)) elif name in self.right: if len(self.right) == 0: continue na_indexer = (right_indexer == -1).nonzero()[0] if len(na_indexer) == 0: continue left_na_indexer = left_indexer.take(na_indexer) result.iloc[na_indexer, key_indexer] = ( algos.take_1d(self.left_join_keys[i], left_na_indexer)) elif left_indexer is not None \ and isinstance(self.left_join_keys[i], np.ndarray): if name is None: name = 'key_%d' % i # a faster way? key_col = algos.take_1d(self.left_join_keys[i], left_indexer) na_indexer = (left_indexer == -1).nonzero()[0] right_na_indexer = right_indexer.take(na_indexer) key_col.put(na_indexer, algos.take_1d(self.right_join_keys[i], right_na_indexer)) result.insert(i, name, key_col)
def test_1d_fill_nonna(self, dtype_fill_out_dtype): dtype, fill_value, out_dtype = dtype_fill_out_dtype data = np.random.randint(0, 2, 4).astype(dtype) indexer = [2, 1, 0, -1] result = algos.take_1d(data, indexer, fill_value=fill_value) assert ((result[[0, 1, 2]] == data[[2, 1, 0]]).all()) assert (result[3] == fill_value) assert (result.dtype == out_dtype) indexer = [2, 1, 0, 1] result = algos.take_1d(data, indexer, fill_value=fill_value) assert ((result[[0, 1, 2, 3]] == data[indexer]).all()) assert (result.dtype == dtype)
def _delegate_property_get(self, name): from pandas import Series result = getattr(self.values, name) # maybe need to upcast (ints) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result result = np.asarray(result) # blow up if we operate on categories if self.orig is not None: result = take_1d(result, self.orig.cat.codes) # return the result as a Series, which is by definition a copy result = Series(result, index=self.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result.is_copy = ("modifications to a property of a datetimelike " "object are not supported and are discarded. " "Change values on the original.") return result
def get_indexer(self, target, method=None, limit=None, tolerance=None): from pandas.core.arrays.categorical import _recode_for_categories method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) if self.is_unique and self.equals(target): return np.arange(len(self), dtype='intp') if method == 'pad' or method == 'backfill': raise NotImplementedError("method='pad' and method='backfill' not " "implemented yet for CategoricalIndex") elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for CategoricalIndex') if (isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target)): if self.values.equals(target.values): # we have the same codes codes = target.codes else: codes = _recode_for_categories(target.codes, target.categories, self.values.categories) else: if isinstance(target, CategoricalIndex): code_indexer = self.categories.get_indexer(target.categories) codes = take_1d(code_indexer, target.codes, fill_value=-1) else: codes = self.categories.get_indexer(target) indexer, _ = self._engine.get_indexer_non_unique(codes) return _ensure_platform_int(indexer)
def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, limit=None, copy=False, allow_dups=False): if method is not None or limit is not None: raise NotImplementedError("cannot reindex with a method or limit " "with sparse") if fill_value is None: fill_value = np.nan index, row_indexer = reindexers.get(0, (None, None)) columns, col_indexer = reindexers.get(1, (None, None)) if columns is None: columns = self.columns new_arrays = {} for col in columns: if col not in self: continue if row_indexer is not None: new_arrays[col] = algos.take_1d(self[col].get_values(), row_indexer, fill_value=fill_value) else: new_arrays[col] = self[col] return self._constructor(new_arrays, index=index, columns=columns).__finalize__(self)
def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): if columns is None: columns = _get_objs_combined_axis(data, sort=False) indexer_cache = {} aligned_values = [] for s in data: index = getattr(s, 'index', None) if index is None: index = ibase.default_index(len(s)) if id(index) in indexer_cache: indexer = indexer_cache[id(index)] else: indexer = indexer_cache[id(index)] = index.get_indexer(columns) values = com.values_from_object(s) aligned_values.append(algorithms.take_1d(values, indexer)) values = np.vstack(aligned_values) if values.dtype == np.object_: content = list(values.T) return _convert_object_array(content, columns, dtype=dtype, coerce_float=coerce_float) else: return values.T, columns
def test_1d_other_dtypes(self): arr = np.random.randn(10).astype(np.float32) indexer = [1, 2, 3, -1] result = algos.take_1d(arr, indexer) expected = arr.take(indexer) expected[-1] = np.nan tm.assert_almost_equal(result, expected)
def _take_new_index(obj, indexer, new_index, axis=0): from pandas.core.api import Series, DataFrame if isinstance(obj, Series): new_values = algos.take_1d(obj.values, indexer) return Series(new_values, index=new_index, name=obj.name) elif isinstance(obj, DataFrame): if axis == 1: raise NotImplementedError("axis 1 is not supported") return DataFrame(obj._data.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1)) else: raise ValueError("'obj' should be either a Series or a DataFrame")
def test_1d_with_out(self, dtype_can_hold_na, writeable): dtype, can_hold_na = dtype_can_hold_na data = np.random.randint(0, 2, 4).astype(dtype) data.flags.writeable = writeable indexer = [2, 1, 0, 1] out = np.empty(4, dtype=dtype) algos.take_1d(data, indexer, out=out) expected = data.take(indexer) tm.assert_almost_equal(out, expected) indexer = [2, 1, 0, -1] out = np.empty(4, dtype=dtype) if can_hold_na: algos.take_1d(data, indexer, out=out) expected = data.take(indexer) expected[3] = np.nan tm.assert_almost_equal(out, expected) else: with pytest.raises(TypeError, match=self.fill_error): algos.take_1d(data, indexer, out=out) # No Exception otherwise. data.take(indexer, out=out)
def union_categoricals(to_union): """ Combine list-like of Categoricals, unioning categories. All must have the same dtype, and none can be ordered. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categoricals Returns ------- Categorical A single array, categories will be ordered as they appear in the list Raises ------ TypeError If any of the categoricals are ordered or all do not have the same dtype ValueError Emmpty list of categoricals passed """ from pandas import Index, Categorical if len(to_union) == 0: raise ValueError("No Categoricals to union") first = to_union[0] if any(c.ordered for c in to_union): raise TypeError("Can only combine unordered Categoricals") if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype) for c in to_union): raise TypeError("dtype of categories must be the same") cats = first.categories unique_cats = cats.append([c.categories for c in to_union[1:]]).unique() categories = Index(unique_cats) new_codes = [] for c in to_union: if len(c.categories) > 0: indexer = categories.get_indexer(c.categories) new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) else: # must be all NaN new_codes.append(c.codes) new_codes = np.concatenate(new_codes) return Categorical(new_codes, categories=categories, ordered=False, fastpath=True)
def _test_dtype(dtype, can_hold_na, writeable=True): data = np.random.randint(0, 2, 4).astype(dtype) data.flags.writeable = writeable indexer = [2, 1, 0, 1] out = np.empty(4, dtype=dtype) algos.take_1d(data, indexer, out=out) expected = data.take(indexer) tm.assert_almost_equal(out, expected) indexer = [2, 1, 0, -1] out = np.empty(4, dtype=dtype) if can_hold_na: algos.take_1d(data, indexer, out=out) expected = data.take(indexer) expected[3] = np.nan tm.assert_almost_equal(out, expected) else: with tm.assertRaisesRegexp(TypeError, self.fill_error): algos.take_1d(data, indexer, out=out) # no exception o/w data.take(indexer, out=out)
def parallel_take1d(): take_1d(df["col"].values, indexer)
def _map_values(self, mapper, na_action=None): """An internal function that maps values using the input correspondence (which can be a dict, Series, or function). Parameters ---------- mapper : function, dict, or Series The input correspondence object na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function Returns ------- applied : Union[Index, MultiIndex], inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element a MultiIndex will be returned. """ # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types if isinstance(mapper, dict): if hasattr(mapper, '__missing__'): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). dict_with_default = mapper mapper = lambda x: dict_with_default[x] else: # Dictionary does not have a default. Thus it's safe to # convert to an Series for efficiency. # we specify the keys here to handle the # possibility that they are tuples from pandas import Series mapper = Series(mapper) if isinstance(mapper, ABCSeries): # Since values were input this means we came from either # a dict or a series and mapper should be an index if is_extension_type(self.dtype): values = self._values else: values = self.values indexer = mapper.index.get_indexer(values) new_values = algorithms.take_1d(mapper._values, indexer) return new_values # we must convert to python types if is_extension_type(self.dtype): values = self._values if na_action is not None: raise NotImplementedError map_f = lambda values, f: values.map(f) else: values = self.astype(object) values = getattr(values, 'values', values) if na_action == 'ignore': def map_f(values, f): return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) else: map_f = lib.map_infer # mapper is a function new_values = map_f(values, mapper) return new_values
def union_categoricals(to_union, sort_categories=False, ignore_order=False): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categorical, CategoricalIndex, or Series with dtype='category' sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. ignore_order: boolean, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. .. versionadded:: 0.20.0 Returns ------- result : Categorical Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError Empty list of categoricals passed Notes ----- To learn more about categories, see `link <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__ Examples -------- >>> from pandas.api.types import union_categoricals If you want to combine categoricals that do not necessarily have the same categories, `union_categoricals` will combine a list-like of categoricals. The new categories will be the union of the categories being combined. >>> a = pd.Categorical(["b", "c"]) >>> b = pd.Categorical(["a", "b"]) >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] By default, the resulting categories will be ordered as they appear in the `categories` of the data. If you want the categories to be lexsorted, use `sort_categories=True` argument. >>> union_categoricals([a, b], sort_categories=True) [b, c, a, b] Categories (3, object): [a, b, c] `union_categoricals` also works with the case of combining two categoricals of the same categories and order information (e.g. what you could also `append` for). >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "a"], ordered=True) >>> union_categoricals([a, b]) [a, b, a, b, a] Categories (2, object): [a < b] Raises `TypeError` because the categories are ordered and not identical. >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "c"], ordered=True) >>> union_categoricals([a, b]) TypeError: to union ordered Categoricals, all categories must be the same New in version 0.20.0 Ordered categoricals with different categories or orderings can be combined by using the `ignore_ordered=True` argument. >>> a = pd.Categorical(["a", "b", "c"], ordered=True) >>> b = pd.Categorical(["c", "b", "a"], ordered=True) >>> union_categoricals([a, b], ignore_order=True) [a, b, c, c, b, a] Categories (3, object): [a, b, c] `union_categoricals` also works with a `CategoricalIndex`, or `Series` containing categorical data, but note that the resulting array will always be a plain `Categorical` >>> a = pd.Series(["b", "c"], dtype='category') >>> b = pd.Series(["a", "b"], dtype='category') >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] """ from pandas import Index, Categorical, CategoricalIndex, Series from pandas.core.categorical import _recode_for_categories if len(to_union) == 0: raise ValueError('No Categoricals to union') def _maybe_unwrap(x): if isinstance(x, (CategoricalIndex, Series)): return x.values elif isinstance(x, Categorical): return x else: raise TypeError("all components to combine must be Categorical") to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] if not all( is_dtype_equal(other.categories.dtype, first.categories.dtype) for other in to_union[1:]): raise TypeError("dtype of categories must be the same") ordered = False if all(first.is_dtype_equal(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered new_codes = np.concatenate([c.codes for c in to_union]) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with " "ordered Categoricals") if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) from pandas.core.algorithms import take_1d new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = Index(cats.unique()) if sort_categories: categories = categories.sort_values() new_codes = [] for c in to_union: new_codes.append( _recode_for_categories(c.codes, c.categories, categories)) new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") raise TypeError(msg) else: raise TypeError('Categorical.ordered must be the same') if ignore_order: ordered = False return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
def _map_values(self, mapper, na_action=None): """ An internal function that maps values using the input correspondence (which can be a dict, Series, or function). Parameters ---------- mapper : function, dict, or Series The input correspondence object na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function Returns ------- Union[Index, MultiIndex], inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element a MultiIndex will be returned. """ # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types if is_dict_like(mapper): if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). dict_with_default = mapper mapper = lambda x: dict_with_default[x] else: # Dictionary does not have a default. Thus it's safe to # convert to an Series for efficiency. # we specify the keys here to handle the # possibility that they are tuples # The return value of mapping with an empty mapper is # expected to be pd.Series(np.nan, ...). As np.nan is # of dtype float64 the return value of this method should # be float64 as well mapper = create_series_with_explicit_dtype( mapper, dtype_if_empty=np.float64 ) if isinstance(mapper, ABCSeries): # Since values were input this means we came from either # a dict or a series and mapper should be an index if is_categorical_dtype(self._values): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values return self._values.map(mapper) if is_extension_array_dtype(self.dtype): values = self._values else: values = self.values indexer = mapper.index.get_indexer(values) new_values = algorithms.take_1d(mapper._values, indexer) return new_values # we must convert to python types if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"): # GH#23179 some EAs do not have `map` values = self._values if na_action is not None: raise NotImplementedError map_f = lambda values, f: values.map(f) else: values = self.astype(object) values = getattr(values, "values", values) if na_action == "ignore": def map_f(values, f): return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) else: map_f = lib.map_infer # mapper is a function new_values = map_f(values, mapper) return new_values
def parallel_take1d(): take_1d(df['col'].values, indexer)
def union_categoricals(to_union, sort_categories=False, ignore_order=False): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categorical, CategoricalIndex, or Series with dtype='category' sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. ignore_order: boolean, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. .. versionadded:: 0.20.0 Returns ------- result : Categorical Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError Empty list of categoricals passed """ from pandas import Index, Categorical, CategoricalIndex, Series if len(to_union) == 0: raise ValueError('No Categoricals to union') def _maybe_unwrap(x): if isinstance(x, (CategoricalIndex, Series)): return x.values elif isinstance(x, Categorical): return x else: raise TypeError("all components to combine must be Categorical") to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] if not all( is_dtype_equal(other.categories.dtype, first.categories.dtype) for other in to_union[1:]): raise TypeError("dtype of categories must be the same") ordered = False if all(first.is_dtype_equal(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered new_codes = np.concatenate([c.codes for c in to_union]) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with " "ordered Categoricals") if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = Index(cats.unique()) if sort_categories: categories = categories.sort_values() new_codes = [] for c in to_union: if len(c.categories) > 0: indexer = categories.get_indexer(c.categories) new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) else: # must be all NaN new_codes.append(c.codes) new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") raise TypeError(msg) else: raise TypeError('Categorical.ordered must be the same') if ignore_order: ordered = False return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
def get_level_values(num): unique = vals.levels[num] # .values labels = vals.labels[num] filled = algos.take_1d(unique._values, labels, fill_value=unique._na_value) return filled
def _get_mgr_concatenation_plan(mgr, indexers): """ Construct concatenation plan for given block manager and indexers. Parameters ---------- mgr : BlockManager indexers : dict of {axis: indexer} Returns ------- plan : list of (BlockPlacement, JoinUnit) tuples """ # Calculate post-reindex shape , save for item axis which will be separate # for each block anyway. mgr_shape_list = list(mgr.shape) for ax, indexer in indexers.items(): mgr_shape_list[ax] = len(indexer) mgr_shape = tuple(mgr_shape_list) if 0 in indexers: ax0_indexer = indexers.pop(0) blknos = algos.take_1d(mgr.blknos, ax0_indexer, fill_value=-1) blklocs = algos.take_1d(mgr.blklocs, ax0_indexer, fill_value=-1) else: if mgr._is_single_block: blk = mgr.blocks[0] return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] ax0_indexer = None blknos = mgr.blknos blklocs = mgr.blklocs plan = [] for blkno, placements in libinternals.get_blkno_placements(blknos, group=False): assert placements.is_slice_like join_unit_indexers = indexers.copy() shape_list = list(mgr_shape) shape_list[0] = len(placements) shape = tuple(shape_list) if blkno == -1: unit = JoinUnit(None, shape) else: blk = mgr.blocks[blkno] ax0_blk_indexer = blklocs[placements.indexer] unit_no_ax0_reindexing = ( len(placements) == len(blk.mgr_locs) and # Fastpath detection of join unit not # needing to reindex its block: no ax0 # reindexing took place and block # placement was sequential before. ((ax0_indexer is None and blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1) or # Slow-ish detection: all indexer locs # are sequential (and length match is # checked above). (np.diff(ax0_blk_indexer) == 1).all())) # Omit indexer if no item reindexing is required. if unit_no_ax0_reindexing: join_unit_indexers.pop(0, None) else: join_unit_indexers[0] = ax0_blk_indexer unit = JoinUnit(blk, shape, join_unit_indexers) plan.append((placements, unit)) return plan
def union_categoricals(to_union, sort_categories=False, ignore_order=False): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categorical, CategoricalIndex, or Series with dtype='category' sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. ignore_order: boolean, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. .. versionadded:: 0.20.0 Returns ------- result : Categorical Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError Empty list of categoricals passed """ from pandas import Index, Categorical, CategoricalIndex, Series if len(to_union) == 0: raise ValueError('No Categoricals to union') def _maybe_unwrap(x): if isinstance(x, (CategoricalIndex, Series)): return x.values elif isinstance(x, Categorical): return x else: raise TypeError("all components to combine must be Categorical") to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype) for other in to_union[1:]): raise TypeError("dtype of categories must be the same") ordered = False if all(first.is_dtype_equal(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered new_codes = np.concatenate([c.codes for c in to_union]) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with " "ordered Categoricals") if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = Index(cats.unique()) if sort_categories: categories = categories.sort_values() new_codes = [] for c in to_union: if len(c.categories) > 0: indexer = categories.get_indexer(c.categories) new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) else: # must be all NaN new_codes.append(c.codes) new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") raise TypeError(msg) else: raise TypeError('Categorical.ordered must be the same') if ignore_order: ordered = False return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
def _map_values(self, mapper, na_action=None): """ An internal function that maps values using the input correspondence (which can be a dict, Series, or function). Parameters ---------- mapper : function, dict, or Series The input correspondence object na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function Returns ------- Union[Index, MultiIndex], inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element a MultiIndex will be returned. """ # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types if is_dict_like(mapper): if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). dict_with_default = mapper mapper = lambda x: dict_with_default[x] else: # Dictionary does not have a default. Thus it's safe to # convert to an Series for efficiency. # we specify the keys here to handle the # possibility that they are tuples # The return value of mapping with an empty mapper is # expected to be pd.Series(np.nan, ...). As np.nan is # of dtype float64 the return value of this method should # be float64 as well mapper = create_series_with_explicit_dtype( mapper, dtype_if_empty=np.float64) if isinstance(mapper, ABCSeries): # Since values were input this means we came from either # a dict or a series and mapper should be an index if is_categorical_dtype(self.dtype): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values # pandas\core\base.py:893: error: Incompatible types in # assignment (expression has type "Categorical", variable has # type "IndexOpsMixin") [assignment] self = cast("Categorical", self) # type: ignore[assignment] # pandas\core\base.py:894: error: Item "ExtensionArray" of # "Union[ExtensionArray, Any]" has no attribute "map" # [union-attr] return self._values.map(mapper) # type: ignore[union-attr] values = self._values indexer = mapper.index.get_indexer(values) new_values = algorithms.take_1d(mapper._values, indexer) return new_values # we must convert to python types if is_extension_array_dtype(self.dtype) and hasattr( self._values, "map"): # GH#23179 some EAs do not have `map` values = self._values if na_action is not None: raise NotImplementedError map_f = lambda values, f: values.map(f) else: # pandas\core\base.py:1142: error: "IndexOpsMixin" has no attribute # "astype" [attr-defined] values = self.astype(object)._values # type: ignore[attr-defined] if na_action == "ignore": map_f = lambda values, f: lib.map_infer_mask( values, f, isna(values).view(np.uint8)) elif na_action is None: map_f = lib.map_infer else: msg = ("na_action must either be 'ignore' or None, " f"{na_action} was passed") raise ValueError(msg) # mapper is a function new_values = map_f(values, mapper) return new_values
def _get_ilevel_values(index, level): # accept level number only unique = index.levels[level] level_codes = index.codes[level] filled = take_1d(unique._values, level_codes, fill_value=unique._na_value) return unique._shallow_copy(filled, name=index.names[level])
def union_categoricals(to_union, sort_categories=False, ignore_order=False): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categorical, CategoricalIndex, or Series with dtype='category' sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. ignore_order: boolean, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. .. versionadded:: 0.20.0 Returns ------- result : Categorical Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError Empty list of categoricals passed Notes ----- To learn more about categories, see `link <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__ Examples -------- >>> from pandas.api.types import union_categoricals If you want to combine categoricals that do not necessarily have the same categories, `union_categoricals` will combine a list-like of categoricals. The new categories will be the union of the categories being combined. >>> a = pd.Categorical(["b", "c"]) >>> b = pd.Categorical(["a", "b"]) >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] By default, the resulting categories will be ordered as they appear in the `categories` of the data. If you want the categories to be lexsorted, use `sort_categories=True` argument. >>> union_categoricals([a, b], sort_categories=True) [b, c, a, b] Categories (3, object): [a, b, c] `union_categoricals` also works with the case of combining two categoricals of the same categories and order information (e.g. what you could also `append` for). >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "a"], ordered=True) >>> union_categoricals([a, b]) [a, b, a, b, a] Categories (2, object): [a < b] Raises `TypeError` because the categories are ordered and not identical. >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "c"], ordered=True) >>> union_categoricals([a, b]) TypeError: to union ordered Categoricals, all categories must be the same New in version 0.20.0 Ordered categoricals with different categories or orderings can be combined by using the `ignore_ordered=True` argument. >>> a = pd.Categorical(["a", "b", "c"], ordered=True) >>> b = pd.Categorical(["c", "b", "a"], ordered=True) >>> union_categoricals([a, b], ignore_order=True) [a, b, c, c, b, a] Categories (3, object): [a, b, c] `union_categoricals` also works with a `CategoricalIndex`, or `Series` containing categorical data, but note that the resulting array will always be a plain `Categorical` >>> a = pd.Series(["b", "c"], dtype='category') >>> b = pd.Series(["a", "b"], dtype='category') >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] """ from pandas import Index, Categorical, CategoricalIndex, Series from pandas.core.categorical import _recode_for_categories if len(to_union) == 0: raise ValueError('No Categoricals to union') def _maybe_unwrap(x): if isinstance(x, (CategoricalIndex, Series)): return x.values elif isinstance(x, Categorical): return x else: raise TypeError("all components to combine must be Categorical") to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype) for other in to_union[1:]): raise TypeError("dtype of categories must be the same") ordered = False if all(first.is_dtype_equal(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered new_codes = np.concatenate([c.codes for c in to_union]) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with " "ordered Categoricals") if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) from pandas.core.algorithms import take_1d new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = Index(cats.unique()) if sort_categories: categories = categories.sort_values() new_codes = [] for c in to_union: new_codes.append(_recode_for_categories(c.codes, c.categories, categories)) new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") raise TypeError(msg) else: raise TypeError('Categorical.ordered must be the same') if ignore_order: ordered = False return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
def union_categoricals(to_union): """ Combine list-like of Categoricals, unioning categories. All must have the same dtype, and none can be ordered. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categoricals Returns ------- Categorical A single array, categories will be ordered as they appear in the list Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical ValueError Emmpty list of categoricals passed """ from pandas import Index, Categorical if len(to_union) == 0: raise ValueError('No Categoricals to union') first = to_union[0] if not all( is_dtype_equal(c.categories.dtype, first.categories.dtype) for c in to_union): raise TypeError("dtype of categories must be the same") if all(first.is_dtype_equal(other) for other in to_union[1:]): return Categorical(np.concatenate([c.codes for c in to_union]), categories=first.categories, ordered=first.ordered, fastpath=True) elif all(not c.ordered for c in to_union): # not ordered pass else: # to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") raise TypeError(msg) else: raise TypeError('Categorical.ordered must be the same') cats = first.categories unique_cats = cats.append([c.categories for c in to_union[1:]]).unique() categories = Index(unique_cats) new_codes = [] for c in to_union: if len(c.categories) > 0: indexer = categories.get_indexer(c.categories) new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) else: # must be all NaN new_codes.append(c.codes) new_codes = np.concatenate(new_codes) return Categorical(new_codes, categories=categories, ordered=False, fastpath=True)
def _(groupby): src = groupby._orig_obj ids, _, ngroup = groupby._orig_grouper.group_info out = algorithms.take_1d(groupby.obj._values, ids) return Series(out, index=src.index, name=src.name)
def get_mgr_concatenation_plan(mgr, indexers): """ Construct concatenation plan for given block manager and indexers. Parameters ---------- mgr : BlockManager indexers : dict of {axis: indexer} Returns ------- plan : list of (BlockPlacement, JoinUnit) tuples """ # Calculate post-reindex shape , save for item axis which will be separate # for each block anyway. mgr_shape = list(mgr.shape) for ax, indexer in indexers.items(): mgr_shape[ax] = len(indexer) mgr_shape = tuple(mgr_shape) if 0 in indexers: ax0_indexer = indexers.pop(0) blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1) blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1) else: if mgr._is_single_block: blk = mgr.blocks[0] return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] ax0_indexer = None blknos = mgr._blknos blklocs = mgr._blklocs plan = [] for blkno, placements in libinternals.get_blkno_placements(blknos, mgr.nblocks, group=False): assert placements.is_slice_like join_unit_indexers = indexers.copy() shape = list(mgr_shape) shape[0] = len(placements) shape = tuple(shape) if blkno == -1: unit = JoinUnit(None, shape) else: blk = mgr.blocks[blkno] ax0_blk_indexer = blklocs[placements.indexer] unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and # Fastpath detection of join unit not # needing to reindex its block: no ax0 # reindexing took place and block # placement was sequential before. ((ax0_indexer is None and blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1) or # Slow-ish detection: all indexer locs # are sequential (and length match is # checked above). (np.diff(ax0_blk_indexer) == 1).all())) # Omit indexer if no item reindexing is required. if unit_no_ax0_reindexing: join_unit_indexers.pop(0, None) else: join_unit_indexers[0] = ax0_blk_indexer unit = JoinUnit(blk, shape, join_unit_indexers) plan.append((placements, unit)) return plan
def union_categoricals(to_union): """ Combine list-like of Categoricals, unioning categories. All must have the same dtype, and none can be ordered. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categoricals Returns ------- Categorical A single array, categories will be ordered as they appear in the list Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical ValueError Emmpty list of categoricals passed """ from pandas import Index, Categorical if len(to_union) == 0: raise ValueError('No Categoricals to union') first = to_union[0] if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype) for c in to_union): raise TypeError("dtype of categories must be the same") if all(first.is_dtype_equal(other) for other in to_union[1:]): return Categorical(np.concatenate([c.codes for c in to_union]), categories=first.categories, ordered=first.ordered, fastpath=True) elif all(not c.ordered for c in to_union): # not ordered pass else: # to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") raise TypeError(msg) else: raise TypeError('Categorical.ordered must be the same') cats = first.categories unique_cats = cats.append([c.categories for c in to_union[1:]]).unique() categories = Index(unique_cats) new_codes = [] for c in to_union: if len(c.categories) > 0: indexer = categories.get_indexer(c.categories) new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) else: # must be all NaN new_codes.append(c.codes) new_codes = np.concatenate(new_codes) return Categorical(new_codes, categories=categories, ordered=False, fastpath=True)
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. Safe for use with mixed types (int, str), orders ints before strs. .. versionadded:: 0.19.0 Parameters ---------- values : list-like Sequence; must be unique if ``labels`` is not None. labels : list_like Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. na_sentinel : int, default -1 Value in ``labels`` to mark "not found". Ignored when ``labels`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up the calculation. Ignored when ``labels`` is None. verify : bool, default True Check if labels are out of bound for the values and put out of bound labels equal to na_sentinel. If ``verify=False``, it is assumed there are no out of bound labels. Ignored when ``labels`` is None. .. versionadded:: 0.25.0 Returns ------- ordered : ndarray Sorted ``values`` new_labels : ndarray Reordered ``labels``; returned when ``labels`` is not None. Raises ------ TypeError * If ``values`` is not list-like or if ``labels`` is neither None nor list-like * If ``values`` cannot be sorted ValueError * If ``labels`` is not None and ``values`` contain duplicates. """ if not is_list_like(values): raise TypeError("Only list-like objects are allowed to be passed to" "safe_sort as values") if (not isinstance(values, np.ndarray) and not is_extension_array_dtype(values)): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) def sort_mixed(values): # order ints before strings, safe in py3 str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) nums = np.sort(values[~str_pos]) strs = np.sort(values[str_pos]) return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None if (not is_extension_array_dtype(values) and lib.infer_dtype(values, skipna=False) == 'mixed-integer'): # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: # try this anyway ordered = sort_mixed(values) # labels: if labels is None: return ordered if not is_list_like(labels): raise TypeError("Only list-like objects or None are allowed to be" "passed to safe_sort as labels") labels = ensure_platform_int(np.asarray(labels)) from pandas import Index if not assume_unique and not Index(values).is_unique: raise ValueError("values should be unique if labels is not None") if sorter is None: # mixed types (hash_klass, _), values = algorithms._get_data_algo(values, algorithms._hashtables) t = hash_klass(len(values)) t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered)) if na_sentinel == -1: # take_1d is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() new_labels = algorithms.take_1d(order2, labels, fill_value=-1) if verify: mask = (labels < -len(values)) | (labels >= len(values)) else: mask = None else: reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `na_sentinel` next, so we # may deal with them here without performance loss using `mode='wrap'` new_labels = reverse_indexer.take(labels, mode='wrap') mask = labels == na_sentinel if verify: mask = mask | (labels < -len(values)) | (labels >= len(values)) if mask is not None: np.putmask(new_labels, mask, na_sentinel) return ordered, ensure_platform_int(new_labels)
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. Safe for use with mixed types (int, str), orders ints before strs. .. versionadded:: 0.19.0 Parameters ---------- values : list-like Sequence; must be unique if ``labels`` is not None. labels : list_like Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. na_sentinel : int, default -1 Value in ``labels`` to mark "not found". Ignored when ``labels`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up the calculation. Ignored when ``labels`` is None. verify : bool, default True Check if labels are out of bound for the values and put out of bound labels equal to na_sentinel. If ``verify=False``, it is assumed there are no out of bound labels. Ignored when ``labels`` is None. .. versionadded:: 0.25.0 Returns ------- ordered : ndarray Sorted ``values`` new_labels : ndarray Reordered ``labels``; returned when ``labels`` is not None. Raises ------ TypeError * If ``values`` is not list-like or if ``labels`` is neither None nor list-like * If ``values`` cannot be sorted ValueError * If ``labels`` is not None and ``values`` contain duplicates. """ if not is_list_like(values): raise TypeError("Only list-like objects are allowed to be passed to" "safe_sort as values") if (not isinstance(values, np.ndarray) and not is_extension_array_dtype(values)): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) def sort_mixed(values): # order ints before strings, safe in py3 str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) nums = np.sort(values[~str_pos]) strs = np.sort(values[str_pos]) return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None if (not is_extension_array_dtype(values) and lib.infer_dtype(values, skipna=False) == 'mixed-integer'): # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: # try this anyway ordered = sort_mixed(values) # labels: if labels is None: return ordered if not is_list_like(labels): raise TypeError("Only list-like objects or None are allowed to be" "passed to safe_sort as labels") labels = ensure_platform_int(np.asarray(labels)) from pandas import Index if not assume_unique and not Index(values).is_unique: raise ValueError("values should be unique if labels is not None") if sorter is None: # mixed types (hash_klass, _), values = algorithms._get_data_algo( values, algorithms._hashtables) t = hash_klass(len(values)) t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered)) if na_sentinel == -1: # take_1d is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() new_labels = algorithms.take_1d(order2, labels, fill_value=-1) if verify: mask = (labels < -len(values)) | (labels >= len(values)) else: mask = None else: reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `na_sentinel` next, so we # may deal with them here without performance loss using `mode='wrap'` new_labels = reverse_indexer.take(labels, mode='wrap') mask = labels == na_sentinel if verify: mask = mask | (labels < -len(values)) | (labels >= len(values)) if mask is not None: np.putmask(new_labels, mask, na_sentinel) return ordered, ensure_platform_int(new_labels)
def _maybe_add_join_keys(self, result, left_indexer, right_indexer): left_has_missing = None right_has_missing = None keys = zip(self.join_names, self.left_on, self.right_on) for i, (name, lname, rname) in enumerate(keys): if not _should_fill(lname, rname): continue take_left, take_right = None, None if name in result: if left_indexer is not None and right_indexer is not None: if name in self.left: if left_has_missing is None: left_has_missing = any(left_indexer == -1) if left_has_missing: take_right = self.right_join_keys[i] if not com.is_dtype_equal(result[name].dtype, self.left[name].dtype): take_left = self.left[name]._values elif name in self.right: if right_has_missing is None: right_has_missing = any(right_indexer == -1) if right_has_missing: take_left = self.left_join_keys[i] if not com.is_dtype_equal(result[name].dtype, self.right[name].dtype): take_right = self.right[name]._values elif left_indexer is not None \ and isinstance(self.left_join_keys[i], np.ndarray): take_left = self.left_join_keys[i] take_right = self.right_join_keys[i] if take_left is not None or take_right is not None: if take_left is None: lvals = result[name]._values else: lfill = na_value_for_dtype(take_left.dtype) lvals = algos.take_1d(take_left, left_indexer, fill_value=lfill) if take_right is None: rvals = result[name]._values else: rfill = na_value_for_dtype(take_right.dtype) rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill) # if we have an all missing left_indexer # make sure to just use the right values mask = left_indexer == -1 if mask.all(): key_col = rvals else: key_col = Index(lvals).where(~mask, rvals) if name in result: result[name] = key_col else: result.insert(i, name or 'key_%d' % i, key_col)