def _get_table_for_categories(values, categories):
    """
    Build a hash table populated with the locations of ``categories``.

    Ripped out of ``_get_codes_for_values()`` in the pandas Categorical
    module.

    Parameters
    ----------
    values : array-like exposing ``dtype``
        Only used to pick the hash-table class via ``_get_data_algo``.
    categories : array-like exposing ``dtype``
        Entries whose locations are mapped into the table.

    Returns
    -------
    The hash-table instance, after ``map_locations`` has been called with
    the categories array returned by ``_get_data_algo``.
    """
    dtypes_match = pd_cat_module.is_dtype_equal(values.dtype, categories.dtype)
    if not dtypes_match:
        # mismatched dtypes: pass both sides through ensure_object
        values, categories = ensure_object(values), ensure_object(categories)

    # Only the table class is needed from the values side.
    (table_cls, _), _ = _get_data_algo(values, _hashtables)
    (_, _), prepared_cats = _get_data_algo(categories, _hashtables)

    table = table_cls(len(prepared_cats))
    table.map_locations(prepared_cats)
    return table
def _get_codes_for_values(values, levels):
    """
    Utility routine to turn ``values`` into codes given the specified
    ``levels``.

    Parameters
    ----------
    values : array-like exposing ``dtype``
        Items to look up.
    levels : array-like exposing ``dtype``
        Items whose locations populate the hash table.

    Returns
    -------
    The result of looking ``values`` up in a hash table built from
    ``levels``, passed through ``com._ensure_platform_int``.
    """
    from pandas.core.algorithms import _get_data_algo, _hashtables

    if values.dtype != levels.dtype:
        # mismatched dtypes: pass both sides through _ensure_object
        values, levels = com._ensure_object(values), com._ensure_object(levels)

    # NOTE(review): the array returned by _get_data_algo is discarded and the
    # lookup below runs on ``values`` directly -- confirm this is intended.
    (table_cls, _), _ = _get_data_algo(values, _hashtables)

    table = table_cls(len(levels))
    table.map_locations(levels)
    codes = table.lookup(values)
    return com._ensure_platform_int(codes)
def _get_codes_for_values(values, categories):
    """
    Utility routine to turn values into codes given the specified categories.

    Parameters
    ----------
    values : array-like exposing ``dtype``
        Items to look up.
    categories : array-like exposing ``dtype``
        Items whose locations populate the hash table.

    Returns
    -------
    The result of looking ``values`` up in a hash table built from
    ``categories``, passed through ``com._ensure_platform_int``.
    """
    from pandas.core.algorithms import _get_data_algo, _hashtables

    needs_object = values.dtype != categories.dtype
    if needs_object:
        values = com._ensure_object(values)
        categories = com._ensure_object(categories)

    # NOTE(review): only the table class is taken from _get_data_algo; the
    # array it returns is unused and the lookup uses ``values`` as-is --
    # confirm this is intended.
    (table_cls, _), _ = _get_data_algo(values, _hashtables)

    lookup_table = table_cls(len(categories))
    lookup_table.map_locations(com._values_from_object(categories))

    raw_codes = lookup_table.lookup(values)
    return com._ensure_platform_int(raw_codes)
def _get_codes_for_values(values, levels):
    """
    Utility routine to turn values into codes given the specified levels.

    Parameters
    ----------
    values : array-like exposing ``dtype``
        Items to look up.
    levels : array-like exposing ``dtype``
        Items whose locations populate the hash table.

    Returns
    -------
    The result of looking ``values`` up in a hash table built from
    ``levels``, passed through ``com._ensure_platform_int``.
    """
    from pandas.core.algorithms import _get_data_algo, _hashtables

    if values.dtype != levels.dtype:
        # mismatched dtypes: pass both sides through _ensure_object
        values, levels = com._ensure_object(values), com._ensure_object(levels)

    # NOTE(review): the array returned by _get_data_algo is dropped; the
    # lookup below uses ``values`` directly -- confirm this is intended.
    (table_cls, _), _ = _get_data_algo(values, _hashtables)

    level_table = table_cls(len(levels))
    level_table.map_locations(com._values_from_object(levels))

    found = level_table.lookup(values)
    return com._ensure_platform_int(found)
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True):
    """
    Sort ``values`` and reorder corresponding ``labels``.

    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list_like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.
    verify : bool, default True
        Check if labels are out of bound for the values and put out of bound
        labels equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound labels. Ignored when ``labels`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        # The trailing space matters: adjacent literals are concatenated
        # verbatim, and without it the message read "passed tosafe_sort".
        raise TypeError(
            "Only list-like objects are allowed to be passed to "
            "safe_sort as values"
        )

    if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    # ``sorter`` stays None when the mixed-type path is taken; it is then
    # rebuilt from a hash table below if labels have to be reordered.
    sorter = None
    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        # Same missing-space fix as above ("to bepassed" -> "to be passed").
        raise TypeError(
            "Only list-like objects or None are allowed to be "
            "passed to safe_sort as labels"
        )
    labels = ensure_platform_int(np.asarray(labels))

    from pandas import Index

    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types: recover the sorter by looking the ordered values up
        # in a hash table of the original values
        (hash_klass, _), values = algorithms._get_data_algo(
            values, algorithms._hashtables
        )
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if na_sentinel == -1:
        # take_1d is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_labels = algorithms.take_1d(order2, labels, fill_value=-1)
        if verify:
            mask = (labels < -len(values)) | (labels >= len(values))
        else:
            mask = None
    else:
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `na_sentinel` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_labels = reverse_indexer.take(labels, mode="wrap")

        mask = labels == na_sentinel
        if verify:
            mask = mask | (labels < -len(values)) | (labels >= len(values))

    if mask is not None:
        np.putmask(new_labels, mask, na_sentinel)

    return ordered, ensure_platform_int(new_labels)
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.

    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list_like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        # The trailing space matters: adjacent literals are concatenated
        # verbatim, and without it the message read "passed tosafe_sort".
        raise TypeError("Only list-like objects are allowed to be passed to "
                        "safe_sort as values")

    if not isinstance(values, np.ndarray):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    # ``sorter`` stays None when the mixed-type path is taken; it is then
    # rebuilt from a hash table below if labels have to be reordered.
    sorter = None
    if PY3 and lib.infer_dtype(values, skipna=False) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        # Same missing-space fix as above ("to bepassed" -> "to be passed").
        raise TypeError("Only list-like objects or None are allowed to be "
                        "passed to safe_sort as labels")
    labels = ensure_platform_int(np.asarray(labels))

    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types: recover the sorter by looking the ordered values up
        # in a hash table of the original values
        (hash_klass, _), values = algorithms._get_data_algo(
            values, algorithms._hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = (labels < -len(values)) | (labels >= len(values)) | \
        (labels == na_sentinel)

    # (Out of bound indices will be masked with `na_sentinel` next, so we may
    # deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, ensure_platform_int(new_labels)