def from_array(cls, data):
    """
    Build a Categorical from a single array-like.

    The data is factorized with sorting first; if that raises
    ``TypeError`` the factorization is retried without sorting.
    The ``name`` attribute of ``data`` (if any) is carried over.
    """
    try:
        codes, uniques, _ = factorize(data, sort=True)
    except TypeError:
        # retry without sorting when the sorted attempt fails
        codes, uniques, _ = factorize(data, sort=False)

    name = getattr(data, "name", None)
    return Categorical(codes, uniques, name=name)
def test_datelike(self):
    """Check factorize on datetime64 and Period Series, unsorted and sorted."""
    # M8
    late = pd.Timestamp('20130101 09:00:00.00004')
    early = pd.Timestamp('20130101')
    stamps = Series([late, late, late, early, early, late])

    codes, uniq = algos.factorize(stamps)
    expected_codes = np.array([0, 0, 0, 1, 1, 0], dtype=np.int64)
    expected_uniq = np.array([late.value, early.value], dtype='M8[ns]')
    self.assert_numpy_array_equal(codes, expected_codes)
    self.assert_numpy_array_equal(uniq, expected_uniq)

    codes, uniq = algos.factorize(stamps, sort=True)
    expected_codes = np.array([1, 1, 1, 0, 0, 1], dtype=np.int64)
    expected_uniq = np.array([early.value, late.value], dtype='M8[ns]')
    self.assert_numpy_array_equal(codes, expected_codes)
    self.assert_numpy_array_equal(uniq, expected_uniq)

    # period
    first = pd.Period('201302', freq='M')
    second = pd.Period('201303', freq='M')
    periods = Series([first, first, first, second, second, first])

    # periods are not 'sorted' as they are converted back into an index
    codes, uniq = algos.factorize(periods)
    expected_codes = np.array([0, 0, 0, 1, 1, 0], dtype=np.int64)
    self.assert_numpy_array_equal(codes, expected_codes)
    self.assert_numpy_array_equal(uniq, pd.PeriodIndex([first, second]))

    codes, uniq = algos.factorize(periods, sort=True)
    expected_codes = np.array([0, 0, 0, 1, 1, 0], dtype=np.int64)
    self.assert_numpy_array_equal(codes, expected_codes)
    self.assert_numpy_array_equal(uniq, pd.PeriodIndex([first, second]))
def from_array(cls, data):
    """
    Build a Factor from a single array-like.

    Sorted factorization is attempted first; on ``TypeError`` the
    factorization is retried without sorting.
    """
    try:
        codes, uniques, _ = factorize(data, sort=True)
    except TypeError:
        # retry without sorting when the sorted attempt fails
        codes, uniques, _ = factorize(data, sort=False)

    return Factor(codes, uniques)
def from_array(cls, data):
    """
    Build a Factor from a single array-like.

    ``factorize`` is imported inside the function. Sorted factorization
    is attempted first; on ``TypeError`` it is retried without sorting.
    """
    from pandas.core.algorithms import factorize

    try:
        codes, uniques, _ = factorize(data, sort=True)
    except TypeError:
        # retry without sorting when the sorted attempt fails
        codes, uniques, _ = factorize(data, sort=False)

    return Factor(codes, uniques)
def from_array(cls, data):
    """
    Build a Categorical from a single array-like.

    An ``Index`` that exposes its own ``factorize`` method is asked to
    factorize itself. Anything else goes through the module-level
    ``factorize``, sorted if possible and unsorted on ``TypeError``.
    """
    has_own_factorize = isinstance(data, Index) and hasattr(data, 'factorize')
    if not has_own_factorize:
        try:
            codes, uniques = factorize(data, sort=True)
        except TypeError:
            codes, uniques = factorize(data, sort=False)
    else:
        codes, uniques = data.factorize()

    name = getattr(data, 'name', None)
    return Categorical(codes, uniques, name=name)
def __init__(self, labels, levels=None, name=None):
    """
    Store labels, levels and name on the instance.

    When ``levels`` is omitted, ``labels`` is treated as raw values:
    the name falls back to ``labels.name`` and the values are
    factorized (sorted if possible, unsorted on ``TypeError``).
    """
    needs_factorize = levels is None

    if needs_factorize and name is None:
        name = getattr(labels, 'name', None)

    if needs_factorize:
        try:
            labels, levels = factorize(labels, sort=True)
        except TypeError:
            labels, levels = factorize(labels, sort=False)

    self.labels = labels
    self.levels = levels
    self.name = name
def test_mixed(self):
    """Check factorize on an object Series mixing strings, floats and NaN."""
    # doc example reshaping.rst
    mixed = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])

    codes, uniq = algos.factorize(mixed)
    want_codes = np.array([0, 0, -1, 1, 2, 3], dtype=np.int64)
    want_uniq = np.array(['A', 'B', 3.14, np.inf], dtype=object)
    self.assert_numpy_array_equal(codes, want_codes)
    self.assert_numpy_array_equal(uniq, want_uniq)

    codes, uniq = algos.factorize(mixed, sort=True)
    want_codes = np.array([2, 2, -1, 3, 0, 1], dtype=np.int64)
    want_uniq = np.array([3.14, np.inf, 'A', 'B'], dtype=object)
    self.assert_numpy_array_equal(codes, want_codes)
    self.assert_numpy_array_equal(uniq, want_uniq)
def __new__(cls, data):
    """
    Create a Factor as a view over the factorized codes of ``data``.

    The codes array is viewed as ``Factor`` and the distinct values are
    attached as its ``levels`` after passing through ``_ensure_index``.
    """
    from pandas.core.index import _ensure_index
    from pandas.core.algorithms import factorize

    try:
        codes, uniques, _ = factorize(data, sort=True)
    except TypeError:
        # retry without sorting when the sorted attempt fails
        codes, uniques, _ = factorize(data, sort=False)

    factor = codes.view(Factor)
    factor.levels = _ensure_index(uniques)
    return factor
def __init__(self, labels, levels=None, name=None):
    """
    Store labels, levels and name on the instance.

    When ``levels`` is omitted, ``labels`` is treated as raw values:
    the name falls back to ``labels.name``; an ``Index`` with its own
    ``factorize`` method factorizes itself, anything else goes through
    the module-level ``factorize`` (sorted, or unsorted on ``TypeError``).
    """
    if levels is None:
        name = getattr(labels, 'name', None) if name is None else name

        self_factorizing = (
            isinstance(labels, Index) and hasattr(labels, 'factorize')
        )
        if not self_factorizing:
            try:
                labels, levels = factorize(labels, sort=True)
            except TypeError:
                labels, levels = factorize(labels, sort=False)
        else:
            labels, levels = labels.factorize()

    self.labels = labels
    self.levels = levels
    self.name = name
def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
    """
    Return the group codes together with the unique group values.

    Three cases are handled: a categorical grouper (categories and
    ``ordered`` are preserved), a ``BaseGrouper`` (codes and uniques are
    read off it), and anything else (factorized directly).
    """
    if not self._passed_categorical:
        if isinstance(self.grouper, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouper.codes_info
            uniques = self.grouper.result_arraylike
            return codes, uniques

        # GH35667, replace dropna=False with na_sentinel=None
        na_sentinel = -1 if self.dropna else None
        codes, uniques = algorithms.factorize(
            self.grouper, sort=self.sort, na_sentinel=na_sentinel
        )
        return codes, uniques

    # we make a CategoricalIndex out of the cat grouper
    # preserving the categories / ordered attributes
    cat = self.grouper
    categories = cat.categories

    if not self.observed:
        ucodes = np.arange(len(categories))
    else:
        ucodes = algorithms.unique1d(cat.codes)
        ucodes = ucodes[ucodes != -1]  # drop the -1 code
        if self.sort or cat.ordered:
            ucodes = np.sort(ucodes)

    uniques = Categorical.from_codes(
        codes=ucodes, categories=categories, ordered=cat.ordered
    )
    return cat.codes, uniques
def test_mixed(self):
    """Check factorize on mixed object data for unsorted and sorted calls."""
    # doc example reshaping.rst
    x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])

    # (extra factorize kwargs, expected labels, expected uniques)
    cases = [
        ({}, [0, 0, -1, 1, 2, 3], ['A', 'B', 3.14, np.inf]),
        ({'sort': True}, [2, 2, -1, 3, 0, 1], [3.14, np.inf, 'A', 'B']),
    ]
    for kwargs, exp_labels, exp_uniques in cases:
        labels, uniques = algos.factorize(x, **kwargs)
        self.assert_numpy_array_equal(
            labels, np.array(exp_labels, dtype=np.int64))
        self.assert_numpy_array_equal(
            uniques, np.array(exp_uniques, dtype=object))
def test_mixed(self):
    """Check factorize on mixed object data; uniques are compared as Index."""
    # doc example reshaping.rst
    mixed = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])

    codes, uniq = algos.factorize(mixed)
    self.assert_numpy_array_equal(
        codes, np.array([0, 0, -1, 1, 2, 3], dtype=np.int_))
    tm.assert_index_equal(uniq, pd.Index(['A', 'B', 3.14, np.inf]))

    codes, uniq = algos.factorize(mixed, sort=True)
    self.assert_numpy_array_equal(
        codes, np.array([2, 2, -1, 3, 0, 1], dtype=np.int_))
    tm.assert_index_equal(uniq, pd.Index([3.14, np.inf, 'A', 'B']))
def factorize(self):
    """
    Specialized factorize that boxes uniques

    ``self.values`` is factorized and the resulting uniques are
    wrapped in a ``PeriodIndex`` carrying this index's ``freq``.
    """
    from pandas.core.algorithms import factorize as _factorize

    codes, ordinals = _factorize(self.values)
    boxed = PeriodIndex(ordinal=ordinals, freq=self.freq)
    return codes, boxed
def factorize(self):
    """
    Specialized factorize that boxes uniques

    ``self.values`` is factorized (the third returned element is
    ignored) and the uniques are wrapped in a ``PeriodIndex`` carrying
    this index's ``freq``.
    """
    from pandas.core.algorithms import factorize as _factorize

    codes, ordinals, _ = _factorize(self.values)
    boxed = PeriodIndex(ordinal=ordinals, freq=self.freq)
    return codes, boxed
def factorize(
    self,
    sort: bool = False,
    na_sentinel: int | lib.NoDefault = lib.no_default,
    use_na_sentinel: bool | lib.NoDefault = lib.no_default,
):
    """
    Factorize this object by delegating to ``algorithms.factorize``.

    ``sort``, ``na_sentinel`` and ``use_na_sentinel`` are forwarded
    unchanged; both sentinel arguments default to ``lib.no_default``.
    The return value is whatever ``algorithms.factorize`` returns.
    """
    return algorithms.factorize(
        self, sort=sort, na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
    )
def factorize(self, na_sentinel=-1):
    """
    Factorize via a dense copy; return dense codes and sparse uniques.

    The array is converted with ``np.asarray`` before factorizing, and
    the uniques are re-wrapped as a ``SparseArray`` with this dtype.
    """
    # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
    # The sparsity on this is backwards from what Sparse would want. Want
    # ExtensionArray.factorize -> Tuple[EA, EA]
    # Given that we have to return a dense array of codes, why bother
    # implementing an efficient factorize?
    dense = np.asarray(self)
    codes, dense_uniques = algos.factorize(dense, na_sentinel=na_sentinel)
    sparse_uniques = SparseArray(dense_uniques, dtype=self.dtype)
    return codes, sparse_uniques
def from_array(cls, data):
    """
    Make a Categorical type from a single array-like object.

    Parameters
    ----------
    data : array-like
        Can be an Index or array-like. The levels are assumed to
        be the unique values of `data`.
    """
    use_own_method = isinstance(data, Index) and hasattr(data, 'factorize')
    if use_own_method:
        codes, uniques = data.factorize()
    else:
        try:
            codes, uniques = factorize(data, sort=True)
        except TypeError:
            # retry without sorting when the sorted attempt fails
            codes, uniques = factorize(data, sort=False)

    name = getattr(data, 'name', None)
    return Categorical(codes, uniques, name=name)
def _make_labels(self):
    """
    Factorize the grouper and cache labels, group index and counts.

    Raises
    ------
    Exception
        If ``self._was_factor`` is set.
    """
    if self._was_factor:  # pragma: no cover
        raise Exception('Should not call this method grouping by level')

    codes, values, tallies = algos.factorize(self.grouper, sort=self.sort)
    group_index = Index(values, name=self.name)

    self._labels = codes
    self._group_index = group_index
    self._counts = tallies
def from_array(cls, data):
    """
    Make a Categorical type from a single array-like object.

    Parameters
    ----------
    data : array-like
        Can be an Index or array-like. The levels are assumed to
        be the unique values of `data`.
    """
    if not (isinstance(data, Index) and hasattr(data, "factorize")):
        try:
            codes, uniques = factorize(data, sort=True)
        except TypeError:
            # retry without sorting when the sorted attempt fails
            codes, uniques = factorize(data, sort=False)
    else:
        # the Index knows how to factorize itself
        codes, uniques = data.factorize()

    return Categorical(codes, uniques, name=getattr(data, "name", None))
def _make_labels(self):
    """
    Populate ``self._labels`` and ``self._group_index`` if either is unset.

    A ``BaseGrouper`` supplies both values directly; any other grouper
    is factorized and its uniques wrapped in a named ``Index``.
    """
    if self._labels is not None and self._group_index is not None:
        return

    if isinstance(self.grouper, BaseGrouper):
        # we have a list of groupers
        codes = self.grouper.label_info
        group_index = self.grouper.result_index
    else:
        codes, values = algorithms.factorize(self.grouper, sort=self.sort)
        group_index = Index(values, name=self.name)

    self._labels = codes
    self._group_index = group_index
def _make_codes(self) -> None:
    """
    Populate ``self._codes`` and ``self._group_index`` if either is unset.

    A ``BaseGrouper`` supplies both values directly; any other grouper
    is factorized and its uniques wrapped in a named ``Index``.
    """
    if self._codes is not None and self._group_index is not None:
        return

    if isinstance(self.grouper, ops.BaseGrouper):
        # we have a list of groupers
        group_codes = self.grouper.codes_info
        group_index = self.grouper.result_index
    else:
        group_codes, values = algorithms.factorize(self.grouper, sort=self.sort)
        group_index = Index(values, name=self.name)

    self._codes = group_codes
    self._group_index = group_index
def _make_labels(self):
    """
    Compute and cache the labels and group index when not yet available.

    Nothing happens if both ``self._labels`` and ``self._group_index``
    are already set.
    """
    already_built = not (self._labels is None or self._group_index is None)
    if already_built:
        return

    if not isinstance(self.grouper, BaseGrouper):
        codes, values = algorithms.factorize(
            self.grouper, sort=self.sort)
        group_index = Index(values, name=self.name)
    else:
        # we have a list of groupers
        codes = self.grouper.label_info
        group_index = self.grouper.result_index

    self._labels = codes
    self._group_index = group_index
def indices(self):
    """
    Map each group label to the integer positions belonging to it.

    A ``BaseGrouper`` answers through its own ``indices``; any other
    grouper is factorized, respecting ``self.sort``.
    """
    # we have a list of groupers
    if isinstance(self.grouper, ops.BaseGrouper):
        return self.grouper.indices

    # Return a dictionary of {group label: [indices belonging to the group label]}
    # respecting whether sort was specified
    codes, uniques = algorithms.factorize(self.grouper, sort=self.sort)

    positions_by_label = {}
    for code, label in enumerate(Index(uniques)):
        positions_by_label[label] = np.flatnonzero(codes == code)
    return positions_by_label
def test_basic(self):
    """
    Check factorize on string, integer and float lists, unsorted and sorted.

    The expected dtypes for the unsorted float case were swapped in the
    original (float64 labels, int64 uniques), which only passes under a
    dtype-lenient comparison. Labels are now expected as int64 and
    uniques as float64, matching the other cases in this test.
    """
    strings = ['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']

    labels, uniques = algos.factorize(strings)
    # NOTE: the label check for this case was disabled in the original test
    # and is left disabled here.
    # self.assert_numpy_array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array(['a', 'b', 'c'], dtype=object))

    labels, uniques = algos.factorize(strings, sort=True)
    self.assert_numpy_array_equal(labels, np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array(['a', 'b', 'c'], dtype=object))

    labels, uniques = algos.factorize(list(reversed(range(5))))
    self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array([4, 3, 2, 1, 0], dtype=np.int64))

    labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
    self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array([0, 1, 2, 3, 4], dtype=np.int64))

    # float input: labels are integer codes, uniques keep the float dtype
    labels, uniques = algos.factorize(list(reversed(np.arange(5.))))
    self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array([4., 3., 2., 1., 0.], dtype=np.float64))

    labels, uniques = algos.factorize(list(reversed(np.arange(5.))), sort=True)
    self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array([0., 1., 2., 3., 4.], dtype=np.float64))
def test_basic(self):
    """
    Check factorize on string, integer and float lists, unsorted and sorted.

    The expected dtypes for the unsorted float case were swapped in the
    original (float64 labels, int64 uniques), which only passes under a
    dtype-lenient comparison. Labels are now expected as int64 and
    uniques as float64, matching the other cases in this test.
    """
    strings = ["a", "b", "b", "a", "a", "c", "c", "c"]

    labels, uniques = algos.factorize(strings)
    # NOTE: the label check for this case was disabled in the original test
    # and is left disabled here.
    # self.assert_numpy_array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))

    labels, uniques = algos.factorize(strings, sort=True)
    self.assert_numpy_array_equal(labels, np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))

    labels, uniques = algos.factorize(list(reversed(range(5))))
    self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array([4, 3, 2, 1, 0], dtype=np.int64))

    labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
    self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array([0, 1, 2, 3, 4], dtype=np.int64))

    # float input: labels are integer codes, uniques keep the float dtype
    labels, uniques = algos.factorize(list(reversed(np.arange(5.0))))
    self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64))

    labels, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True)
    self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64))
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "RLEArray"]:
    """
    Factorize the compressed data and decompress only the codes.

    Returns the decompressed codes together with the uniques rebuilt
    through ``_from_factorized``.
    """
    # optimized version of `ExtensionArray.factorize`:
    # 1. replace `_values_for_factorize` with a version that does not decompress the data
    # 2. passing compressed data to `factorize` (instead of `_factorize_array` because that does not handle NA
    #    values nicely)
    # 3. decompress `codes`
    run_codes, run_uniques = factorize(self._data, na_sentinel=na_sentinel)
    boxed_uniques = self._from_factorized(run_uniques, self)
    full_codes = decompress(run_codes, self._positions)
    return full_codes, boxed_uniques
def _get_compressed_labels(self):
    """
    Return compressed group ids together with the observed group ids.

    When overflow is possible the per-grouping labels are zipped into
    tuples and factorized (reordered if ``self.sort``). Otherwise a
    combined group index is built and compressed.
    """
    label_arrays = [grouping.labels for grouping in self.groupings]

    if not self._overflow_possible:
        group_index = (
            get_group_index(label_arrays, self.shape)
            if len(label_arrays) > 1
            else label_arrays[0]
        )
        comp_ids, obs_group_ids = _compress_group_index(group_index)
        return comp_ids, obs_group_ids

    zipped = lib.fast_zip(label_arrays)
    labs, uniques, _ = algos.factorize(zipped)
    if self.sort:
        uniques, labs = _reorder_by_uniques(uniques, labs)
    return labs, uniques
def _make_codes(self) -> None:
    """
    Populate ``self._codes`` and ``self._group_index`` if either is unset.

    A ``BaseGrouper`` supplies both values directly. Any other grouper
    is factorized with ``na_sentinel=None`` when ``self.dropna`` is
    falsy and ``-1`` otherwise.
    """
    if self._codes is not None and self._group_index is not None:
        return

    if isinstance(self.grouper, ops.BaseGrouper):
        # we have a list of groupers
        group_codes = self.grouper.codes_info
        group_index = self.grouper.result_index
    else:
        # GH35667, replace dropna=False with na_sentinel=None
        na_sentinel = -1 if self.dropna else None
        group_codes, values = algorithms.factorize(
            self.grouper, sort=self.sort, na_sentinel=na_sentinel
        )
        group_index = Index(values, name=self.name)

    self._codes = group_codes
    self._group_index = group_index
def factorize(self, sort=False, na_sentinel=-1):
    """
    Encode the object as an enumerated type or categorical variable

    This delegates to ``pandas.core.algorithms.factorize``, which is
    imported inside the method.

    Parameters
    ----------
    sort : boolean, default False
        Sort by values
    na_sentinel: int, default -1
        Value to mark "not found"

    Returns
    -------
    labels : the indexer to the original array
    uniques : the unique Index
    """
    from pandas.core.algorithms import factorize
    return factorize(self, sort=sort, na_sentinel=na_sentinel)
def _levels_to_axis(
    ss,
    levels: tuple[int] | list[int],
    valid_ilocs: npt.NDArray[np.intp],
    sort_labels: bool = False,
) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
    """
    For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
    where `ax_coords` are the coordinates along one of the two axes of the
    destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
    which correspond to these coordinates.

    Parameters
    ----------
    ss : Series
    levels : tuple/list
    valid_ilocs : numpy.ndarray
        Array of integer positions of valid values for the sparse matrix in ss.
    sort_labels : bool, default False
        Sort the axis labels before forming the sparse matrix. When `levels`
        refers to a single level, set to True for a faster execution.

    Returns
    -------
    ax_coords : numpy.ndarray (axis coordinates)
    ax_labels : list (axis labels)
    """
    # Since the labels are sorted in `Index.levels`, when we wish to sort and
    # there is only one level of the MultiIndex for this axis, the desired
    # output can be obtained in the following simpler, more efficient way.
    single_sorted_level = sort_labels and len(levels) == 1

    if not single_sorted_level:
        per_level_values = [
            ss.index.get_level_values(lvl).values for lvl in levels
        ]
        zipped = lib.fast_zip(per_level_values)
        codes, ax_labels = factorize(zipped, sort=sort_labels)
        ax_coords = codes[valid_ilocs]
    else:
        only_level = levels[0]
        ax_coords = ss.index.codes[only_level][valid_ilocs]
        ax_labels = ss.index.levels[only_level]

    return ax_coords, ax_labels.tolist()
def test_datelike(self):
    """Check factorize on datetime, Period and timedelta Series."""
    # M8
    late = pd.Timestamp('20130101 09:00:00.00004')
    early = pd.Timestamp('20130101')
    stamps = Series([late, late, late, early, early, late])

    codes, uniq = algos.factorize(stamps)
    self.assert_numpy_array_equal(
        codes, np.array([0, 0, 0, 1, 1, 0], dtype=np.int_))
    self.assert_index_equal(uniq, pd.DatetimeIndex([late, early]))

    codes, uniq = algos.factorize(stamps, sort=True)
    self.assert_numpy_array_equal(
        codes, np.array([1, 1, 1, 0, 0, 1], dtype=np.int_))
    self.assert_index_equal(uniq, pd.DatetimeIndex([early, late]))

    # period
    first = pd.Period('201302', freq='M')
    second = pd.Period('201303', freq='M')
    periods = Series([first, first, first, second, second, first])

    # periods are not 'sorted' as they are converted back into an index
    codes, uniq = algos.factorize(periods)
    self.assert_numpy_array_equal(
        codes, np.array([0, 0, 0, 1, 1, 0], dtype=np.int_))
    self.assert_index_equal(uniq, pd.PeriodIndex([first, second]))

    codes, uniq = algos.factorize(periods, sort=True)
    self.assert_numpy_array_equal(
        codes, np.array([0, 0, 0, 1, 1, 0], dtype=np.int_))
    self.assert_index_equal(uniq, pd.PeriodIndex([first, second]))

    # GH 5986
    longer = pd.to_timedelta('1 day 1 min')
    shorter = pd.to_timedelta('1 day')
    deltas = Series([longer, shorter, longer, longer, shorter, shorter, longer])

    codes, uniq = algos.factorize(deltas)
    self.assert_numpy_array_equal(
        codes, np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.int_))
    self.assert_index_equal(uniq, pd.to_timedelta([longer, shorter]))

    codes, uniq = algos.factorize(deltas, sort=True)
    self.assert_numpy_array_equal(
        codes, np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.int_))
    self.assert_index_equal(uniq, pd.to_timedelta([shorter, longer]))
def _codes_and_uniques(
    self,
) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
    """
    Return the group codes together with the unique group values.

    Three cases are handled: a categorical grouping vector (categories
    and ``ordered`` are preserved), a ``BaseGrouper`` (codes and
    uniques are read off it), and anything else (factorized directly).
    """
    if self._passed_categorical:
        # we make a CategoricalIndex out of the cat grouper
        # preserving the categories / ordered attributes
        cat = self.grouping_vector
        categories = cat.categories

        if not self._observed:
            ucodes = np.arange(len(categories))
        else:
            ucodes = algorithms.unique1d(cat.codes)
            ucodes = ucodes[ucodes != -1]  # drop the -1 code
            if self._sort or cat.ordered:
                ucodes = np.sort(ucodes)

        uniques = Categorical.from_codes(
            codes=ucodes, categories=categories, ordered=cat.ordered
        )
        return cat.codes, uniques

    if isinstance(self.grouping_vector, ops.BaseGrouper):
        # we have a list of groupers
        codes = self.grouping_vector.codes_info
        # error: Incompatible types in assignment (expression has type "Union
        # [ExtensionArray, ndarray[Any, Any]]", variable has type "Categorical")
        uniques = (
            self.grouping_vector.result_index._values  # type: ignore[assignment]
        )
    else:
        # GH35667, replace dropna=False with use_na_sentinel=False
        # error: Incompatible types in assignment (expression has type "Union[
        # ndarray[Any, Any], Index]", variable has type "Categorical")
        codes, uniques = algorithms.factorize(  # type: ignore[assignment]
            self.grouping_vector,
            sort=self._sort,
            use_na_sentinel=self._dropna,
        )
    return codes, uniques
def level_styler(linestyle=None, color=None, marker=None):
    """
    Return persistent per-item styles for categorical plotting.

    Each supplied argument is factorized, and every distinct category is
    mapped to one entry of the matching sequence in the module-level
    ``STYLES`` mapping. This is useful for distinguishing groups of line
    plots by their style.

    Parameters
    ----------
    linestyle, color, marker : array-like, optional
        Categorical values, one per item to style. Arguments left as
        ``None`` are skipped.

    Returns
    -------
    list of dict
        One dict per item, mapping each supplied style key to the style
        value chosen for that item. Empty when no argument is supplied.
    """
    # Explicit mapping instead of ``locals()``; also avoids shadowing ``vars``.
    requested = {"linestyle": linestyle, "color": color, "marker": marker}

    styles = OrderedDict()
    for key, choices in STYLES.items():
        values = requested.get(key)
        if values is None:
            continue
        labels, _ = factorize(values)
        # cycle back to start when there are more categories than choices
        styles[key] = np.take(choices, labels % len(choices))

    keys = list(styles)
    # Builtin ``zip`` works on Python 2 and 3; ``itertools.izip`` is Python 2 only.
    return [dict(zip(keys, combo)) for combo in zip(*styles.values())]
def f(vals):
    """
    Factorize ``vals`` and return ``(codes as int64, number of uniques)``.

    The hashtable size hint is the smaller of ``len(df)`` and
    ``SIZE_HINT_LIMIT``.
    """
    size_hint = min(len(df), SIZE_HINT_LIMIT)
    codes, uniques = _algorithms.factorize(vals, size_hint=size_hint)
    # copy=False avoids a copy when the codes are already int64
    return codes.astype("i8", copy=False), len(uniques)
def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1):
    """
    Factorize this object by delegating to ``algorithms.factorize``.

    ``sort`` and ``na_sentinel`` are forwarded unchanged, and the
    return value is whatever ``algorithms.factorize`` returns.
    """
    return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False, levels=None):
    """
    Initialise the categorical from values and optional categories.

    Parameters
    ----------
    values : list-like
        The values to encode. With ``fastpath=True`` they are stored
        directly as the codes, without any checking.
    categories : optional
        The categories. When omitted they are obtained by factorizing
        ``values``, or taken from ``values`` if it is already categorical.
    ordered : bool, optional
        When ``None``, set to True if ``values`` could be factorized with
        sorting or if ``categories`` were given.
    name : optional
        Defaults to ``values.name`` when that attribute exists.
    fastpath : bool, default False
        Store the arguments as-is and return immediately.
    levels : optional
        Deprecated alias for ``categories``; emits a ``FutureWarning``.

    Raises
    ------
    ValueError
        If both ``categories`` and ``levels`` are passed.
    TypeError
        If ``ordered`` is truthy but ``values`` cannot be sorted.
    """

    if fastpath:
        # fast path
        self._codes = values
        self.name = name
        self.categories = categories
        self.ordered = ordered
        return

    if name is None:
        name = getattr(values, 'name', None)

    # TODO: Remove after deprecation period in 2017/ after 0.18
    if not levels is None:
        warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead",
             FutureWarning)
        if categories is None:
            categories = levels
        else:
            raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', "
                             "use only 'categories'")

    # sanitize input
    if com.is_categorical_dtype(values):

        # we are either a Series or a Categorical
        cat = values
        if isinstance(values, com.ABCSeries):
            cat = values.values
        if categories is None:
            categories = cat.categories
        if ordered is None:
            ordered = cat.ordered
        values = values.__array__()

    elif isinstance(values, Index):
        # an Index is used as-is
        pass

    else:

        # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
        # which is fine, but since factorize does this correctly no need here
        # this is an issue because _sanitize_array also coerces np.nan to a string
        # under certain versions of numpy as well
        values = com._possibly_infer_to_datetimelike(values, convert_dates=True)
        if not isinstance(values, np.ndarray):
            values = _convert_to_list_like(values)
            from pandas.core.series import _sanitize_array
            # On list with NaNs, int values will be converted to float. Use "object" dtype
            # to prevent this. In the end objects will be cast to int/... in the category
            # assignment step.
            dtype = 'object' if isnull(values).any() else None
            values = _sanitize_array(values, None, dtype=dtype)

    if categories is None:
        try:
            codes, categories = factorize(values, sort=True)
            # If the underlying data structure was sortable, and the user doesn't want to
            # "forget" this order, the categorical also is sorted/ordered
            if ordered is None:
                ordered = True
        except TypeError:
            codes, categories = factorize(values, sort=False)
            if ordered:
                # raise, as we don't have a sortable data structure and so the user should
                # give us one by specifying categories
                raise TypeError("'values' is not ordered, please explicitly specify the "
                                "categories order by passing in a categories argument.")
    else:
        # there were two ways if categories are present
        # - the old one, where each value is an int pointer to the levels array -> no longer
        #   possible, but code outside of pandas could call us like that, so make some checks
        # - the new one, where each value is also in the categories array (or np.nan)

        # make sure that we always have the same type here, no matter what we get passed in
        categories = self._validate_categories(categories)

        codes = _get_codes_for_values(values, categories)

        # TODO: check for old style usage. These warnings should be removed after 0.18/ in 2016
        if com.is_integer_dtype(values) and not com.is_integer_dtype(categories):
            warn("Values and categories have different dtypes. Did you mean to use\n"
                 "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

        if com.is_integer_dtype(values) and (codes == -1).all():
            warn("None of the categories were found in values. Did you mean to use\n"
                 "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

        # if we got categories, we can assume that the order is intended
        # if ordered is unspecified
        if ordered is None:
            ordered = True

    # ordered can still be None here (unsortable values, no categories given)
    self.ordered = False if ordered is None else ordered
    self._codes = codes
    self.categories = categories
    self.name = name
def factorize(self, sort=False, na_sentinel=-1):
    """
    Factorize this object by delegating to ``algorithms.factorize``.

    ``sort`` and ``na_sentinel`` are forwarded unchanged, and the
    return value is whatever ``algorithms.factorize`` returns.
    """
    return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
def test_warn(self):
    """Passing the ``order`` keyword to factorize emits a FutureWarning."""
    ser = Series([1, 2, 3])
    warning_check = tm.assert_produces_warning(FutureWarning)
    with warning_check:
        algos.factorize(ser, order='A')
def factorize(self, sort: bool = False, na_sentinel: int | None = -1):
    """
    Factorize this object by delegating to ``algorithms.factorize``.

    ``sort`` and ``na_sentinel`` are forwarded unchanged, and the
    return value is whatever ``algorithms.factorize`` returns.
    """
    return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False):
    """
    Initialise the categorical from values and optional levels.

    Parameters
    ----------
    values : list-like
        The values to encode. With ``fastpath=True`` they are stored
        directly as the codes, without any checking.
    levels : optional
        The levels. When omitted they are obtained by factorizing
        ``values``, or taken from ``values`` if it is already categorical.
    ordered : bool, optional
        When ``None``, set to True if ``values`` could be factorized with
        sorting or if ``levels`` were given.
    name : optional
        Defaults to ``values.name`` when that attribute exists.
    fastpath : bool, default False
        Store the arguments as-is and return immediately.
    compat : bool, default False
        When True and ``values`` are integers within ``[-1, len(levels))``,
        use ``values`` directly as codes and emit a ``FutureWarning``.

    Raises
    ------
    TypeError
        If ``ordered`` is truthy but ``values`` cannot be sorted.
    """

    if fastpath:
        # fast path
        self._codes = values
        self.name = name
        self.levels = levels
        self.ordered = ordered
        return

    if name is None:
        name = getattr(values, 'name', None)

    # sanitize input
    if com.is_categorical_dtype(values):

        # we are either a Series or a Categorical
        cat = values
        if isinstance(values, com.ABCSeries):
            cat = values.values
        if levels is None:
            levels = cat.levels
        if ordered is None:
            ordered = cat.ordered
        values = values.__array__()

    elif isinstance(values, Index):
        # an Index is used as-is
        pass

    else:

        # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
        # which is fine, but since factorize does this correctly no need here
        # this is an issue because _sanitize_array also coerces np.nan to a string
        # under certain versions of numpy as well
        values = com._possibly_infer_to_datetimelike(values, convert_dates=True)
        if not isinstance(values, np.ndarray):
            values = _convert_to_list_like(values)
            from pandas.core.series import _sanitize_array
            # On list with NaNs, int values will be converted to float. Use "object" dtype
            # to prevent this. In the end objects will be cast to int/... in the level
            # assignment step.
            dtype = 'object' if isnull(values).any() else None
            values = _sanitize_array(values, None, dtype=dtype)

    if levels is None:
        try:
            codes, levels = factorize(values, sort=True)
            # If the underlying data structure was sortable, and the user doesn't want to
            # "forget" this order, the categorical also is sorted/ordered
            if ordered is None:
                ordered = True
        except TypeError:
            codes, levels = factorize(values, sort=False)
            if ordered:
                # raise, as we don't have a sortable data structure and so the user should
                # give us one by specifying levels
                raise TypeError(
                    "'values' is not ordered, please explicitly specify the level "
                    "order by passing in a level argument.")
    else:
        # there are two ways if levels are present
        # the old one, where each value is an int pointer to the levels array
        # the new one, where each value is also in the level array (or np.nan)

        # make sure that we always have the same type here, no matter what we get passed in
        levels = self._validate_levels(levels)

        # There can be two ways: the old which passed in codes and levels directly
        # and values have to be inferred and the new one, which passes in values and levels
        # and _codes have to be inferred.

        # min and max can be higher and lower if not all levels are in the values
        if compat and (com.is_integer_dtype(values) and
                       (np.min(values) >= -1) and (np.max(values) < len(levels))):
            warn(
                "Using 'values' as codes is deprecated.\n"
                "'Categorical(... , compat=True)' is only there for historical reasons and "
                "should not be used in new code!\n"
                "See https://github.com/pydata/pandas/pull/7217", FutureWarning)
            codes = values
        else:
            codes = _get_codes_for_values(values, levels)

        # if we got levels, we can assume that the order is intended
        # if ordered is unspecified
        if ordered is None:
            ordered = True

    # ordered can still be None here (unsortable values, no levels given)
    self.ordered = False if ordered is None else ordered
    self._codes = codes
    self.levels = levels
    self.name = name
def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False):
    """
    Initialise the categorical from values and optional levels.

    Parameters
    ----------
    values : list-like
        The values to encode. With ``fastpath=True`` they are stored
        directly as the codes, without any checking.
    levels : optional
        The levels. When omitted they are obtained by factorizing
        ``values``, or taken from ``values`` if it is already categorical.
    ordered : bool, optional
        When ``None``, set to True if ``values`` could be factorized with
        sorting or if ``levels`` were given.
    name : optional
        Defaults to ``values.name`` when that attribute exists.
    fastpath : bool, default False
        Store the arguments as-is and return immediately.
    compat : bool, default False
        When True and ``values`` are integers within ``[-1, len(levels))``,
        use ``values`` directly as codes and emit a ``FutureWarning``.

    Raises
    ------
    TypeError
        If ``ordered`` is truthy but ``values`` cannot be sorted.
    """

    if fastpath:
        # fast path
        self._codes = values
        self.name = name
        self.levels = levels
        self.ordered = ordered
        return

    if name is None:
        name = getattr(values, 'name', None)

    # sanitize input
    if com.is_categorical_dtype(values):

        # we are either a Series or a Categorical
        cat = values
        if isinstance(values, com.ABCSeries):
            cat = values.values
        if levels is None:
            levels = cat.levels
        if ordered is None:
            ordered = cat.ordered
        values = values.__array__()

    elif isinstance(values, Index):
        # an Index is used as-is
        pass

    else:

        # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
        # which is fine, but since factorize does this correctly no need here
        # this is an issue because _sanitize_array also coerces np.nan to a string
        # under certain versions of numpy as well
        values = com._possibly_infer_to_datetimelike(values)
        if not isinstance(values, np.ndarray):
            values = _convert_to_list_like(values)
            from pandas.core.series import _sanitize_array
            # On list with NaNs, int values will be converted to float. Use "object" dtype
            # to prevent this. In the end objects will be cast to int/... in the level
            # assignment step.
            dtype = 'object' if com.isnull(values).any() else None
            values = _sanitize_array(values, None, dtype=dtype)

    if levels is None:
        try:
            codes, levels = factorize(values, sort=True)
            # If the underlying data structure was sortable, and the user doesn't want to
            # "forget" this order, the categorical also is sorted/ordered
            if ordered is None:
                ordered = True
        except TypeError:
            codes, levels = factorize(values, sort=False)
            if ordered:
                # raise, as we don't have a sortable data structure and so the user should
                # give us one by specifying levels
                raise TypeError("'values' is not ordered, please explicitly specify the level "
                                "order by passing in a level argument.")
    else:
        # there are two ways if levels are present
        # the old one, where each value is an int pointer to the levels array
        # the new one, where each value is also in the level array (or np.nan)

        # make sure that we always have the same type here, no matter what we get passed in
        levels = self._validate_levels(levels)

        # There can be two ways: the old which passed in codes and levels directly
        # and values have to be inferred and the new one, which passes in values and levels
        # and _codes have to be inferred.

        # min and max can be higher and lower if not all levels are in the values
        if compat and (com.is_integer_dtype(values) and
                       (np.min(values) >= -1) and (np.max(values) < len(levels))):
            warn("Using 'values' as codes is deprecated.\n"
                 "'Categorical(... , compat=True)' is only there for historical reasons and "
                 "should not be used in new code!\n"
                 "See https://github.com/pydata/pandas/pull/7217", FutureWarning)
            codes = values
        else:
            codes = _get_codes_for_values(values, levels)

        # if we got levels, we can assume that the order is intended
        # if ordered is unspecified
        if ordered is None:
            ordered = True

    # ordered can still be None here (unsortable values, no levels given)
    self.ordered = False if ordered is None else ordered
    self._codes = codes
    self.levels = levels
    self.name = name