def __init__(self, values, categories=None, ordered=None, name=None,
             fastpath=False, levels=None):
    """Initialise the categorical from raw values or from precomputed codes.

    Parameters
    ----------
    values : list-like
        The data to encode. If it has a categorical dtype (a Categorical or
        a Series holding one) its ``categories`` and ``ordered`` attributes
        are reused whenever the corresponding argument is None.
        With ``fastpath=True`` it is stored unchanged as the codes.
    categories : list-like, optional
        Categories to encode ``values`` against. When None they are
        obtained by factorizing ``values``.
    ordered : bool, optional
        Ordered flag. When still None after inspecting ``values`` it becomes
        True if ``categories`` was supplied or if factorizing with
        ``sort=True`` succeeded, and False otherwise.
    name : object, optional
        Stored as ``self.name``. When None (and not on the fast path) the
        ``name`` attribute of ``values`` is used if it has one.
    fastpath : bool, default False
        Store ``values``, ``name``, ``categories`` and ``ordered`` exactly as
        passed and skip every other step.
    levels : list-like, optional
        Deprecated spelling of ``categories``; triggers a FutureWarning.

    Raises
    ------
    ValueError
        If both ``categories`` and ``levels`` are given.
    TypeError
        If ``ordered`` is truthy, no categories are given, and factorizing
        ``values`` with ``sort=True`` raises TypeError.
    """
    if fastpath:
        # Fast path: the caller vouches for the arguments, store them as-is.
        self._codes = values
        self.name = name
        self.categories = categories
        self.ordered = ordered
        return

    if name is None:
        name = getattr(values, 'name', None)

    # TODO: Remove after deprecation period in 2017/ after 0.18
    if levels is not None:
        # stacklevel=2 attributes the warning to the code constructing the
        # object instead of to this constructor.
        warn("Creating a 'Categorical' with 'levels' is deprecated, use "
             "'categories' instead", FutureWarning, stacklevel=2)
        if categories is not None:
            raise ValueError("Cannot pass in both 'categories' and "
                             "(deprecated) 'levels', use only 'categories'")
        categories = levels

    # sanitize input
    if com.is_categorical_dtype(values):
        # we are either a Series or a Categorical
        cat = values
        if isinstance(values, com.ABCSeries):
            cat = values.values
        if categories is None:
            categories = cat.categories
        if ordered is None:
            ordered = cat.ordered
        values = values.__array__()
    elif isinstance(values, Index):
        pass
    else:
        # on numpy < 1.6 datetimelike get inferred to all i8 by
        # _sanitize_array which is fine, but since factorize does this
        # correctly no need here. This is an issue because _sanitize_array
        # also coerces np.nan to a string under certain versions of numpy
        # as well.
        values = com._possibly_infer_to_datetimelike(values,
                                                     convert_dates=True)
        if not isinstance(values, np.ndarray):
            values = _convert_to_list_like(values)
            from pandas.core.series import _sanitize_array
            # On list with NaNs, int values will be converted to float. Use
            # "object" dtype to prevent this. In the end objects will be
            # casted to int/... in the category assignment step.
            dtype = 'object' if isnull(values).any() else None
            values = _sanitize_array(values, None, dtype=dtype)

    if categories is None:
        try:
            codes, categories = factorize(values, sort=True)
        except TypeError:
            codes, categories = factorize(values, sort=False)
            if ordered:
                # raise, as we don't have a sortable data structure and so
                # the user should give us one by specifying categories
                raise TypeError("'values' is not ordered, please explicitly "
                                "specify the categories order by passing in "
                                "a categories argument.")
        else:
            # If the underlying data structure was sortable, and the user
            # doesn't want to "forget" this order, the categorical also is
            # sorted/ordered
            if ordered is None:
                ordered = True
    else:
        # there were two ways if categories are present
        # - the old one, where each value is a int pointer to the levels
        #   array -> not anymore possible, but code outside of pandas could
        #   call us like that, so make some checks
        # - the new one, where each value is also in the categories array
        #   (or np.nan)

        # make sure that we always have the same type here, no matter what
        # we get passed in
        categories = self._validate_categories(categories)
        codes = _get_codes_for_values(values, categories)

        # TODO: check for old style usage. These warnings should be removed
        # after 0.18/ in 2016
        values_are_ints = com.is_integer_dtype(values)
        if values_are_ints and not com.is_integer_dtype(categories):
            warn("Values and categories have different dtypes. "
                 "Did you mean to use\n"
                 "'Categorical.from_codes(codes, categories)'?",
                 RuntimeWarning, stacklevel=2)

        # ``.all()`` of an empty array is True, so require at least one
        # value; otherwise empty integer input would warn spuriously.
        if len(values) and values_are_ints and (codes == -1).all():
            warn("None of the categories were found in values. "
                 "Did you mean to use\n"
                 "'Categorical.from_codes(codes, categories)'?",
                 RuntimeWarning, stacklevel=2)

        # if we got categories, we can assume that the order is intended
        # if ordered is unspecified
        if ordered is None:
            ordered = True

    self.ordered = False if ordered is None else ordered
    self._codes = codes
    self.categories = categories
    self.name = name
def __init__(self, values, levels=None, ordered=None, name=None,
             fastpath=False, compat=False):
    """Initialise the categorical from raw values or from precomputed codes.

    Parameters
    ----------
    values : list-like
        The data to encode. If it has a categorical dtype (a Categorical or
        a Series holding one) its ``levels`` and ``ordered`` attributes are
        reused whenever the corresponding argument is None.
        With ``fastpath=True`` it is stored unchanged as the codes.
    levels : list-like, optional
        Levels to encode ``values`` against. When None they are obtained by
        factorizing ``values``.
    ordered : bool, optional
        Ordered flag. When still None after inspecting ``values`` it becomes
        True if ``levels`` was supplied or if factorizing with ``sort=True``
        succeeded, and False otherwise.
    name : object, optional
        Stored as ``self.name``. When None (and not on the fast path) the
        ``name`` attribute of ``values`` is used if it has one.
    fastpath : bool, default False
        Store ``values``, ``name``, ``levels`` and ``ordered`` exactly as
        passed and skip every other step.
    compat : bool, default False
        Deprecated. If True, ``levels`` is given and ``values`` is a
        non-empty integer array whose entries all lie in
        ``[-1, len(levels))``, then ``values`` is used directly as the codes
        and a FutureWarning is issued.

    Raises
    ------
    TypeError
        If ``ordered`` is truthy, no levels are given, and factorizing
        ``values`` with ``sort=True`` raises TypeError.
    """
    if fastpath:
        # Fast path: the caller vouches for the arguments, store them as-is.
        self._codes = values
        self.name = name
        self.levels = levels
        self.ordered = ordered
        return

    if name is None:
        name = getattr(values, 'name', None)

    # sanitize input
    if com.is_categorical_dtype(values):
        # we are either a Series or a Categorical
        cat = values
        if isinstance(values, com.ABCSeries):
            cat = values.values
        if levels is None:
            levels = cat.levels
        if ordered is None:
            ordered = cat.ordered
        values = values.__array__()
    elif isinstance(values, Index):
        pass
    else:
        # on numpy < 1.6 datetimelike get inferred to all i8 by
        # _sanitize_array which is fine, but since factorize does this
        # correctly no need here. This is an issue because _sanitize_array
        # also coerces np.nan to a string under certain versions of numpy
        # as well.
        values = com._possibly_infer_to_datetimelike(values,
                                                     convert_dates=True)
        if not isinstance(values, np.ndarray):
            values = _convert_to_list_like(values)
            from pandas.core.series import _sanitize_array
            # On list with NaNs, int values will be converted to float. Use
            # "object" dtype to prevent this. In the end objects will be
            # casted to int/... in the level assignment step.
            dtype = 'object' if isnull(values).any() else None
            values = _sanitize_array(values, None, dtype=dtype)

    if levels is None:
        try:
            codes, levels = factorize(values, sort=True)
        except TypeError:
            codes, levels = factorize(values, sort=False)
            if ordered:
                # raise, as we don't have a sortable data structure and so
                # the user should give us one by specifying levels
                raise TypeError("'values' is not ordered, please explicitly "
                                "specify the level order by passing in a "
                                "level argument.")
        else:
            # If the underlying data structure was sortable, and the user
            # doesn't want to "forget" this order, the categorical also is
            # sorted/ordered
            if ordered is None:
                ordered = True
    else:
        # there are two ways if levels are present
        # the old one, where each value is a int pointer to the levels array
        # the new one, where each value is also in the level array (or
        # np.nan)

        # make sure that we always have the same type here, no matter what
        # we get passed in
        levels = self._validate_levels(levels)

        # There can be two ways: the old which passed in codes and levels
        # directly and values have to be inferred and the new one, which
        # passes in values and levels and _codes have to be inferred.

        # min and max can be higher and lower if not all levels are in the
        # values. np.min/np.max raise ValueError on zero-size input, so an
        # empty array is never treated as old-style codes and goes through
        # the regular lookup instead.
        values_are_codes = (compat
                            and com.is_integer_dtype(values)
                            and len(values) > 0
                            and np.min(values) >= -1
                            and np.max(values) < len(levels))
        if values_are_codes:
            # stacklevel=2 attributes the warning to the code constructing
            # the object instead of to this constructor.
            warn("Using 'values' as codes is deprecated.\n"
                 "'Categorical(... , compat=True)' is only there for "
                 "historical reasons and should not be used in new code!\n"
                 "See https://github.com/pydata/pandas/pull/7217",
                 FutureWarning, stacklevel=2)
            codes = values
        else:
            codes = _get_codes_for_values(values, levels)

        # if we got levels, we can assume that the order is intended
        # if ordered is unspecified
        if ordered is None:
            ordered = True

    self.ordered = False if ordered is None else ordered
    self._codes = codes
    self.levels = levels
    self.name = name
def __init__(self, values, levels=None, ordered=None, name=None,
             fastpath=False, compat=False):
    """Initialise the categorical from raw values or from precomputed codes.

    Parameters
    ----------
    values : list-like
        The data to encode. If it has a categorical dtype (a Categorical or
        a Series holding one) its ``levels`` and ``ordered`` attributes are
        reused whenever the corresponding argument is None.
        With ``fastpath=True`` it is stored unchanged as the codes.
    levels : list-like, optional
        Levels to encode ``values`` against. When None they are obtained by
        factorizing ``values``.
    ordered : bool, optional
        Ordered flag. When still None after inspecting ``values`` it becomes
        True if ``levels`` was supplied or if factorizing with ``sort=True``
        succeeded, and False otherwise.
    name : object, optional
        Stored as ``self.name``. When None (and not on the fast path) the
        ``name`` attribute of ``values`` is used if it has one.
    fastpath : bool, default False
        Store ``values``, ``name``, ``levels`` and ``ordered`` exactly as
        passed and skip every other step.
    compat : bool, default False
        Deprecated. If True, ``levels`` is given and ``values`` is a
        non-empty integer array whose entries all lie in
        ``[-1, len(levels))``, then ``values`` is used directly as the codes
        and a FutureWarning is issued.

    Raises
    ------
    TypeError
        If ``ordered`` is truthy, no levels are given, and factorizing
        ``values`` with ``sort=True`` raises TypeError.
    """
    if fastpath:
        # Fast path: the caller vouches for the arguments, store them as-is.
        self._codes = values
        self.name = name
        self.levels = levels
        self.ordered = ordered
        return

    if name is None:
        name = getattr(values, 'name', None)

    # sanitize input
    if com.is_categorical_dtype(values):
        # we are either a Series or a Categorical
        cat = values
        if isinstance(values, com.ABCSeries):
            cat = values.values
        if levels is None:
            levels = cat.levels
        if ordered is None:
            ordered = cat.ordered
        values = values.__array__()
    elif isinstance(values, Index):
        pass
    else:
        # on numpy < 1.6 datetimelike get inferred to all i8 by
        # _sanitize_array which is fine, but since factorize does this
        # correctly no need here. This is an issue because _sanitize_array
        # also coerces np.nan to a string under certain versions of numpy
        # as well.
        values = com._possibly_infer_to_datetimelike(values)
        if not isinstance(values, np.ndarray):
            values = _convert_to_list_like(values)
            from pandas.core.series import _sanitize_array
            # On list with NaNs, int values will be converted to float. Use
            # "object" dtype to prevent this. In the end objects will be
            # casted to int/... in the level assignment step.
            dtype = 'object' if com.isnull(values).any() else None
            values = _sanitize_array(values, None, dtype=dtype)

    if levels is None:
        try:
            codes, levels = factorize(values, sort=True)
        except TypeError:
            codes, levels = factorize(values, sort=False)
            if ordered:
                # raise, as we don't have a sortable data structure and so
                # the user should give us one by specifying levels
                raise TypeError("'values' is not ordered, please explicitly "
                                "specify the level order by passing in a "
                                "level argument.")
        else:
            # If the underlying data structure was sortable, and the user
            # doesn't want to "forget" this order, the categorical also is
            # sorted/ordered
            if ordered is None:
                ordered = True
    else:
        # there are two ways if levels are present
        # the old one, where each value is a int pointer to the levels array
        # the new one, where each value is also in the level array (or
        # np.nan)

        # make sure that we always have the same type here, no matter what
        # we get passed in
        levels = self._validate_levels(levels)

        # There can be two ways: the old which passed in codes and levels
        # directly and values have to be inferred and the new one, which
        # passes in values and levels and _codes have to be inferred.

        # min and max can be higher and lower if not all levels are in the
        # values. np.min/np.max raise ValueError on zero-size input, so an
        # empty array is never treated as old-style codes and goes through
        # the regular lookup instead.
        values_are_codes = (compat
                            and com.is_integer_dtype(values)
                            and len(values) > 0
                            and np.min(values) >= -1
                            and np.max(values) < len(levels))
        if values_are_codes:
            # stacklevel=2 attributes the warning to the code constructing
            # the object instead of to this constructor.
            warn("Using 'values' as codes is deprecated.\n"
                 "'Categorical(... , compat=True)' is only there for "
                 "historical reasons and should not be used in new code!\n"
                 "See https://github.com/pydata/pandas/pull/7217",
                 FutureWarning, stacklevel=2)
            codes = values
        else:
            codes = _get_codes_for_values(values, levels)

        # if we got levels, we can assume that the order is intended
        # if ordered is unspecified
        if ordered is None:
            ordered = True

    self.ordered = False if ordered is None else ordered
    self._codes = codes
    self.levels = levels
    self.name = name