def _has_bool_dtype(data):
    """Return True if *data* is array-like with a NumPy boolean dtype."""
    if not (hasattr(data, 'dtype') and hasattr(data, 'astype')):
        return False
    try:
        return bool(np.issubdtype(data.dtype, np.bool_))
    except TypeError:
        # np.issubdtype raises TypeError for dtypes it cannot interpret
        # (e.g. non-NumPy dtypes); such data is not a boolean array.
        return False


def categorical_to_int(data, levels, NA_action, origin=None):
    """Convert categorical data into integer codes indexing into ``levels``.

    Missing values are always encoded as -1.

    :arg data: The categorical data. May be a ``pd.Categorical``, a
      ``_CategoricalBox``, a boolean array-like, or any 1-dimensional
      iterable container of hashable values (but not a string).
    :arg levels: Tuple of the expected levels; the code assigned to a value
      is the position of that value in this tuple.
    :arg NA_action: Object whose ``is_categorical_NA(value)`` method decides
      whether an individual value counts as missing.
    :arg origin: Passed through to every :class:`PatsyError` raised here.
    :returns: The integer codes. For a ``pd.Categorical`` this is its own
      ``labels``; for a ``pd.Series`` it is a ``pd.Series`` sharing the
      input's index; otherwise a NumPy integer array (the boolean fast path
      may return whatever ``data.astype('int')`` produces).
    :raises PatsyError: If the levels carried by ``data`` do not match
      ``levels``, if ``data`` is not a 1-dimensional iterable container, if
      a level or value is unhashable, or if a value is not one of ``levels``.
    """
    assert isinstance(levels, tuple)
    # In this function, missing values are always mapped to -1
    if isinstance(data, pd.Categorical):
        # NOTE(review): .levels/.labels are the older pandas attribute names
        # (newer releases call them .categories/.codes) -- confirm which
        # pandas versions this file needs to support.
        data_levels_tuple = tuple(data.levels)
        if not data_levels_tuple == levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, data_levels_tuple), origin)
        # pd.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        return data.labels
    if _has_bool_dtype(data):
        # Fast path that avoids item-by-item iteration over boolean arrays.
        # It must honour `levels`: a plain astype(int) is only correct when
        # False maps to code 0 and True maps to code 1.
        try:
            bool_codes = dict((level, i) for i, level in enumerate(levels))
            false_code = bool_codes[False]
            true_code = bool_codes[True]
        except (TypeError, KeyError):
            # Unhashable levels, or levels that lack True/False: fall
            # through so the generic path below handles (or reports) it.
            pass
        else:
            if false_code == 0 and true_code == 1:
                return data.astype('int')
            codes = np.where(np.asarray(data), true_code, false_code)
            if isinstance(data, pd.Series):
                codes = pd.Series(codes, index=data.index)
            return codes
    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, tuple(data.levels)), origin)
        data = data.data
    if hasattr(data, "shape") and len(data.shape) > 1:
        raise PatsyError("categorical data must be 1-dimensional", origin)
    if not iterable(data) or isinstance(data, basestring):
        raise PatsyError("categorical data must be an iterable container",
                         origin)
    try:
        level_to_int = dict((level, i) for i, level in enumerate(levels))
    except TypeError:
        raise PatsyError("Error interpreting categorical data: "
                         "all items must be hashable", origin)
    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                # Build a short preview of the expected levels for the
                # error message, eliding the middle of long level lists.
                SHOW_LEVELS = 4
                level_strs = []
                if len(levels) <= SHOW_LEVELS:
                    level_strs += [repr(level) for level in levels]
                else:
                    level_strs += [repr(level)
                                   for level in levels[:SHOW_LEVELS//2]]
                    level_strs.append("...")
                    level_strs += [repr(level)
                                   for level in levels[-SHOW_LEVELS//2:]]
                level_str = "[%s]" % (", ".join(level_strs))
                raise PatsyError("Error converting data to categorical: "
                                 "observation with value %r does not match "
                                 "any of the expected levels (expected: %s)"
                                 % (value, level_str), origin)
            except TypeError:
                raise PatsyError("Error converting data to categorical: "
                                 "encountered unhashable value %r"
                                 % (value,), origin)
    if isinstance(data, pd.Series):
        out = pd.Series(out, index=data.index)
    return out
def _categorical_shape_fix(data):
    """Normalize the shape of raw categorical data.

    ``data`` is expected to be the underlying data itself (not a
    _CategoricalBox or pandas Categorical), possibly with the wrong shape.
    Data with more than one dimension is rejected; a scalar (including a
    single text or byte string) is wrapped in a one-element list; anything
    else is returned unchanged.
    """
    too_many_dims = hasattr(data, "ndim") and data.ndim > 1
    if too_many_dims:
        raise PatsyError("categorical data cannot be >1-dimensional")
    # Scalars are promoted to 1d so that categorical factors behave the same
    # way numeric factors do. (See statsmodels/statsmodels#1881)
    string_types = (six.text_type, six.binary_type)
    is_scalar = not iterable(data) or isinstance(data, string_types)
    return [data] if is_scalar else data