Example #1
0
def guess_categorical(data):
    if safe_is_pandas_categorical(data):
        return True
    if isinstance(data, _CategoricalBox):
        return True
    data = np.asarray(data)
    if safe_issubdtype(data.dtype, np.number):
        return False
    return True
Example #2
0
def guess_categorical(data):
    if safe_is_pandas_categorical(data):
        return True
    if isinstance(data, _CategoricalBox):
        return True
    data = np.asarray(data)
    if safe_issubdtype(data.dtype, np.number):
        return False
    return True
Example #3
0
    def sniff(self, data):
        if hasattr(data, "contrast"):
            self._contrast = data.contrast
        # returns a bool: are we confident that we found all the levels?
        if isinstance(data, _CategoricalBox):
            if data.levels is not None:
                self._levels = tuple(data.levels)
                return True
            else:
                # unbox and fall through
                data = data.data
        if safe_is_pandas_categorical(data):
            # pandas.Categorical has its own NA detection, so don't try to
            # second-guess it.
            self._levels = tuple(pandas_Categorical_categories(data))
            return True
        # fastpath to avoid doing an item-by-item iteration over boolean
        # arrays, as requested by #44
        if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
            self._level_set = set([True, False])
            return True

        data = _categorical_shape_fix(data)

        for value in data:
            if self._NA_action.is_categorical_NA(value):
                continue
            if value is True or value is False:
                self._level_set.update([True, False])
            else:
                try:
                    self._level_set.add(value)
                except TypeError:
                    raise PatsyError("Error interpreting categorical data: "
                                     "all items must be hashable",
                                     self._origin)
        # If everything we've seen is boolean, assume that everything else
        # would be too. Otherwise we need to keep looking.
        return self._level_set == set([True, False])
Example #4
0
    def sniff(self, data):
        if hasattr(data, "contrast"):
            self._contrast = data.contrast
        # returns a bool: are we confident that we found all the levels?
        if isinstance(data, _CategoricalBox):
            if data.levels is not None:
                self._levels = tuple(data.levels)
                return True
            else:
                # unbox and fall through
                data = data.data
        if safe_is_pandas_categorical(data):
            # pandas.Categorical has its own NA detection, so don't try to
            # second-guess it.
            self._levels = tuple(pandas_Categorical_categories(data))
            return True
        # fastpath to avoid doing an item-by-item iteration over boolean
        # arrays, as requested by #44
        if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
            self._level_set = set([True, False])
            return True

        data = _categorical_shape_fix(data)

        for value in data:
            if self._NA_action.is_categorical_NA(value):
                continue
            if value is True or value is False:
                self._level_set.update([True, False])
            else:
                try:
                    self._level_set.add(value)
                except TypeError:
                    raise PatsyError(
                        "Error interpreting categorical data: "
                        "all items must be hashable", self._origin)
        # If everything we've seen is boolean, assume that everything else
        # would be too. Otherwise we need to keep looking.
        return self._level_set == set([True, False])
Example #5
0
def categorical_to_int(data, levels, NA_action, origin=None):
    assert isinstance(levels, tuple)
    # In this function, missing values are always mapped to -1

    if safe_is_pandas_categorical(data):
        data_levels_tuple = tuple(pandas_Categorical_categories(data))
        if not data_levels_tuple == levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, data_levels_tuple), origin)
        # pandas.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        return pandas_Categorical_codes(data)

    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, tuple(data.levels)), origin)
        data = data.data

    data = _categorical_shape_fix(data)

    try:
        level_to_int = dict(zip(levels, range(len(levels))))
    except TypeError:
        raise PatsyError("Error interpreting categorical data: "
                         "all items must be hashable", origin)

    # fastpath to avoid doing an item-by-item iteration over boolean arrays,
    # as requested by #44
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        if level_to_int[False] == 0 and level_to_int[True] == 1:
            return data.astype(np.int_)
    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                SHOW_LEVELS = 4
                level_strs = []
                if len(levels) <= SHOW_LEVELS:
                    level_strs += [repr(level) for level in levels]
                else:
                    level_strs += [repr(level)
                                   for level in levels[:SHOW_LEVELS//2]]
                    level_strs.append("...")
                    level_strs += [repr(level)
                                   for level in levels[-SHOW_LEVELS//2:]]
                level_str = "[%s]" % (", ".join(level_strs))
                raise PatsyError("Error converting data to categorical: "
                                 "observation with value %r does not match "
                                 "any of the expected levels (expected: %s)"
                                 % (value, level_str), origin)
            except TypeError:
                raise PatsyError("Error converting data to categorical: "
                                 "encountered unhashable value %r"
                                 % (value,), origin)
    if have_pandas and isinstance(data, pandas.Series):
        out = pandas.Series(out, index=data.index)
    return out
Example #6
0
def categorical_to_int(data, levels, NA_action, origin=None):
    assert isinstance(levels, tuple)
    # In this function, missing values are always mapped to -1

    if safe_is_pandas_categorical(data):
        data_levels_tuple = tuple(pandas_Categorical_categories(data))
        if not data_levels_tuple == levels:
            raise PatsyError(
                "mismatching levels: expected %r, got %r" %
                (levels, data_levels_tuple), origin)
        # pandas.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        return pandas_Categorical_codes(data)

    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError(
                "mismatching levels: expected %r, got %r" %
                (levels, tuple(data.levels)), origin)
        data = data.data

    data = _categorical_shape_fix(data)

    try:
        level_to_int = dict(zip(levels, range(len(levels))))
    except TypeError:
        raise PatsyError(
            "Error interpreting categorical data: "
            "all items must be hashable", origin)

    # fastpath to avoid doing an item-by-item iteration over boolean arrays,
    # as requested by #44
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        if level_to_int[False] == 0 and level_to_int[True] == 1:
            return data.astype(np.int_)
    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                SHOW_LEVELS = 4
                level_strs = []
                if len(levels) <= SHOW_LEVELS:
                    level_strs += [repr(level) for level in levels]
                else:
                    level_strs += [
                        repr(level) for level in levels[:SHOW_LEVELS // 2]
                    ]
                    level_strs.append("...")
                    level_strs += [
                        repr(level) for level in levels[-SHOW_LEVELS // 2:]
                    ]
                level_str = "[%s]" % (", ".join(level_strs))
                raise PatsyError(
                    "Error converting data to categorical: "
                    "observation with value %r does not match "
                    "any of the expected levels (expected: %s)" %
                    (value, level_str), origin)
            except TypeError:
                raise PatsyError(
                    "Error converting data to categorical: "
                    "encountered unhashable value %r" % (value, ), origin)
    if have_pandas and isinstance(data, pandas.Series):
        out = pandas.Series(out, index=data.index)
    return out