Example #1
0
    def __new__(cls,
                input_array,
                design_info=None,
                default_column_prefix="column"):
        """Create a DesignMatrix, or cast an existing matrix to a DesignMatrix.

        A call like::

          DesignMatrix(my_array)

        will convert an arbitrary array_like object into a DesignMatrix.

        The return from this function is guaranteed to be a two-dimensional
        ndarray with a real-valued floating point dtype, and a
        ``.design_info`` attribute which matches its shape. If the
        `design_info` argument is not given, then one is created via
        :meth:`DesignInfo.from_array` using the given
        `default_column_prefix`.

        Depending on the input array, it is possible this will pass through
        its input unchanged, or create a view.
        """
        # Pass through existing DesignMatrixes. The design_info check is
        # necessary because numpy is sort of annoying and cannot be stopped
        # from turning non-design-matrix arrays into DesignMatrix
        # instances. (E.g., my_dm.diagonal() will return a DesignMatrix
        # object, but one without a design_info attribute.)
        if (isinstance(input_array, DesignMatrix)
                and hasattr(input_array, "design_info")):
            return input_array
        self = atleast_2d_column_default(input_array).view(cls)
        # Upcast integer to floating point
        if safe_issubdtype(self.dtype, np.integer):
            self = np.asarray(self, dtype=float).view(cls)
        if self.ndim > 2:
            raise ValueError("DesignMatrix must be 2d")
        assert self.ndim == 2
        if design_info is None:
            design_info = DesignInfo.from_array(self, default_column_prefix)
        if len(design_info.column_names) != self.shape[1]:
            raise ValueError("wrong number of column names for design matrix "
                             "(got %s, wanted %s)" %
                             (len(design_info.column_names), self.shape[1]))
        self.design_info = design_info
        if not safe_issubdtype(self.dtype, np.floating):
            raise ValueError(
                "design matrix must be real-valued floating point")
        return self
Example #2
0
def _eval_factor(factor_info, data, NA_action):
    factor = factor_info.factor
    result = factor.eval(factor_info.state, data)
    # Returns either a 2d ndarray, or a DataFrame, plus is_NA mask
    if factor_info.type == "numerical":
        result = atleast_2d_column_default(result, preserve_pandas=True)
        _max_allowed_dim(2, result, factor)
        if result.shape[1] != factor_info.num_columns:
            raise PatsyError("when evaluating factor %s, I got %s columns "
                                "instead of the %s I was expecting"
                                % (factor.name(),
                                   factor_info.num_columns,
                                   result.shape[1]),
                                factor)
        if not safe_issubdtype(np.asarray(result).dtype, np.number):
            raise PatsyError("when evaluating numeric factor %s, "
                             "I got non-numeric data of type '%s'"
                             % (factor.name(), result.dtype),
                             factor)
        return result, NA_action.is_numerical_NA(result)
    # returns either a 1d ndarray or a pandas.Series, plus is_NA mask
    else:
        assert factor_info.type == "categorical"
        result = categorical_to_int(result, factor_info.categories, NA_action,
                                    origin=factor_info.factor)
        assert result.ndim == 1
        return result, np.asarray(result == -1)
Example #3
0
    def from_array(cls, array_like, default_column_prefix="column"):
        """Find or construct a DesignInfo appropriate for a given array_like.

        If the input `array_like` already has a ``.design_info``
        attribute, then it will be returned. Otherwise, a new DesignInfo
        object will be constructed, using names either taken from the
        `array_like` (e.g., for a pandas DataFrame with named columns), or
        constructed using `default_column_prefix`.

        This is how :func:`dmatrix` (for example) creates a DesignInfo object
        if an arbitrary matrix is passed in.

        :arg array_like: An ndarray or pandas container.
        :arg default_column_prefix: If it's necessary to invent column names,
          then this will be used to construct them.
        :returns: a DesignInfo object
        """
        if hasattr(array_like, "design_info") and isinstance(array_like.design_info, cls):
            return array_like.design_info
        arr = atleast_2d_column_default(array_like, preserve_pandas=True)
        if arr.ndim > 2:
            raise ValueError("design matrix can't have >2 dimensions")
        columns = getattr(arr, "columns", range(arr.shape[1]))
        if (hasattr(columns, "dtype")
            and not safe_issubdtype(columns.dtype, np.integer)):
            column_names = [str(obj) for obj in columns]
        else:
            column_names = ["%s%s" % (default_column_prefix, i)
                            for i in columns]
        return DesignInfo(column_names)
Example #4
0
    def from_array(cls, array_like, default_column_prefix="column"):
        """Find or construct a DesignInfo appropriate for a given array_like.

        If the input `array_like` already has a ``.design_info``
        attribute, then it will be returned. Otherwise, a new DesignInfo
        object will be constructed, using names either taken from the
        `array_like` (e.g., for a pandas DataFrame with named columns), or
        constructed using `default_column_prefix`.

        This is how :func:`dmatrix` (for example) creates a DesignInfo object
        if an arbitrary matrix is passed in.

        :arg array_like: An ndarray or pandas container.
        :arg default_column_prefix: If it's necessary to invent column names,
          then this will be used to construct them.
        :returns: a DesignInfo object
        """
        if hasattr(array_like, "design_info") and isinstance(array_like.design_info, cls):
            return array_like.design_info
        arr = atleast_2d_column_default(array_like, preserve_pandas=True)
        if arr.ndim > 2:
            raise ValueError("design matrix can't have >2 dimensions")
        columns = getattr(arr, "columns", range(arr.shape[1]))
        if (hasattr(columns, "dtype")
            and not safe_issubdtype(columns.dtype, np.integer)):
            column_names = [str(obj) for obj in columns]
        else:
            column_names = ["%s%s" % (default_column_prefix, i)
                            for i in columns]
        return DesignInfo(column_names)
Example #5
0
    def __new__(cls, input_array, design_info=None,
                default_column_prefix="column"):
        """Create a DesignMatrix, or cast an existing matrix to a DesignMatrix.

        A call like::

          DesignMatrix(my_array)

        will convert an arbitrary array_like object into a DesignMatrix.

        The return from this function is guaranteed to be a two-dimensional
        ndarray with a real-valued floating point dtype, and a
        ``.design_info`` attribute which matches its shape. If the
        `design_info` argument is not given, then one is created via
        :meth:`DesignInfo.from_array` using the given
        `default_column_prefix`.

        Depending on the input array, it is possible this will pass through
        its input unchanged, or create a view.
        """
        # Pass through existing DesignMatrixes. The design_info check is
        # necessary because numpy is sort of annoying and cannot be stopped
        # from turning non-design-matrix arrays into DesignMatrix
        # instances. (E.g., my_dm.diagonal() will return a DesignMatrix
        # object, but one without a design_info attribute.)
        if (isinstance(input_array, DesignMatrix)
            and hasattr(input_array, "design_info")):
            return input_array
        self = atleast_2d_column_default(input_array).view(cls)
        # Upcast integer to floating point
        if safe_issubdtype(self.dtype, np.integer):
            self = np.asarray(self, dtype=float).view(cls)
        if self.ndim > 2:
            raise ValueError("DesignMatrix must be 2d")
        assert self.ndim == 2
        if design_info is None:
            design_info = DesignInfo.from_array(self, default_column_prefix)
        if len(design_info.column_names) != self.shape[1]:
            raise ValueError("wrong number of column names for design matrix "
                             "(got %s, wanted %s)"
                             % (len(design_info.column_names), self.shape[1]))
        self.design_info = design_info
        if not safe_issubdtype(self.dtype, np.floating):
            raise ValueError("design matrix must be real-valued floating point")
        return self
Example #6
0
def guess_categorical(data):
    if safe_is_pandas_categorical(data):
        return True
    if isinstance(data, _CategoricalBox):
        return True
    data = np.asarray(data)
    if safe_issubdtype(data.dtype, np.number):
        return False
    return True
Example #7
0
def guess_categorical(data):
    if safe_is_pandas_categorical(data):
        return True
    if isinstance(data, _CategoricalBox):
        return True
    data = np.asarray(data)
    if safe_issubdtype(data.dtype, np.number):
        return False
    return True
Example #8
0
 def transform(self, x):
     x = asarray_or_pandas(x)
     # This doesn't copy data unless our input is a DataFrame that has
     # heterogenous types. And in that case we're going to be munging the
     # types anyway, so copying isn't a big deal.
     x_arr = np.asarray(x)
     if safe_issubdtype(x_arr.dtype, np.integer):
         dt = float
     else:
         dt = x_arr.dtype
     mean_val = np.asarray(self._sum / self._count, dtype=dt)
     centered = atleast_2d_column_default(x, preserve_pandas=True) - mean_val
     return pandas_friendly_reshape(centered, x.shape)
Example #9
0
 def transform(self, x):
     x = asarray_or_pandas(x)
     # This doesn't copy data unless our input is a DataFrame that has
     # heterogeneous types. And in that case we're going to be munging the
     # types anyway, so copying isn't a big deal.
     x_arr = np.asarray(x)
     if safe_issubdtype(x_arr.dtype, np.integer):
         dt = float
     else:
         dt = x_arr.dtype
     mean_val = np.asarray(self._sum / self._count, dtype=dt)
     centered = atleast_2d_column_default(x, preserve_pandas=True) - mean_val
     return pandas_friendly_reshape(centered, x.shape)
Example #10
0
def code_contrast_matrix(intercept, levels, contrast, default=None):
    if contrast is None:
        contrast = default
    if callable(contrast):
        contrast = contrast()
    if isinstance(contrast, ContrastMatrix):
        return contrast
    as_array = np.asarray(contrast)
    if safe_issubdtype(as_array.dtype, np.number):
        return ContrastMatrix(as_array,
                              _name_levels("custom", range(as_array.shape[1])))
    if intercept:
        return contrast.code_with_intercept(levels)
    else:
        return contrast.code_without_intercept(levels)
def code_contrast_matrix(intercept, levels, contrast, default=None):
    if contrast is None:
        contrast = default
    if callable(contrast):
        contrast = contrast()
    if isinstance(contrast, ContrastMatrix):
        return contrast
    as_array = np.asarray(contrast)
    if safe_issubdtype(as_array.dtype, np.number):
        return ContrastMatrix(as_array,
                              _name_levels("custom", range(as_array.shape[1])))
    if intercept:
        return contrast.code_with_intercept(levels)
    else:
        return contrast.code_without_intercept(levels)
Example #12
0
 def eval(self, data, NA_action):
     result = self.factor.eval(self._state, data)
     result = atleast_2d_column_default(result, preserve_pandas=True)
     _max_allowed_dim(2, result, self.factor)
     if result.shape[1] != self._expected_columns:
         raise PatsyError("when evaluating factor %s, I got %s columns "
                             "instead of the %s I was expecting"
                             % (self.factor.name(), self._expected_columns,
                                result.shape[1]),
                             self.factor)
     if not safe_issubdtype(np.asarray(result).dtype, np.number):
         raise PatsyError("when evaluating numeric factor %s, "
                             "I got non-numeric data of type '%s'"
                             % (self.factor.name(), result.dtype),
                             self.factor)
     return result, NA_action.is_numerical_NA(result)
Example #13
0
    def sniff(self, data):
        if hasattr(data, "contrast"):
            self._contrast = data.contrast
        # returns a bool: are we confident that we found all the levels?
        if isinstance(data, _CategoricalBox):
            if data.levels is not None:
                self._levels = tuple(data.levels)
                return True
            else:
                # unbox and fall through
                data = data.data
        if safe_is_pandas_categorical(data):
            # pandas.Categorical has its own NA detection, so don't try to
            # second-guess it.
            self._levels = tuple(pandas_Categorical_categories(data))
            return True
        # fastpath to avoid doing an item-by-item iteration over boolean
        # arrays, as requested by #44
        if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
            self._level_set = set([True, False])
            return True

        data = _categorical_shape_fix(data)

        for value in data:
            if self._NA_action.is_categorical_NA(value):
                continue
            if value is True or value is False:
                self._level_set.update([True, False])
            else:
                try:
                    self._level_set.add(value)
                except TypeError:
                    raise PatsyError("Error interpreting categorical data: "
                                     "all items must be hashable",
                                     self._origin)
        # If everything we've seen is boolean, assume that everything else
        # would be too. Otherwise we need to keep looking.
        return self._level_set == set([True, False])
Example #14
0
    def sniff(self, data):
        if hasattr(data, "contrast"):
            self._contrast = data.contrast
        # returns a bool: are we confident that we found all the levels?
        if isinstance(data, _CategoricalBox):
            if data.levels is not None:
                self._levels = tuple(data.levels)
                return True
            else:
                # unbox and fall through
                data = data.data
        if safe_is_pandas_categorical(data):
            # pandas.Categorical has its own NA detection, so don't try to
            # second-guess it.
            self._levels = tuple(pandas_Categorical_categories(data))
            return True
        # fastpath to avoid doing an item-by-item iteration over boolean
        # arrays, as requested by #44
        if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
            self._level_set = set([True, False])
            return True

        data = _categorical_shape_fix(data)

        for value in data:
            if self._NA_action.is_categorical_NA(value):
                continue
            if value is True or value is False:
                self._level_set.update([True, False])
            else:
                try:
                    self._level_set.add(value)
                except TypeError:
                    raise PatsyError(
                        "Error interpreting categorical data: "
                        "all items must be hashable", self._origin)
        # If everything we've seen is boolean, assume that everything else
        # would be too. Otherwise we need to keep looking.
        return self._level_set == set([True, False])
Example #15
0
def categorical_to_int(data, levels, NA_action, origin=None):
    assert isinstance(levels, tuple)
    # In this function, missing values are always mapped to -1

    if safe_is_pandas_categorical(data):
        data_levels_tuple = tuple(pandas_Categorical_categories(data))
        if not data_levels_tuple == levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, data_levels_tuple), origin)
        # pandas.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        return pandas_Categorical_codes(data)

    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, tuple(data.levels)), origin)
        data = data.data

    data = _categorical_shape_fix(data)

    try:
        level_to_int = dict(zip(levels, range(len(levels))))
    except TypeError:
        raise PatsyError("Error interpreting categorical data: "
                         "all items must be hashable", origin)

    # fastpath to avoid doing an item-by-item iteration over boolean arrays,
    # as requested by #44
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        if level_to_int[False] == 0 and level_to_int[True] == 1:
            return data.astype(np.int_)
    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                SHOW_LEVELS = 4
                level_strs = []
                if len(levels) <= SHOW_LEVELS:
                    level_strs += [repr(level) for level in levels]
                else:
                    level_strs += [repr(level)
                                   for level in levels[:SHOW_LEVELS//2]]
                    level_strs.append("...")
                    level_strs += [repr(level)
                                   for level in levels[-SHOW_LEVELS//2:]]
                level_str = "[%s]" % (", ".join(level_strs))
                raise PatsyError("Error converting data to categorical: "
                                 "observation with value %r does not match "
                                 "any of the expected levels (expected: %s)"
                                 % (value, level_str), origin)
            except TypeError:
                raise PatsyError("Error converting data to categorical: "
                                 "encountered unhashable value %r"
                                 % (value,), origin)
    if have_pandas and isinstance(data, pandas.Series):
        out = pandas.Series(out, index=data.index)
    return out
Example #16
0
def categorical_to_int(data, levels, NA_action, origin=None):
    assert isinstance(levels, tuple)
    # In this function, missing values are always mapped to -1

    if safe_is_pandas_categorical(data):
        data_levels_tuple = tuple(pandas_Categorical_categories(data))
        if not data_levels_tuple == levels:
            raise PatsyError(
                "mismatching levels: expected %r, got %r" %
                (levels, data_levels_tuple), origin)
        # pandas.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        return pandas_Categorical_codes(data)

    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError(
                "mismatching levels: expected %r, got %r" %
                (levels, tuple(data.levels)), origin)
        data = data.data

    data = _categorical_shape_fix(data)

    try:
        level_to_int = dict(zip(levels, range(len(levels))))
    except TypeError:
        raise PatsyError(
            "Error interpreting categorical data: "
            "all items must be hashable", origin)

    # fastpath to avoid doing an item-by-item iteration over boolean arrays,
    # as requested by #44
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        if level_to_int[False] == 0 and level_to_int[True] == 1:
            return data.astype(np.int_)
    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                SHOW_LEVELS = 4
                level_strs = []
                if len(levels) <= SHOW_LEVELS:
                    level_strs += [repr(level) for level in levels]
                else:
                    level_strs += [
                        repr(level) for level in levels[:SHOW_LEVELS // 2]
                    ]
                    level_strs.append("...")
                    level_strs += [
                        repr(level) for level in levels[-SHOW_LEVELS // 2:]
                    ]
                level_str = "[%s]" % (", ".join(level_strs))
                raise PatsyError(
                    "Error converting data to categorical: "
                    "observation with value %r does not match "
                    "any of the expected levels (expected: %s)" %
                    (value, level_str), origin)
            except TypeError:
                raise PatsyError(
                    "Error converting data to categorical: "
                    "encountered unhashable value %r" % (value, ), origin)
    if have_pandas and isinstance(data, pandas.Series):
        out = pandas.Series(out, index=data.index)
    return out