コード例 #1
0
ファイル: build.py プロジェクト: grantnicholas/pytone
def _examine_factor_types(factors, factor_states, data_iter_maker, NA_action):
    """Classify every factor as numerical or categorical by sampling data.

    The chunks produced by ``data_iter_maker()`` are walked in order and
    each still-undecided factor is evaluated on the current chunk.  A value
    accepted by ``guess_categorical`` is handed to a per-factor
    ``CategoricalSniffer`` until that sniffer's ``sniff`` call returns a true
    value; any other value is treated as numerical and its column count is
    recorded right away.  Iteration stops once no factor is left undecided.

    :returns: a tuple ``(num_column_counts, cat_levels_contrasts)``.  The
      first dict maps numerical factors to their number of columns, the
      second maps categorical factors to ``sniffer.levels_contrast()``.
    """
    column_counts = {}
    sniffers = {}
    pending = set(factors)
    for chunk in data_iter_maker():
        # Walk a snapshot: ``pending`` shrinks while we iterate.
        for factor in tuple(pending):
            value = factor.eval(factor_states[factor], chunk)
            if factor not in sniffers and not guess_categorical(value):
                # Numeric
                as_2d = atleast_2d_column_default(value)
                _max_allowed_dim(2, as_2d, factor)
                column_counts[factor] = as_2d.shape[1]
                pending.remove(factor)
                continue
            # Categorical: create the sniffer on first sight, then feed it.
            if factor not in sniffers:
                sniffers[factor] = CategoricalSniffer(NA_action,
                                                      factor.origin)
            if sniffers[factor].sniff(value):
                pending.remove(factor)
        if not pending:
            break
    # Pull out the levels
    levels_contrasts = {
        factor: sniffer.levels_contrast()
        for factor, sniffer in six.iteritems(sniffers)
    }
    return (column_counts, levels_contrasts)
コード例 #2
0
ファイル: build.py プロジェクト: MarceloDL-A/metodos_python
def _examine_factor_types(factors, factor_states, data_iter_maker, NA_action):
    """Decide, factor by factor, between numerical and categorical handling.

    For each chunk yielded by ``data_iter_maker()`` the factors that are not
    settled yet are evaluated.  Values for which ``guess_categorical`` is
    true go to a ``CategoricalSniffer`` (one per factor) and the factor is
    settled when ``sniff`` returns a true value.  Every other value counts as
    numerical: its number of columns is stored and the factor is settled
    immediately.  No further chunks are read once all factors are settled.

    :returns: ``(num_column_counts, cat_levels_contrasts)`` -- column counts
      of the numerical factors and the ``levels_contrast()`` result of each
      categorical factor's sniffer.
    """
    widths = {}
    sniffer_by_factor = {}
    unsettled = set(factors)
    for data in data_iter_maker():
        # Iterate over a copy because settled factors are removed below.
        for factor in tuple(unsettled):
            value = factor.eval(factor_states[factor], data)
            looks_numeric = (factor not in sniffer_by_factor
                             and not guess_categorical(value))
            if looks_numeric:
                # Numeric
                columns = atleast_2d_column_default(value)
                _max_allowed_dim(2, columns, factor)
                widths[factor] = columns.shape[1]
                unsettled.remove(factor)
            else:
                if factor not in sniffer_by_factor:
                    sniffer_by_factor[factor] = CategoricalSniffer(
                        NA_action, factor.origin)
                finished = sniffer_by_factor[factor].sniff(value)
                if finished:
                    unsettled.remove(factor)
        if not unsettled:
            break
    # Pull out the levels
    levels = {}
    for factor, sniffer in six.iteritems(sniffer_by_factor):
        levels[factor] = sniffer.levels_contrast()
    return (widths, levels)
コード例 #3
0
ファイル: design_info.py プロジェクト: chrish42/patsy
    def from_array(cls, array_like, default_column_prefix="column"):
        """Return a DesignInfo describing the columns of `array_like`.

        When `array_like` carries a ``.design_info`` attribute that is an
        instance of this class, that object is returned as-is. Otherwise a
        new DesignInfo is built: column names come from the ``.columns`` of
        the converted array when those have a non-integer dtype (e.g. a
        pandas DataFrame with named columns), and are otherwise invented as
        `default_column_prefix` followed by the column label or position.

        This is how :func:`dmatrix` (for example) creates a DesignInfo object
        if an arbitrary matrix is passed in.

        :arg array_like: An ndarray or pandas container.
        :arg default_column_prefix: Prefix used when column names have to be
          invented.
        :returns: a DesignInfo object
        :raises ValueError: if the converted array has more than 2 dimensions.
        """
        if hasattr(array_like, "design_info"):
            existing = array_like.design_info
            if isinstance(existing, cls):
                return existing
        matrix = atleast_2d_column_default(array_like, preserve_pandas=True)
        if matrix.ndim > 2:
            raise ValueError("design matrix can't have >2 dimensions")
        columns = getattr(matrix, "columns", range(matrix.shape[1]))
        has_real_names = (hasattr(columns, "dtype")
                          and not safe_issubdtype(columns.dtype, np.integer))
        if has_real_names:
            names = [str(label) for label in columns]
        else:
            names = ["%s%s" % (default_column_prefix, label)
                     for label in columns]
        return DesignInfo(names)
コード例 #4
0
ファイル: build.py プロジェクト: MarceloDL-A/metodos_python
def _eval_factor(factor_info, data, NA_action):
    """Evaluate one factor on ``data`` and validate what comes back.

    :arg factor_info: object providing ``factor``, ``state`` and ``type``
      (``"numerical"`` or ``"categorical"``), plus ``num_columns`` for
      numerical factors and ``categories`` for categorical ones.
    :arg data: passed straight through to ``factor.eval``.
    :arg NA_action: used to compute the missing-value mask (numerical) or
      passed to ``categorical_to_int`` (categorical).
    :returns: ``(result, is_NA)``.
    :raises PatsyError: if a numerical factor yields an unexpected number of
      columns or non-numeric data.
    """
    factor = factor_info.factor
    result = factor.eval(factor_info.state, data)
    # Returns either a 2d ndarray, or a DataFrame, plus is_NA mask
    if factor_info.type == "numerical":
        result = atleast_2d_column_default(result, preserve_pandas=True)
        _max_allowed_dim(2, result, factor)
        if result.shape[1] != factor_info.num_columns:
            # Argument order matters: "got" is what we observed, "expecting"
            # is what was recorded for this factor.
            raise PatsyError("when evaluating factor %s, I got %s columns "
                             "instead of the %s I was expecting"
                             % (factor.name(),
                                result.shape[1],
                                factor_info.num_columns),
                             factor)
        # Go through np.asarray for the message too: the result may be a
        # DataFrame, which has no single ``.dtype`` attribute.
        result_dtype = np.asarray(result).dtype
        if not safe_issubdtype(result_dtype, np.number):
            raise PatsyError("when evaluating numeric factor %s, "
                             "I got non-numeric data of type '%s'"
                             % (factor.name(), result_dtype),
                             factor)
        return result, NA_action.is_numerical_NA(result)
    # returns either a 1d ndarray or a pandas.Series, plus is_NA mask
    else:
        assert factor_info.type == "categorical"
        result = categorical_to_int(result, factor_info.categories, NA_action,
                                    origin=factor_info.factor)
        assert result.ndim == 1
        # Entries equal to -1 are reported as missing.
        return result, np.asarray(result == -1)
コード例 #5
0
def assert_full_rank(m):
    """Assert that ``m`` has as many non-negligible singular values as columns.

    ``m`` is first converted with ``atleast_2d_column_default``.  A matrix
    with zero columns is accepted immediately and ``True`` is returned; in
    every other case the function returns ``None`` on success.

    :raises AssertionError: if the number of singular values above ``1e-10``
      differs from the number of columns.
    """
    m = atleast_2d_column_default(m)
    if m.shape[1] == 0:
        return True
    # Only the singular values are needed, so skip computing U and V.
    singular_values = np.linalg.svd(m, compute_uv=False)
    rank = np.sum(singular_values > 1e-10)
    assert rank == m.shape[1], (
        "matrix is rank deficient: rank %s with %s columns"
        % (rank, m.shape[1]))
コード例 #6
0
ファイル: test_build.py プロジェクト: CaptainAL/Spyder
def assert_full_rank(m):
    """Check that the matrix ``m`` is of full column rank.

    After conversion with ``atleast_2d_column_default``, a matrix without
    any columns passes trivially (``True`` is returned).  Otherwise the rank
    is taken as the count of singular values greater than ``1e-10`` and must
    equal the number of columns; nothing is returned on success.

    :raises AssertionError: if the computed rank and the column count differ.
    """
    m = atleast_2d_column_default(m)
    if m.shape[1] == 0:
        return True
    # U and V are never used; ask numpy for the singular values alone.
    singular_values = np.linalg.svd(m, compute_uv=False)
    rank = np.sum(singular_values > 1e-10)
    assert rank == m.shape[1], (
        "matrix is rank deficient: rank %s with %s columns"
        % (rank, m.shape[1]))
コード例 #7
0
ファイル: design_info.py プロジェクト: thorstenkranz/patsy
    def from_array(cls, array_like, default_column_prefix="column"):
        """Find or construct a DesignInfo appropriate for a given array_like.

        If the input `array_like` already has a ``.design_info``
        attribute that is an instance of this class, then it will be
        returned. Otherwise, a new DesignInfo object will be constructed,
        using names either taken from the `array_like` (e.g., for a pandas
        DataFrame with named columns), or constructed using
        `default_column_prefix`.

        This is how :func:`dmatrix` (for example) creates a DesignInfo object
        if an arbitrary matrix is passed in.

        :arg array_like: An ndarray or pandas container.
        :arg default_column_prefix: If it's necessary to invent column names,
          then this will be used to construct them.
        :returns: a DesignInfo object
        :raises ValueError: if the converted array has more than 2 dimensions.
        """
        if hasattr(array_like, "design_info") and isinstance(
                array_like.design_info, cls):
            return array_like.design_info
        arr = atleast_2d_column_default(array_like, preserve_pandas=True)
        if arr.ndim > 2:
            raise ValueError("design matrix can't have >2 dimensions")
        columns = getattr(arr, "columns", range(arr.shape[1]))
        # Detect labelled columns by the presence of a dtype rather than by
        # ``isinstance(columns, np.ndarray)``: a pandas Index is not
        # necessarily an ndarray subclass, and the isinstance test would
        # then throw away real column names.
        has_real_names = False
        if hasattr(columns, "dtype"):
            try:
                has_real_names = not np.issubdtype(columns.dtype, np.integer)
            except TypeError:
                # A dtype numpy cannot interpret is certainly not a plain
                # integer dtype, so treat the labels as names.
                has_real_names = True
        if has_real_names:
            column_names = [str(obj) for obj in columns]
        else:
            column_names = [
                "%s%s" % (default_column_prefix, i) for i in columns
            ]
        return DesignInfo(column_names)
コード例 #8
0
ファイル: build.py プロジェクト: guyrt/patsy
def _examine_factor_types(factors, factor_states, data_iter_maker):
    num_column_counts = {}
    cat_levels_contrasts = {}
    cat_postprocessors = {}
    prefinished_postprocessors = {}
    examine_needed = set(factors)
    for data in data_iter_maker():
        # We might have gathered all the information we need after the first
        # chunk of data. If so, then we shouldn't spend time loading all the
        # rest of the chunks.
        if not examine_needed:
            break
        for factor in list(examine_needed):
            value = factor.eval(factor_states[factor], data)
            if isinstance(value, Categorical):
                postprocessor = CategoricalTransform(levels=value.levels)
                prefinished_postprocessors[factor] = postprocessor
                cat_levels_contrasts[factor] = (value.levels,
                                                value.contrast)
                examine_needed.remove(factor)
                continue
            value = atleast_2d_column_default(value)
            _max_allowed_dim(2, value, factor)
            if np.issubdtype(value.dtype, np.number):
                column_count = value.shape[1]
                num_column_counts[factor] = column_count
                examine_needed.remove(factor)
            # issubdtype(X, bool) isn't reliable -- it returns true for
            # X == int! So check the kind code instead:
            elif value.dtype.kind == "b":
                # Special case: give it a transformer, but don't bother
                # processing the rest of the data
                if value.shape[1] > 1:
                    msg = ("factor '%s' evaluates to a boolean array with "
                           "%s columns; I can only handle single-column "
                           "boolean arrays" % (factor.name(), value.shape[1]))
                    raise PatsyError(msg, factor)
                cat_postprocessors[factor] = _BoolToCat(factor)
                examine_needed.remove(factor)
            else:
                if value.shape[1] > 1:
                    msg = ("factor '%s' appears to be categorical but has "
                           "%s columns; I can only handle single-column "
                           "categorical factors"
                           % (factor.name(), value.shape[1]))
                    raise PatsyError(msg, factor)
                if factor not in cat_postprocessors:
                    cat_postprocessors[factor] = CategoricalTransform()
                processor = cat_postprocessors[factor]
                processor.memorize_chunk(value)
    for factor, processor in cat_postprocessors.iteritems():
        processor.memorize_finish()
        cat_levels_contrasts[factor] = (processor.levels(), None)
    cat_postprocessors.update(prefinished_postprocessors)
    assert set(cat_postprocessors) == set(cat_levels_contrasts)
    return (num_column_counts,
            cat_levels_contrasts,
            cat_postprocessors)
コード例 #9
0
ファイル: state.py プロジェクト: anirban261491/IFT540Final
 def memorize_chunk(self, x):
     """Fold one chunk of data into the running row count and column totals.

     ``x`` is converted with ``atleast_2d_column_default``; its number of
     rows is added to ``self._count`` and its sum along axis 0 is added to
     ``self._sum`` (or becomes ``self._sum`` for the first chunk).
     """
     chunk = atleast_2d_column_default(x)
     self._count += chunk.shape[0]
     chunk_total = np.sum(chunk, 0, dtype=wide_dtype_for(chunk))
     # Adopting the first chunk's totals as the starting value keeps one
     # running total per column for multi-column input.
     if self._sum is not None:
         self._sum += chunk_total
     else:
         self._sum = chunk_total
コード例 #10
0
ファイル: state.py プロジェクト: Zaharid/patsy
 def memorize_chunk(self, x):
     """Accumulate the row count and per-column sum of one data chunk.

     The chunk is converted with ``atleast_2d_column_default``.  Its row
     count goes into ``self._count``; its axis-0 sum either initialises
     ``self._sum`` (first chunk) or is added to it in place.
     """
     as_columns = atleast_2d_column_default(x)
     n_rows = as_columns.shape[0]
     self._count += n_rows
     column_sums = np.sum(as_columns, 0, dtype=wide_dtype_for(as_columns))
     # This is to handle potentially multi-column x's: the accumulator
     # takes its shape from the first chunk seen.
     if self._sum is not None:
         self._sum += column_sums
     else:
         self._sum = column_sums
コード例 #11
0
ファイル: mgcv_cubic_splines.py プロジェクト: gyenney/Tools
    def transform(self, *args, **kwargs):
        """Build the tensor-product design matrix for the given arguments.

        Every positional argument is converted with
        ``atleast_2d_column_default`` and must come out 2-d; the converted
        arguments and ``self._constraints`` are then handed to
        ``_get_te_dmatrix``.  Keyword arguments are accepted but not used
        here.

        :raises ValueError: if a converted argument is not 2-d.
        """
        columns = []
        for raw in args:
            as_2d = atleast_2d_column_default(raw)
            if as_2d.ndim == 2:
                columns.append(as_2d)
                continue
            raise ValueError("Each tensor product argument must be "
                             "a 2-d array or 1-d vector.")

        return _get_te_dmatrix(columns, self._constraints)
コード例 #12
0
    def transform(self, *args, **kwargs):
        """Return ``_get_te_dmatrix`` applied to the 2-d form of each argument.

        Positional arguments are converted one by one with
        ``atleast_2d_column_default``; conversion stops with an error at the
        first argument that does not end up 2-d.  ``self._constraints`` is
        passed along unchanged.  Keyword arguments are ignored by this
        method.

        :raises ValueError: if a converted argument is not 2-d.
        """
        converted = []
        for position in range(len(args)):
            candidate = atleast_2d_column_default(args[position])
            if candidate.ndim != 2:
                raise ValueError("Each tensor product argument must be "
                                 "a 2-d array or 1-d vector.")
            converted.append(candidate)

        return _get_te_dmatrix(converted, self._constraints)
コード例 #13
0
ファイル: state.py プロジェクト: anirban261491/IFT540Final
 def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
     """Update the running mean and M2 accumulators with one chunk.

     ``x`` is converted with ``atleast_2d_column_default``.  On the first
     chunk ``current_mean`` and ``current_M2`` are created as zero vectors
     with one entry per column.  Each row then increments ``current_n`` and
     adjusts both accumulators.  ``center``, ``rescale`` and ``ddof`` are
     accepted but not used by this method.
     """
     rows = atleast_2d_column_default(x)
     if self.current_mean is None:
         n_columns = rows.shape[1]
         accumulator_dtype = wide_dtype_for(rows)
         self.current_mean = np.zeros(n_columns, dtype=accumulator_dtype)
         self.current_M2 = np.zeros(n_columns, dtype=accumulator_dtype)
     # XX this can surely be vectorized; rows are processed one at a time.
     for row_index in range(rows.shape[0]):
         row = rows[row_index, :]
         self.current_n += 1
         delta = row - self.current_mean
         self.current_mean += delta / self.current_n
         # Uses the mean *after* the update above.
         self.current_M2 += delta * (row - self.current_mean)
コード例 #14
0
ファイル: state.py プロジェクト: Zaharid/patsy
 def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
     """Update the running mean and M2 accumulators with one chunk.

     ``x`` is converted with ``atleast_2d_column_default``.  The first chunk
     creates ``current_mean`` and ``current_M2`` as zero vectors with one
     entry per column; every row then increments ``current_n`` and updates
     both accumulators.  ``center``, ``rescale`` and ``ddof`` are accepted
     but not used by this method.
     """
     x = atleast_2d_column_default(x)
     if self.current_mean is None:
         self.current_mean = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
         self.current_M2 = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
     # XX this can surely be vectorized but I am feeling lazy:
     # ``range`` rather than ``xrange`` so this also runs on Python 3.
     for i in range(x.shape[0]):
         self.current_n += 1
         delta = x[i, :] - self.current_mean
         self.current_mean += delta / self.current_n
         # Uses the mean *after* the update on the previous line.
         self.current_M2 += delta * (x[i, :] - self.current_mean)
コード例 #15
0
ファイル: state.py プロジェクト: Zaharid/patsy
 def transform(self, x, center=True, rescale=True, ddof=0):
     """Return a centered and/or rescaled float copy of ``x``.

     :arg center: when true, ``self.current_mean`` is subtracted.
     :arg rescale: when true, the data is divided by
       ``sqrt(current_M2 / (current_n - ddof))``.
     :arg ddof: subtracted from ``current_n`` in the rescaling divisor.
     :returns: the result of ``pandas_friendly_reshape`` applied to the
       adjusted 2-d data and the shape of the converted input.
     """
     # XX: this forces all inputs to double-precision real, even if the
     # input is single- or extended-precision or complex. Doing better
     # without breaking something else (e.g. by requiring an extra copy)
     # proved too tangled.
     values = asarray_or_pandas(x, copy=True, dtype=float)
     as_columns = atleast_2d_column_default(values, preserve_pandas=True)
     # Both adjustments are applied in place on the copy made above.
     if center:
         as_columns -= self.current_mean
     if rescale:
         spread = np.sqrt(self.current_M2 / (self.current_n - ddof))
         as_columns /= spread
     return pandas_friendly_reshape(as_columns, values.shape)
コード例 #16
0
ファイル: state.py プロジェクト: anirban261491/IFT540Final
 def transform(self, x, center=True, rescale=True, ddof=0):
     """Apply the memorized centering and scaling to a float copy of ``x``.

     :arg center: subtract ``self.current_mean`` when true.
     :arg rescale: divide by ``sqrt(current_M2 / (current_n - ddof))`` when
       true.
     :arg ddof: offset removed from ``current_n`` before dividing.
     :returns: ``pandas_friendly_reshape`` of the adjusted 2-d data, using
       the shape of the converted input.
     """
     # XX: every input is forced to double-precision real here, even
     # single- or extended-precision or complex input. Avoiding that
     # without breaking something else (e.g. by requiring an extra copy)
     # turned out to be too tangled.
     copied = asarray_or_pandas(x, copy=True, dtype=float)
     original_shape = copied.shape
     data_2d = atleast_2d_column_default(copied, preserve_pandas=True)
     # In-place updates: ``copied`` is already a private copy.
     if center:
         data_2d -= self.current_mean
     if rescale:
         scale = np.sqrt(self.current_M2 / (self.current_n - ddof))
         data_2d /= scale
     return pandas_friendly_reshape(data_2d, original_shape)
コード例 #17
0
ファイル: state.py プロジェクト: Zaharid/patsy
 def transform(self, x):
     """Subtract the memorized mean (``_sum / _count``) from ``x``.

     The mean is cast to ``float`` when the input has an integer dtype and
     to the input's own dtype otherwise.  The result is passed through
     ``pandas_friendly_reshape`` with the shape of the converted input.
     """
     data = asarray_or_pandas(x)
     # This doesn't copy data unless our input is a DataFrame that has
     # heterogeneous types. And in that case we're going to be munging the
     # types anyway, so copying isn't a big deal.
     input_dtype = np.asarray(data).dtype
     mean_dtype = float if np.issubdtype(input_dtype, np.integer) else input_dtype
     mean = np.asarray(self._sum / self._count, dtype=mean_dtype)
     as_columns = atleast_2d_column_default(data, preserve_pandas=True)
     return pandas_friendly_reshape(as_columns - mean, data.shape)
コード例 #18
0
 def transform(self, x):
     """Return ``x`` with the memorized mean (``_sum / _count``) removed.

     For integer input the mean is computed as ``float``; for any other
     dtype it is cast to the input's dtype.  The centered data is reshaped
     by ``pandas_friendly_reshape`` to the shape of the converted input.
     """
     data = asarray_or_pandas(x)
     # This doesn't copy data unless our input is a DataFrame that has
     # heterogeneous types. And in that case we're going to be munging the
     # types anyway, so copying isn't a big deal.
     input_dtype = np.asarray(data).dtype
     mean_dtype = float if safe_issubdtype(input_dtype, np.integer) else input_dtype
     mean = np.asarray(self._sum / self._count, dtype=mean_dtype)
     as_columns = atleast_2d_column_default(data, preserve_pandas=True)
     return pandas_friendly_reshape(as_columns - mean, data.shape)
コード例 #19
0
ファイル: highlevel.py プロジェクト: westurner/patsy
 def _regularize_matrix(m, default_column_prefix):
     """Wrap ``m`` as a DataFrame or DesignMatrix and report its pandas index.

     ``return_type`` is not defined in this function; it is read from an
     enclosing scope.  When it equals ``"dataframe"`` the matrix is turned
     into a ``pandas.DataFrame`` whose columns and ``design_info`` come from
     ``DesignInfo.from_array``; otherwise a ``DesignMatrix`` is built.

     :returns: ``(matrix, orig_index)`` where ``orig_index`` is ``m.index``
       for pandas Series/DataFrame input and ``None`` otherwise.
     """
     di = DesignInfo.from_array(m, default_column_prefix)
     is_pandas_input = have_pandas and isinstance(
         m, (pandas.Series, pandas.DataFrame))
     orig_index = m.index if is_pandas_input else None
     if return_type != "dataframe":
         return (DesignMatrix(m, di), orig_index)
     frame = pandas.DataFrame(
         atleast_2d_column_default(m, preserve_pandas=True))
     frame.columns = di.column_names
     frame.design_info = di
     return (frame, orig_index)
コード例 #20
0
ファイル: highlevel.py プロジェクト: danieltaborda/patsy
 def _regularize_matrix(m, default_column_prefix):
     """Convert ``m`` to the requested matrix type, keeping its pandas index.

     The choice between a ``pandas.DataFrame`` and a ``DesignMatrix`` depends
     on ``return_type``, a name taken from an enclosing scope rather than
     from this function's arguments.  Column names and the attached
     ``design_info`` come from ``DesignInfo.from_array``.

     :returns: ``(matrix, orig_index)``; ``orig_index`` is ``m.index`` when
       ``m`` is a pandas Series or DataFrame, else ``None``.
     """
     di = DesignInfo.from_array(m, default_column_prefix)
     orig_index = None
     if have_pandas and isinstance(m, (pandas.Series, pandas.DataFrame)):
         orig_index = m.index
     if return_type != "dataframe":
         return (DesignMatrix(m, di), orig_index)
     as_2d = atleast_2d_column_default(m, preserve_pandas=True)
     frame = pandas.DataFrame(as_2d)
     frame.columns = di.column_names
     frame.design_info = di
     return (frame, orig_index)
コード例 #21
0
ファイル: constraint.py プロジェクト: pydata/patsy
 def __init__(self, variable_names, coefs, constants=None):
     """Store and validate a set of linear constraints.

     :arg variable_names: iterable of variable names; stored as a list.
     :arg coefs: array-like, coerced to a 2-d float array that must have
       one column per variable name and at least one row.
     :arg constants: optional array-like, coerced to a float column with
       one row per row of `coefs`.  Defaults to zeros.
     :raises ValueError: if the shapes of `coefs` and `constants` are
       inconsistent with each other or with `variable_names`.
     """
     self.variable_names = list(variable_names)
     self.coefs = np.atleast_2d(np.asarray(coefs, dtype=float))
     if constants is None:
         constants = np.zeros(self.coefs.shape[0], dtype=float)
     constants = np.asarray(constants, dtype=float)
     self.constants = atleast_2d_column_default(constants)
     if self.constants.ndim != 2 or self.constants.shape[1] != 1:
         raise ValueError("constants is not (convertible to) a column matrix")
     # Count the stored list, not the argument: the argument may be a
     # one-shot iterable that has no len() or is already exhausted.
     if (self.coefs.ndim != 2
             or self.coefs.shape[1] != len(self.variable_names)):
         raise ValueError("wrong shape for coefs")
     if self.coefs.shape[0] == 0:
         raise ValueError("must have at least one row in constraint matrix")
     if self.coefs.shape[0] != self.constants.shape[0]:
         raise ValueError("shape mismatch between coefs and constants")
コード例 #22
0
ファイル: design_info.py プロジェクト: thorstenkranz/patsy
    def __new__(cls,
                input_array,
                design_info=None,
                default_column_prefix="column"):
        """Create a DesignMatrix, or cast an existing matrix to a DesignMatrix.

        A call like::

          DesignMatrix(my_array)

        converts an arbitrary array_like object into a DesignMatrix.

        The returned object is a two-dimensional ndarray with a real-valued
        floating point dtype (integer input is upcast to float), carrying a
        ``.design_info`` attribute whose number of column names matches the
        number of columns. When `design_info` is not given, one is created
        via :meth:`DesignInfo.from_array` using `default_column_prefix`.

        Depending on the input array, it is possible this will pass through
        its input unchanged, or create a view.

        :raises ValueError: if the input has more than 2 dimensions, if the
          number of column names does not match, or if the dtype is not
          floating point after the integer upcast.
        """
        # Pass through existing DesignMatrixes. The design_info check is
        # necessary because numpy can turn non-design-matrix arrays into
        # DesignMatrix instances that lack the attribute (e.g.
        # my_dm.diagonal() returns a DesignMatrix without design_info).
        already_design_matrix = (isinstance(input_array, DesignMatrix)
                                 and hasattr(input_array, "design_info"))
        if already_design_matrix:
            return input_array
        matrix = atleast_2d_column_default(input_array).view(cls)
        # Upcast integer to floating point
        if np.issubdtype(matrix.dtype, np.integer):
            matrix = np.asarray(matrix, dtype=float).view(cls)
        if matrix.ndim > 2:
            raise ValueError("DesignMatrix must be 2d")
        assert matrix.ndim == 2
        if design_info is None:
            design_info = DesignInfo.from_array(matrix, default_column_prefix)
        n_names = len(design_info.column_names)
        n_columns = matrix.shape[1]
        if n_names != n_columns:
            raise ValueError("wrong number of column names for design matrix "
                             "(got %s, wanted %s)" % (n_names, n_columns))
        matrix.design_info = design_info
        if not np.issubdtype(matrix.dtype, np.floating):
            raise ValueError(
                "design matrix must be real-valued floating point")
        return matrix
コード例 #23
0
ファイル: build.py プロジェクト: joaonatali/patsy
 def eval(self, data, NA_action):
     """Evaluate the numeric factor on ``data`` and validate the result.

     :arg data: passed straight through to ``self.factor.eval``.
     :arg NA_action: supplies ``is_numerical_NA`` for the missing-value mask.
     :returns: ``(result, is_NA)`` where ``is_NA`` is
       ``NA_action.is_numerical_NA(result)``.
     :raises PatsyError: if the result has an unexpected number of columns
       or is not numeric.
     """
     result = self.factor.eval(self._state, data)
     result = atleast_2d_column_default(result, preserve_pandas=True)
     _max_allowed_dim(2, result, self.factor)
     if result.shape[1] != self._expected_columns:
         # "got" is the observed width, "expecting" the memorized one.
         raise PatsyError(
             "when evaluating factor %s, I got %s columns "
             "instead of the %s I was expecting" %
             (self.factor.name(), result.shape[1], self._expected_columns),
             self.factor)
     # Take the dtype via np.asarray for the message as well: with
     # preserve_pandas=True the result may be a pandas DataFrame, which
     # has no single ``.dtype`` attribute.
     result_dtype = np.asarray(result).dtype
     if not np.issubdtype(result_dtype, np.number):
         raise PatsyError(
             "when evaluating numeric factor %s, "
             "I got non-numeric data of type '%s'" %
             (self.factor.name(), result_dtype), self.factor)
     return result, NA_action.is_numerical_NA(result)
コード例 #24
0
ファイル: build.py プロジェクト: grantnicholas/pytone
 def eval(self, data, NA_action):
     """Evaluate the numeric factor on ``data`` and check its shape and dtype.

     :arg data: forwarded unchanged to ``self.factor.eval``.
     :arg NA_action: its ``is_numerical_NA`` produces the missing-value mask.
     :returns: ``(result, is_NA)`` with ``is_NA`` equal to
       ``NA_action.is_numerical_NA(result)``.
     :raises PatsyError: if the number of columns differs from
       ``self._expected_columns`` or the data is not numeric.
     """
     result = self.factor.eval(self._state, data)
     result = atleast_2d_column_default(result, preserve_pandas=True)
     _max_allowed_dim(2, result, self.factor)
     if result.shape[1] != self._expected_columns:
         # Report the observed width first, then the expected one, to match
         # the wording of the message.
         raise PatsyError("when evaluating factor %s, I got %s columns "
                          "instead of the %s I was expecting"
                          % (self.factor.name(), result.shape[1],
                             self._expected_columns),
                          self.factor)
     # The result may be a pandas DataFrame (preserve_pandas=True), which
     # has no single ``.dtype``; reuse the ndarray dtype in the message.
     result_dtype = np.asarray(result).dtype
     if not np.issubdtype(result_dtype, np.number):
         raise PatsyError("when evaluating numeric factor %s, "
                          "I got non-numeric data of type '%s'"
                          % (self.factor.name(), result_dtype),
                          self.factor)
     return result, NA_action.is_numerical_NA(result)
コード例 #25
0
ファイル: constraint.py プロジェクト: pgajdos/patsy
 def __init__(self, variable_names, coefs, constants=None):
     """Record a system of linear constraints after validating its shapes.

     :arg variable_names: iterable of variable names; kept as a list.
     :arg coefs: array-like, converted to a 2-d float array; needs one
       column per variable name and at least one row.
     :arg constants: optional array-like, converted to a float column with
       as many rows as `coefs`.  Zeros are used when omitted.
     :raises ValueError: if `coefs` or `constants` do not have the required
       shape.
     """
     self.variable_names = list(variable_names)
     self.coefs = np.atleast_2d(np.asarray(coefs, dtype=float))
     if constants is None:
         constants = np.zeros(self.coefs.shape[0], dtype=float)
     constants = np.asarray(constants, dtype=float)
     self.constants = atleast_2d_column_default(constants)
     if self.constants.ndim != 2 or self.constants.shape[1] != 1:
         raise ValueError(
             "constants is not (convertible to) a column matrix")
     # Use the stored list for the count: the original argument may be an
     # iterator without len(), already consumed by list() above.
     if (self.coefs.ndim != 2
             or self.coefs.shape[1] != len(self.variable_names)):
         raise ValueError("wrong shape for coefs")
     if self.coefs.shape[0] == 0:
         raise ValueError("must have at least one row in constraint matrix")
     if self.coefs.shape[0] != self.constants.shape[0]:
         raise ValueError("shape mismatch between coefs and constants")
コード例 #26
0
ファイル: build.py プロジェクト: Zaharid/patsy
def test__ColumnBuilder():
    """Check the column names and the values written by ``_ColumnBuilder``."""
    from nose.tools import assert_raises
    from patsy.contrasts import ContrastMatrix
    from patsy.categorical import C

    f1, f2, f3 = [_MockFactor(name) for name in ("f1", "f2", "f3")]
    contrast = ContrastMatrix(np.array([[0, 0.5], [3, 0]]), ["[c1]", "[c2]"])

    def make_data(f1_values, f2_codes):
        # f3 carries the same values in every scenario below.
        return {
            f1: atleast_2d_column_default(f1_values),
            f2: np.asarray(f2_codes),
            f3: atleast_2d_column_default([7.5, 2, -12]),
        }

    # f1 and f3 numeric with one column each, f2 categorical.
    cb = _ColumnBuilder([f1, f2, f3], {f1: 1, f3: 1}, {f2: contrast})
    mat = np.empty((3, 2))
    assert cb.column_names() == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]
    cb.build(make_data([1, 2, 3], [0, 0, 1]), mat)
    expected = [
        [0, 0.5 * 1 * 7.5],
        [0, 0.5 * 2 * 2],
        [3 * 3 * -12, 0],
    ]
    assert np.allclose(mat, expected)
    # Check that missing categorical values blow up
    bad_data = make_data([1, 2, 3], [0, -1, 1])
    assert_raises(PatsyError, cb.build, bad_data, mat)

    # Same term, but f1 now contributes two columns.
    cb2 = _ColumnBuilder([f1, f2, f3], {f1: 2, f3: 1}, {f2: contrast})
    mat2 = np.empty((3, 4))
    cb2.build(make_data([[1, 2], [3, 4], [5, 6]], [0, 0, 1]), mat2)
    expected_names = [
        "f1[0]:f2[c1]:f3",
        "f1[1]:f2[c1]:f3",
        "f1[0]:f2[c2]:f3",
        "f1[1]:f2[c2]:f3",
    ]
    assert cb2.column_names() == expected_names
    expected2 = [
        [0, 0, 0.5 * 1 * 7.5, 0.5 * 2 * 7.5],
        [0, 0, 0.5 * 3 * 2, 0.5 * 4 * 2],
        [3 * 5 * -12, 3 * 6 * -12, 0, 0],
    ]
    assert np.allclose(mat2, expected2)
    # Check intercept building:
    cb_intercept = _ColumnBuilder([], {}, {})
    assert cb_intercept.column_names() == ["Intercept"]
    mat3 = np.empty((3, 1))
    intercept_data = {factor: [1, 2, 3] for factor in (f1, f2, f3)}
    cb_intercept.build(intercept_data, mat3)
    assert np.allclose(mat3, 1)
コード例 #27
0
ファイル: design_info.py プロジェクト: chrish42/patsy
    def __new__(cls, input_array, design_info=None,
                default_column_prefix="column"):
        """Create a DesignMatrix, or cast an existing matrix to a DesignMatrix.

        A call like::

          DesignMatrix(my_array)

        converts an arbitrary array_like object into a DesignMatrix.

        The result is a two-dimensional ndarray with a real-valued floating
        point dtype (integer input is upcast to float) and a
        ``.design_info`` attribute whose column names match the number of
        columns. If `design_info` is omitted, one is created via
        :meth:`DesignInfo.from_array` using `default_column_prefix`.

        Depending on the input array, it is possible this will pass through
        its input unchanged, or create a view.

        :raises ValueError: if the input has more than 2 dimensions, if the
          number of column names is wrong, or if the dtype is not floating
          point after the integer upcast.
        """
        # Pass through existing DesignMatrixes. Checking for design_info is
        # necessary because numpy may hand back DesignMatrix instances that
        # never went through this constructor (e.g. my_dm.diagonal() gives a
        # DesignMatrix object without a design_info attribute).
        is_finished_matrix = (isinstance(input_array, DesignMatrix)
                              and hasattr(input_array, "design_info"))
        if is_finished_matrix:
            return input_array
        matrix = atleast_2d_column_default(input_array).view(cls)
        # Upcast integer to floating point
        if safe_issubdtype(matrix.dtype, np.integer):
            matrix = np.asarray(matrix, dtype=float).view(cls)
        if matrix.ndim > 2:
            raise ValueError("DesignMatrix must be 2d")
        assert matrix.ndim == 2
        if design_info is None:
            design_info = DesignInfo.from_array(matrix, default_column_prefix)
        n_names = len(design_info.column_names)
        n_columns = matrix.shape[1]
        if n_names != n_columns:
            raise ValueError("wrong number of column names for design matrix "
                             "(got %s, wanted %s)"
                             % (n_names, n_columns))
        matrix.design_info = design_info
        if not safe_issubdtype(matrix.dtype, np.floating):
            raise ValueError("design matrix must be real-valued floating point")
        return matrix
コード例 #28
0
    def memorize_chunk(self, *args, **kwargs):
        """Record the constraints setting and, for "center", running sums.

        The ``constraints`` keyword (``None`` if absent) is stored in
        ``self._tmp`` on the first call and the stored value is used from
        then on.  Only when that value equals ``"center"`` are the
        positional arguments converted to 2-d, combined with
        ``_row_tensor_product``, and folded into the ``"count"`` and
        ``"sum"`` entries of ``self._tmp``.

        :raises ValueError: if a converted argument is not 2-d.
        """
        state = self._tmp
        constraints = state.setdefault("constraints",
                                       kwargs.get("constraints"))
        if constraints == "center":
            columns = []
            for raw in args:
                as_2d = atleast_2d_column_default(raw)
                if as_2d.ndim != 2:
                    raise ValueError("Each tensor product argument must be "
                                     "a 2-d array or 1-d vector.")
                columns.append(as_2d)

            tp = _row_tensor_product(columns)
            state["count"] = state.setdefault("count", 0) + tp.shape[0]

            chunk_sum = np.atleast_2d(tp.sum(axis=0))
            if "sum" not in state:
                state["sum"] = np.zeros(chunk_sum.shape)
            state["sum"] += chunk_sum
コード例 #29
0
ファイル: mgcv_cubic_splines.py プロジェクト: gyenney/Tools
    def memorize_chunk(self, *args, **kwargs):
        """Remember the constraints option and accumulate centering data.

        The first call stores ``kwargs.get("constraints")`` under
        ``"constraints"`` in ``self._tmp``; later calls reuse the stored
        value.  When it equals ``"center"``, each positional argument is
        converted with ``atleast_2d_column_default``, the row tensor product
        of the converted arguments is formed, and its row count and
        column sums are added to ``self._tmp["count"]`` and
        ``self._tmp["sum"]``.

        :raises ValueError: if a converted argument is not 2-d.
        """
        tmp = self._tmp
        constraints = tmp.setdefault("constraints",
                                     kwargs.get("constraints"))
        if constraints == "center":
            args_2d = []
            for candidate in args:
                converted = atleast_2d_column_default(candidate)
                if converted.ndim == 2:
                    args_2d.append(converted)
                    continue
                raise ValueError("Each tensor product argument must be "
                                 "a 2-d array or 1-d vector.")

            product = _row_tensor_product(args_2d)
            n_rows = product.shape[0]
            tmp["count"] = tmp.setdefault("count", 0) + n_rows

            chunk_sum = np.atleast_2d(product.sum(axis=0))
            if "sum" not in tmp:
                tmp["sum"] = np.zeros(chunk_sum.shape)
            tmp["sum"] += chunk_sum
コード例 #30
0
ファイル: build.py プロジェクト: joaonatali/patsy
def test__ColumnBuilder():
    """Verify ``_ColumnBuilder`` column naming and matrix filling."""
    from nose.tools import assert_raises
    from patsy.contrasts import ContrastMatrix
    from patsy.categorical import C
    f1 = _MockFactor("f1")
    f2 = _MockFactor("f2")
    f3 = _MockFactor("f3")
    contrast = ContrastMatrix(np.array([[0, 0.5], [3, 0]]), ["[c1]", "[c2]"])
    factors = [f1, f2, f3]

    def data_for(f1_values, f2_codes):
        # Every scenario below uses the same f3 column.
        return {
            f1: atleast_2d_column_default(f1_values),
            f2: np.asarray(f2_codes),
            f3: atleast_2d_column_default([7.5, 2, -12]),
        }

    # Single-column f1 and f3 around the categorical f2.
    cb = _ColumnBuilder(factors, {f1: 1, f3: 1}, {f2: contrast})
    mat = np.empty((3, 2))
    assert cb.column_names() == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]
    cb.build(data_for([1, 2, 3], [0, 0, 1]), mat)
    want = [
        [0, 0.5 * 1 * 7.5],
        [0, 0.5 * 2 * 2],
        [3 * 3 * -12, 0],
    ]
    assert np.allclose(mat, want)
    # Check that missing categorical values blow up
    with_missing = data_for([1, 2, 3], [0, -1, 1])
    assert_raises(PatsyError, cb.build, with_missing, mat)

    # Two-column f1: every contrast column is repeated per f1 column.
    cb2 = _ColumnBuilder(factors, {f1: 2, f3: 1}, {f2: contrast})
    mat2 = np.empty((3, 4))
    cb2.build(data_for([[1, 2], [3, 4], [5, 6]], [0, 0, 1]), mat2)
    want_names = [
        "f1[0]:f2[c1]:f3",
        "f1[1]:f2[c1]:f3",
        "f1[0]:f2[c2]:f3",
        "f1[1]:f2[c2]:f3",
    ]
    assert cb2.column_names() == want_names
    want2 = [
        [0, 0, 0.5 * 1 * 7.5, 0.5 * 2 * 7.5],
        [0, 0, 0.5 * 3 * 2, 0.5 * 4 * 2],
        [3 * 5 * -12, 3 * 6 * -12, 0, 0],
    ]
    assert np.allclose(mat2, want2)
    # Check intercept building:
    cb_intercept = _ColumnBuilder([], {}, {})
    assert cb_intercept.column_names() == ["Intercept"]
    mat3 = np.empty((3, 1))
    intercept_data = {factor: [1, 2, 3] for factor in factors}
    cb_intercept.build(intercept_data, mat3)
    assert np.allclose(mat3, 1)
コード例 #31
0
ファイル: build.py プロジェクト: guyrt/patsy
 def eval(self, data):
     """Evaluate the categorical factor and return its integer codes as 2-d.

     The factor is evaluated on ``data``, passed through
     ``self._postprocessor`` when one is set, and must then be a
     ``Categorical`` whose levels equal ``self._expected_levels``.

     :returns: ``result.int_array`` converted by
       ``atleast_2d_column_default(..., preserve_pandas=True)`` -- per the
       original note, either a 2d ndarray or a DataFrame.
     :raises PatsyError: if the value is not a ``Categorical`` or its levels
       differ from the expected ones.
     """
     value = self.factor.eval(self._state, data)
     postprocessor = self._postprocessor
     if postprocessor is not None:
         value = postprocessor.transform(value)
     if not isinstance(value, Categorical):
         # value.__class__.__name__ would be better, but not
         # defined for old-style classes:
         raise PatsyError(
             "when evaluating categoric factor %r, I got a "
             "result that is not of type Categorical (but rather %s)"
             % (self.factor.name(), value.__class__),
             self.factor)
     if value.levels != self._expected_levels:
         raise PatsyError(
             "when evaluating categoric factor %r, I got Categorical "
             "data with unexpected levels (wanted %s, got %s)"
             % (self.factor.name(), self._expected_levels, value.levels),
             self.factor)
     int_codes = value.int_array
     _max_allowed_dim(1, int_codes, self.factor)
     # For consistency, evaluators *always* return 2d arrays (though in
     # this case it will always have only 1 column):
     return atleast_2d_column_default(int_codes, preserve_pandas=True)
コード例 #32
0
ファイル: test_state.py プロジェクト: arnab0000/Internships
def check_stateful(cls, accepts_multicolumn, input, output, *args, **kwargs):
    """Exercise a stateful transform against many equivalent input layouts.

    For every layout a fresh ``cls()`` instance memorizes the data chunk by
    chunk (``memorize_chunk`` followed by ``memorize_finish``).  ``transform``
    is then checked twice: once chunk by chunk, with the pieces stitched back
    together, and once on the whole re-assembled input.  Both results must be
    ``np.allclose`` to the expected output.

    :arg cls: Zero-argument callable returning an object that provides
      ``memorize_chunk``, ``memorize_finish`` and ``transform``.
    :arg accepts_multicolumn: If true, two-column inputs built from ``input``
      and its reverse are tested as well.
    :arg input: Input values, converted with ``np.asarray``.  The way the
      cases are built (``input[::-1]``, ``np.array(input)[:, None]``)
      suggests a one-dimensional sequence is expected.
    :arg output: Expected transform result for ``input``, converted with
      ``np.asarray``.
    :arg args: Extra positional arguments forwarded to every
      ``memorize_chunk`` and ``transform`` call.
    :arg kwargs: Extra keyword arguments forwarded the same way.
    :raises AssertionError: If any shape, index or value check fails.
    """
    input = np.asarray(input)
    output = np.asarray(output)
    test_cases = [
        # List input, one chunk
        ([input], output),
        # Scalar input, many chunks
        (input, output),
        # List input, many chunks:
        ([[n] for n in input], output),
        # 0-d array input, many chunks:
        ([np.array(n) for n in input], output),
        # 1-d array input, one chunk:
        ([np.array(input)], output),
        # 1-d array input, many chunks:
        ([np.array([n]) for n in input], output),
        # 2-d but 1 column input, one chunk:
        ([np.array(input)[:, None]], atleast_2d_column_default(output)),
        # 2-d but 1 column input, many chunks:
        ([np.array([[n]]) for n in input], atleast_2d_column_default(output)),
    ]
    if accepts_multicolumn:
        test_cases += [
            # 2-d array input, one chunk:
            ([np.column_stack((input, input[::-1]))],
             np.column_stack((output, output[::-1]))),
            # 2-d array input, many chunks:
            ([np.array([[input[i], input[-i-1]]]) for i in range(len(input))],
             np.column_stack((output, output[::-1]))),
        ]
    from patsy.util import have_pandas
    if have_pandas:
        import pandas
        pandas_type = (pandas.Series, pandas.DataFrame)
        pandas_index = np.linspace(0, 1, num=len(input))
        # 1d and 2d here refer to the dimensionality of the input
        if output.ndim == 1:
            output_1d = pandas.Series(output, index=pandas_index)
        else:
            output_1d = pandas.DataFrame(output, index=pandas_index)
        test_cases += [
            # Series input, one chunk
            ([pandas.Series(input, index=pandas_index)], output_1d),
            # Series input, many chunks
            ([pandas.Series([x], index=[idx])
              for (x, idx) in zip(input, pandas_index)],
             output_1d),
        ]
        if accepts_multicolumn:
            input_2d_2col = np.column_stack((input, input[::-1]))
            output_2d_2col = np.column_stack((output, output[::-1]))
            output_2col_dataframe = pandas.DataFrame(output_2d_2col,
                                                     index=pandas_index)
            test_cases += [
                # DataFrame input, one chunk
                ([pandas.DataFrame(input_2d_2col, index=pandas_index)],
                 output_2col_dataframe),
                # DataFrame input, many chunks
                ([pandas.DataFrame([input_2d_2col[i, :]],
                                   index=[pandas_index[i]])
                  for i in range(len(input))],
                 output_2col_dataframe),
            ]
    for input_obj, output_obj in test_cases:
        # Presumably printed so the layout under test shows up in captured
        # output when an assertion below fails.
        print(input_obj)
        t = cls()
        for input_chunk in input_obj:
            t.memorize_chunk(input_chunk, *args, **kwargs)
        t.memorize_finish()

        # First pass: transform chunk by chunk and stitch the pieces together.
        all_outputs = []
        for input_chunk in input_obj:
            output_chunk = t.transform(input_chunk, *args, **kwargs)
            if input.ndim == output.ndim:
                assert output_chunk.ndim == np.asarray(input_chunk).ndim
            all_outputs.append(output_chunk)
        if have_pandas and isinstance(all_outputs[0], pandas_type):
            all_output1 = pandas.concat(all_outputs)
            assert np.array_equal(all_output1.index, pandas_index)
        elif all_outputs[0].ndim == 0:
            all_output1 = np.array(all_outputs)
        elif all_outputs[0].ndim == 1:
            all_output1 = np.concatenate(all_outputs)
        else:
            # np.vstack replaces the np.row_stack alias, which is deprecated
            # in NumPy 2.0.
            all_output1 = np.vstack(all_outputs)
        assert all_output1.shape[0] == len(input)
        assert np.allclose(all_output1, output_obj)

        # Second pass: re-assemble the whole input and transform it at once.
        if np.asarray(input_obj[0]).ndim == 0:
            all_input = np.array(input_obj)
        elif have_pandas and isinstance(input_obj[0], pandas_type):
            # handles both Series and DataFrames
            all_input = pandas.concat(input_obj)
        elif np.asarray(input_obj[0]).ndim == 1:
            # Concatenate rather than stack: stacking 1-d chunks would turn
            # each chunk into a row of a 2-d matrix.
            all_input = np.concatenate(input_obj)
        else:
            all_input = np.vstack(input_obj)
        all_output2 = t.transform(all_input, *args, **kwargs)
        if have_pandas and isinstance(input_obj[0], pandas_type):
            assert np.array_equal(all_output2.index, pandas_index)
        if input.ndim == output.ndim:
            assert all_output2.ndim == all_input.ndim
        assert np.allclose(all_output2, output_obj)
コード例 #33
0
def check_stateful(cls, accepts_multicolumn, input, output, *args, **kwargs):
    """Exercise a stateful transform against many equivalent input layouts.

    For every layout a fresh ``cls()`` instance memorizes the data chunk by
    chunk (``memorize_chunk`` followed by ``memorize_finish``).  ``transform``
    is then checked twice: once chunk by chunk, with the pieces stitched back
    together, and once on the whole re-assembled input.  Both results must be
    ``np.allclose`` to the expected output.

    :arg cls: Zero-argument callable returning an object that provides
      ``memorize_chunk``, ``memorize_finish`` and ``transform``.
    :arg accepts_multicolumn: If true, two-column inputs built from ``input``
      and its reverse are tested as well.
    :arg input: Input values, converted with ``np.asarray``.  The way the
      cases are built (``input[::-1]``, ``np.array(input)[:, None]``)
      suggests a one-dimensional sequence is expected.
    :arg output: Expected transform result for ``input``, converted with
      ``np.asarray``.
    :arg args: Extra positional arguments forwarded to every
      ``memorize_chunk`` and ``transform`` call.
    :arg kwargs: Extra keyword arguments forwarded the same way.
    :raises AssertionError: If any shape, index or value check fails.
    """
    input = np.asarray(input)
    output = np.asarray(output)
    test_cases = [
        # List input, one chunk
        ([input], output),
        # Scalar input, many chunks
        (input, output),
        # List input, many chunks:
        ([[n] for n in input], output),
        # 0-d array input, many chunks:
        ([np.array(n) for n in input], output),
        # 1-d array input, one chunk:
        ([np.array(input)], output),
        # 1-d array input, many chunks:
        ([np.array([n]) for n in input], output),
        # 2-d but 1 column input, one chunk:
        ([np.array(input)[:, None]], atleast_2d_column_default(output)),
        # 2-d but 1 column input, many chunks:
        ([np.array([[n]]) for n in input], atleast_2d_column_default(output)),
    ]
    if accepts_multicolumn:
        test_cases += [
            # 2-d array input, one chunk:
            ([np.column_stack((input, input[::-1]))],
             np.column_stack((output, output[::-1]))),
            # 2-d array input, many chunks:
            ([np.array([[input[i], input[-i-1]]]) for i in range(len(input))],
             np.column_stack((output, output[::-1]))),
        ]
    from patsy.util import have_pandas
    if have_pandas:
        import pandas
        pandas_type = (pandas.Series, pandas.DataFrame)
        pandas_index = np.linspace(0, 1, num=len(input))
        # 1d and 2d here refer to the dimensionality of the input
        if output.ndim == 1:
            output_1d = pandas.Series(output, index=pandas_index)
        else:
            output_1d = pandas.DataFrame(output, index=pandas_index)
        test_cases += [
            # Series input, one chunk
            ([pandas.Series(input, index=pandas_index)], output_1d),
            # Series input, many chunks
            ([pandas.Series([x], index=[idx])
              for (x, idx) in zip(input, pandas_index)],
             output_1d),
        ]
        if accepts_multicolumn:
            input_2d_2col = np.column_stack((input, input[::-1]))
            output_2d_2col = np.column_stack((output, output[::-1]))
            output_2col_dataframe = pandas.DataFrame(output_2d_2col,
                                                     index=pandas_index)
            test_cases += [
                # DataFrame input, one chunk
                ([pandas.DataFrame(input_2d_2col, index=pandas_index)],
                 output_2col_dataframe),
                # DataFrame input, many chunks
                ([pandas.DataFrame([input_2d_2col[i, :]],
                                   index=[pandas_index[i]])
                  for i in range(len(input))],
                 output_2col_dataframe),
            ]
    for input_obj, output_obj in test_cases:
        # Presumably printed so the layout under test shows up in captured
        # output when an assertion below fails.
        print(input_obj)
        t = cls()
        for input_chunk in input_obj:
            t.memorize_chunk(input_chunk, *args, **kwargs)
        t.memorize_finish()

        # First pass: transform chunk by chunk and stitch the pieces together.
        all_outputs = []
        for input_chunk in input_obj:
            output_chunk = t.transform(input_chunk, *args, **kwargs)
            if input.ndim == output.ndim:
                assert output_chunk.ndim == np.asarray(input_chunk).ndim
            all_outputs.append(output_chunk)
        if have_pandas and isinstance(all_outputs[0], pandas_type):
            all_output1 = pandas.concat(all_outputs)
            assert np.array_equal(all_output1.index, pandas_index)
        elif all_outputs[0].ndim == 0:
            all_output1 = np.array(all_outputs)
        elif all_outputs[0].ndim == 1:
            all_output1 = np.concatenate(all_outputs)
        else:
            # np.vstack replaces the np.row_stack alias, which is deprecated
            # in NumPy 2.0.
            all_output1 = np.vstack(all_outputs)
        assert all_output1.shape[0] == len(input)
        assert np.allclose(all_output1, output_obj)

        # Second pass: re-assemble the whole input and transform it at once.
        if np.asarray(input_obj[0]).ndim == 0:
            all_input = np.array(input_obj)
        elif have_pandas and isinstance(input_obj[0], pandas_type):
            # handles both Series and DataFrames
            all_input = pandas.concat(input_obj)
        elif np.asarray(input_obj[0]).ndim == 1:
            # Concatenate rather than stack: stacking 1-d chunks would turn
            # each chunk into a row of a 2-d matrix.
            all_input = np.concatenate(input_obj)
        else:
            all_input = np.vstack(input_obj)
        all_output2 = t.transform(all_input, *args, **kwargs)
        if have_pandas and isinstance(input_obj[0], pandas_type):
            assert np.array_equal(all_output2.index, pandas_index)
        if input.ndim == output.ndim:
            assert all_output2.ndim == all_input.ndim
        assert np.allclose(all_output2, output_obj)
コード例 #34
0
ファイル: build.py プロジェクト: MarceloDL-A/metodos_python
def test__subterm_column_names_iter_and__build_subterm():
    """Check subterm column naming and matrix filling.

    Covers three situations: a numerical:categorical:numerical interaction
    with one-column numerical factors, the same interaction with a
    two-column ``f1``, and the factor-less (intercept) subterm.  Also checks
    that a categorical code of -1 makes ``_build_subterm`` raise
    ``PatsyError``.
    """
    import pytest
    from patsy.contrasts import ContrastMatrix
    from patsy.categorical import C

    f1, f2, f3 = [_MockFactor(label) for label in ("f1", "f2", "f3")]
    contrast = ContrastMatrix(np.array([[0, 0.5], [3, 0]]),
                              ["[c1]", "[c2]"])
    contrast_matrices = {f2: contrast}

    def numerical_info(factor, width):
        # FactorInfo for a numerical factor spanning ``width`` columns.
        return FactorInfo(factor, "numerical", {},
                          num_columns=width, categories=None)

    def factor_data(f1_values, f2_codes):
        # Fresh per-factor data for one _build_subterm call.
        return {
            f1: atleast_2d_column_default(f1_values),
            f2: np.asarray(f2_codes),
            f3: atleast_2d_column_default([7.5, 2, -12]),
        }

    # --- f1 has a single column -------------------------------------------
    infos_one_col = {
        f1: numerical_info(f1, 1),
        f2: FactorInfo(f2, "categorical", {},
                       num_columns=None, categories=["a", "b"]),
        f3: numerical_info(f3, 1),
    }
    subterm_one_col = SubtermInfo([f1, f2, f3], contrast_matrices, 2)
    names_one_col = list(
        _subterm_column_names_iter(infos_one_col, subterm_one_col))
    assert names_one_col == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]

    filled = np.empty((3, 2))
    _build_subterm(subterm_one_col, infos_one_col,
                   factor_data([1, 2, 3], [0, 0, 1]), filled)
    expected_one_col = [
        [0, 0.5 * 1 * 7.5],
        [0, 0.5 * 2 * 2],
        [3 * 3 * -12, 0],
    ]
    assert np.allclose(filled, expected_one_col)

    # Check that missing categorical values blow up.  The data is built
    # before entering the context so only _build_subterm itself is guarded.
    data_with_missing = factor_data([1, 2, 3], [0, -1, 1])
    with pytest.raises(PatsyError):
        _build_subterm(subterm_one_col, infos_one_col,
                       data_with_missing, filled)

    # --- f1 has two columns -----------------------------------------------
    infos_two_col = infos_one_col.copy()
    infos_two_col[f1] = numerical_info(f1, 2)
    subterm_two_col = SubtermInfo([f1, f2, f3], contrast_matrices, 4)
    names_two_col = list(
        _subterm_column_names_iter(infos_two_col, subterm_two_col))
    assert names_two_col == [
        "f1[0]:f2[c1]:f3",
        "f1[1]:f2[c1]:f3",
        "f1[0]:f2[c2]:f3",
        "f1[1]:f2[c2]:f3",
    ]

    filled_wide = np.empty((3, 4))
    _build_subterm(subterm_two_col, infos_two_col,
                   factor_data([[1, 2], [3, 4], [5, 6]], [0, 0, 1]),
                   filled_wide)
    expected_two_col = [
        [0, 0, 0.5 * 1 * 7.5, 0.5 * 2 * 7.5],
        [0, 0, 0.5 * 3 * 2, 0.5 * 4 * 2],
        [3 * 5 * -12, 3 * 6 * -12, 0, 0],
    ]
    assert np.allclose(filled_wide, expected_two_col)

    # --- subterm without factors: the intercept ---------------------------
    subterm_int = SubtermInfo([], {}, 1)
    intercept_names = list(_subterm_column_names_iter({}, subterm_int))
    assert intercept_names == ["Intercept"]

    filled_intercept = np.empty((3, 1))
    unused_data = {f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]}
    _build_subterm(subterm_int, {}, unused_data, filled_intercept)
    assert np.allclose(filled_intercept, 1)