def _examine_factor_types(factors, factor_states, data_iter_maker, NA_action): num_column_counts = {} cat_sniffers = {} examine_needed = set(factors) for data in data_iter_maker(): for factor in list(examine_needed): value = factor.eval(factor_states[factor], data) if factor in cat_sniffers or guess_categorical(value): if factor not in cat_sniffers: cat_sniffers[factor] = CategoricalSniffer(NA_action, factor.origin) done = cat_sniffers[factor].sniff(value) if done: examine_needed.remove(factor) else: # Numeric value = atleast_2d_column_default(value) _max_allowed_dim(2, value, factor) column_count = value.shape[1] num_column_counts[factor] = column_count examine_needed.remove(factor) if not examine_needed: break # Pull out the levels cat_levels_contrasts = {} for factor, sniffer in six.iteritems(cat_sniffers): cat_levels_contrasts[factor] = sniffer.levels_contrast() return (num_column_counts, cat_levels_contrasts)
def from_array(cls, array_like, default_column_prefix="column"):
    """Return a DesignInfo describing `array_like`, building one if needed.

    When `array_like` carries a ``.design_info`` attribute that is an
    instance of `cls`, that object is returned as-is. In every other case a
    fresh DesignInfo is built. Its column names come from `array_like`
    itself when it has non-integer column labels (e.g. a pandas DataFrame
    with named columns); otherwise they are made up by appending each
    column label or position to `default_column_prefix`.

    :func:`dmatrix` (for example) relies on this to obtain a DesignInfo
    when it is handed an arbitrary matrix.

    :arg array_like: An ndarray or pandas container.
    :arg default_column_prefix: Prefix used whenever column names have to
      be invented.
    :returns: a DesignInfo object
    """
    attached = getattr(array_like, "design_info", None)
    if isinstance(attached, cls):
        return attached

    matrix = atleast_2d_column_default(array_like, preserve_pandas=True)
    if matrix.ndim > 2:
        raise ValueError("design matrix can't have >2 dimensions")

    labels = getattr(matrix, "columns", range(matrix.shape[1]))
    labels_are_names = (hasattr(labels, "dtype")
                        and not safe_issubdtype(labels.dtype, np.integer))
    if labels_are_names:
        names = list(map(str, labels))
    else:
        names = ["%s%s" % (default_column_prefix, label) for label in labels]
    return DesignInfo(names)
def _eval_factor(factor_info, data, NA_action):
    """Evaluate one factor on `data` and report which rows are missing.

    :arg factor_info: Object exposing ``factor``, ``state``, ``type``
      (``"numerical"`` or ``"categorical"``), ``num_columns`` and
      ``categories``.
    :arg data: The data chunk handed to ``factor.eval``.
    :arg NA_action: Used to compute the missing-value mask for numerical
      factors and passed to ``categorical_to_int`` for categorical ones.
    :returns: A tuple ``(result, is_NA)``. For numerical factors `result`
      is the 2-d value returned by ``atleast_2d_column_default``; for
      categorical factors it is the 1-d integer coding, and `is_NA` marks
      the entries equal to ``-1``.
    :raises PatsyError: If a numerical factor yields the wrong number of
      columns or non-numeric data.
    """
    factor = factor_info.factor
    result = factor.eval(factor_info.state, data)
    # Returns either a 2d ndarray, or a DataFrame, plus is_NA mask
    if factor_info.type == "numerical":
        result = atleast_2d_column_default(result, preserve_pandas=True)
        _max_allowed_dim(2, result, factor)
        if result.shape[1] != factor_info.num_columns:
            # Argument order matters: "got" is the observed width,
            # "expecting" is the recorded one.
            raise PatsyError("when evaluating factor %s, I got %s columns "
                             "instead of the %s I was expecting"
                             % (factor.name(),
                                result.shape[1],
                                factor_info.num_columns),
                             factor)
        # Take the dtype from an ndarray view of the result, so that this
        # also works when `result` is a pandas DataFrame (which has no
        # single `.dtype` attribute).
        result_dtype = np.asarray(result).dtype
        if not safe_issubdtype(result_dtype, np.number):
            raise PatsyError("when evaluating numeric factor %s, "
                             "I got non-numeric data of type '%s'"
                             % (factor.name(), result_dtype),
                             factor)
        return result, NA_action.is_numerical_NA(result)
    # returns either a 1d ndarray or a pandas.Series, plus is_NA mask
    else:
        assert factor_info.type == "categorical"
        result = categorical_to_int(result, factor_info.categories, NA_action,
                                    origin=factor_info.factor)
        assert result.ndim == 1
        return result, np.asarray(result == -1)
def assert_full_rank(m):
    """Assert that `m` has full column rank.

    `m` is first passed through ``atleast_2d_column_default``. A matrix
    with zero columns is accepted immediately (and ``True`` is returned).
    Otherwise the rank is taken to be the number of singular values larger
    than ``1e-10``, and it must equal the number of columns.

    :arg m: Array-like to check.
    :raises AssertionError: If the estimated rank differs from the number
      of columns.
    """
    m = atleast_2d_column_default(m)
    if m.shape[1] == 0:
        return True
    # Only the singular values are needed, so skip computing U and V.
    singular_values = np.linalg.svd(m, compute_uv=False)
    rank = np.sum(singular_values > 1e-10)
    assert rank == m.shape[1]
def from_array(cls, array_like, default_column_prefix="column"):
    """Find or construct a DesignInfo appropriate for a given array_like.

    If the input `array_like` already has a ``.design_info`` attribute
    that is an instance of `cls`, then it will be returned. Otherwise, a
    new DesignInfo object will be constructed, using names either taken
    from the `array_like` (e.g., for a pandas DataFrame with named
    columns), or constructed using `default_column_prefix`.

    This is how :func:`dmatrix` (for example) creates a DesignInfo object
    if an arbitrary matrix is passed in.

    :arg array_like: An ndarray or pandas container.
    :arg default_column_prefix: If it's necessary to invent column names,
      then this will be used to construct them.
    :returns: a DesignInfo object
    :raises ValueError: If `array_like` has more than 2 dimensions.
    """
    if (hasattr(array_like, "design_info")
            and isinstance(array_like.design_info, cls)):
        return array_like.design_info
    arr = atleast_2d_column_default(array_like, preserve_pandas=True)
    if arr.ndim > 2:
        raise ValueError("design matrix can't have >2 dimensions")
    columns = getattr(arr, "columns", range(arr.shape[1]))
    # Duck-type on `dtype` instead of requiring an ndarray: a pandas Index
    # is not an ndarray subclass in current pandas, and an isinstance check
    # would throw away real column names. The dtype is read from an ndarray
    # view so that np.issubdtype always receives a numpy dtype.
    if (hasattr(columns, "dtype")
            and not np.issubdtype(np.asarray(columns).dtype, np.integer)):
        column_names = [str(obj) for obj in columns]
    else:
        column_names = ["%s%s" % (default_column_prefix, i)
                        for i in columns]
    return DesignInfo(column_names)
def _examine_factor_types(factors, factor_states, data_iter_maker): num_column_counts = {} cat_levels_contrasts = {} cat_postprocessors = {} prefinished_postprocessors = {} examine_needed = set(factors) for data in data_iter_maker(): # We might have gathered all the information we need after the first # chunk of data. If so, then we shouldn't spend time loading all the # rest of the chunks. if not examine_needed: break for factor in list(examine_needed): value = factor.eval(factor_states[factor], data) if isinstance(value, Categorical): postprocessor = CategoricalTransform(levels=value.levels) prefinished_postprocessors[factor] = postprocessor cat_levels_contrasts[factor] = (value.levels, value.contrast) examine_needed.remove(factor) continue value = atleast_2d_column_default(value) _max_allowed_dim(2, value, factor) if np.issubdtype(value.dtype, np.number): column_count = value.shape[1] num_column_counts[factor] = column_count examine_needed.remove(factor) # issubdtype(X, bool) isn't reliable -- it returns true for # X == int! 
So check the kind code instead: elif value.dtype.kind == "b": # Special case: give it a transformer, but don't bother # processing the rest of the data if value.shape[1] > 1: msg = ("factor '%s' evaluates to a boolean array with " "%s columns; I can only handle single-column " "boolean arrays" % (factor.name(), value.shape[1])) raise PatsyError(msg, factor) cat_postprocessors[factor] = _BoolToCat(factor) examine_needed.remove(factor) else: if value.shape[1] > 1: msg = ("factor '%s' appears to be categorical but has " "%s columns; I can only handle single-column " "categorical factors" % (factor.name(), value.shape[1])) raise PatsyError(msg, factor) if factor not in cat_postprocessors: cat_postprocessors[factor] = CategoricalTransform() processor = cat_postprocessors[factor] processor.memorize_chunk(value) for factor, processor in cat_postprocessors.iteritems(): processor.memorize_finish() cat_levels_contrasts[factor] = (processor.levels(), None) cat_postprocessors.update(prefinished_postprocessors) assert set(cat_postprocessors) == set(cat_levels_contrasts) return (num_column_counts, cat_levels_contrasts, cat_postprocessors)
def memorize_chunk(self, x):
    """Fold one chunk of data into the running row count and column sums."""
    chunk = atleast_2d_column_default(x)
    self._count += chunk.shape[0]
    # Summing along axis 0 keeps one total per column, which is what lets
    # this handle potentially multi-column input.
    chunk_total = np.sum(chunk, 0, dtype=wide_dtype_for(chunk))
    if self._sum is not None:
        self._sum += chunk_total
    else:
        self._sum = chunk_total
def transform(self, *args, **kwargs):
    """Build the tensor-product design matrix for the given arguments.

    Every positional argument is converted with
    ``atleast_2d_column_default`` and must come out 2-dimensional. The
    converted arguments and ``self._constraints`` are then handed to
    ``_get_te_dmatrix``. Keyword arguments are accepted but not used here.

    :raises ValueError: If an argument is not 2-d after conversion.
    """
    def as_columns(value):
        converted = atleast_2d_column_default(value)
        if converted.ndim != 2:
            raise ValueError("Each tensor product argument must be "
                             "a 2-d array or 1-d vector.")
        return converted

    marginals = [as_columns(value) for value in args]
    return _get_te_dmatrix(marginals, self._constraints)
def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
    """Update the running per-column mean and M2 with one chunk of rows.

    The accumulators are allocated on the first chunk, one slot per
    column. Rows are then folded in one at a time. `center`, `rescale`
    and `ddof` are accepted but not used while memorizing.
    """
    rows = atleast_2d_column_default(x)
    if self.current_mean is None:
        n_columns = rows.shape[1]
        self.current_mean = np.zeros(n_columns, dtype=wide_dtype_for(rows))
        self.current_M2 = np.zeros(n_columns, dtype=wide_dtype_for(rows))
    # XX this can surely be vectorized but I am feeling lazy:
    for row_index in range(rows.shape[0]):
        row = rows[row_index, :]
        self.current_n += 1
        delta = row - self.current_mean
        self.current_mean += delta / self.current_n
        # Uses the mean *after* the update, together with the delta taken
        # before it.
        self.current_M2 += delta * (row - self.current_mean)
def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
    """Update the running per-column mean and M2 with one chunk of rows.

    :arg x: Data chunk; converted with ``atleast_2d_column_default``.
    :arg center: Accepted but not used while memorizing.
    :arg rescale: Accepted but not used while memorizing.
    :arg ddof: Accepted but not used while memorizing.
    """
    x = atleast_2d_column_default(x)
    if self.current_mean is None:
        # First chunk: allocate one accumulator slot per column.
        self.current_mean = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
        self.current_M2 = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
    # XX this can surely be vectorized but I am feeling lazy:
    # `range` (not the Python-2-only `xrange`) so this runs on Python 3 too.
    for i in range(x.shape[0]):
        self.current_n += 1
        delta = x[i, :] - self.current_mean
        self.current_mean += delta / self.current_n
        # Uses the mean *after* the update, together with the delta taken
        # before it.
        self.current_M2 += delta * (x[i, :] - self.current_mean)
def transform(self, x, center=True, rescale=True, ddof=0):
    """Center and/or rescale `x` using the memorized mean and M2.

    :arg x: Data to transform; it is copied and converted to float first.
    :arg center: If true, subtract ``self.current_mean``.
    :arg rescale: If true, divide by
      ``sqrt(self.current_M2 / (self.current_n - ddof))``.
    :arg ddof: Subtracted from the row count in the rescaling denominator.
    :returns: The transformed data, passed through
      ``pandas_friendly_reshape`` with the shape of the converted input.
    """
    # XX: everything is forced to double-precision real here, even for
    # single- or extended-precision or complex input. Attempts to avoid that
    # without breaking something else (e.g. by requiring an extra copy) got
    # tangled up in knots.
    values = asarray_or_pandas(x, copy=True, dtype=float)
    values_2d = atleast_2d_column_default(values, preserve_pandas=True)
    if center:
        values_2d -= self.current_mean
    if rescale:
        variance = self.current_M2 / (self.current_n - ddof)
        values_2d /= np.sqrt(variance)
    return pandas_friendly_reshape(values_2d, values.shape)
def transform(self, x):
    """Subtract the memorized per-column mean from `x`.

    The mean is ``self._sum / self._count``, cast to the dtype of `x` --
    except that integer input gets a float mean. The result is passed
    through ``pandas_friendly_reshape`` with the shape of the input.
    """
    data = asarray_or_pandas(x)
    # No data is copied here unless the input is a DataFrame with
    # heterogeneous types, and in that case the types get munged anyway, so
    # the copy isn't a big deal.
    data_dtype = np.asarray(data).dtype
    mean_dtype = float if np.issubdtype(data_dtype, np.integer) else data_dtype
    mean = np.asarray(self._sum / self._count, dtype=mean_dtype)
    data_2d = atleast_2d_column_default(data, preserve_pandas=True)
    return pandas_friendly_reshape(data_2d - mean, data.shape)
def transform(self, x):
    """Return `x` with the memorized column means removed.

    Integer input is centered with a float mean; any other input keeps its
    own dtype for the mean. The centered values are passed through
    ``pandas_friendly_reshape`` with the shape of the input.
    """
    values = asarray_or_pandas(x)
    # This conversion only copies when the input is a DataFrame with
    # heterogeneous types -- and then the types are being munged anyway, so
    # copying isn't a big deal.
    values_arr = np.asarray(values)
    if not safe_issubdtype(values_arr.dtype, np.integer):
        mean_dtype = values_arr.dtype
    else:
        mean_dtype = float
    column_means = np.asarray(self._sum / self._count, dtype=mean_dtype)
    values_2d = atleast_2d_column_default(values, preserve_pandas=True)
    shifted = values_2d - column_means
    return pandas_friendly_reshape(shifted, values.shape)
def _regularize_matrix(m, default_column_prefix):
    """Convert `m` to the requested output type and remember its index.

    A DesignInfo is obtained via ``DesignInfo.from_array``. The name
    `return_type` is not a parameter; it is looked up in the surrounding
    scope. When it equals ``"dataframe"`` the result is a pandas DataFrame
    whose columns and ``design_info`` attribute are set from that
    DesignInfo; otherwise the result is a ``DesignMatrix``.

    :returns: A tuple ``(matrix, orig_index)``, where `orig_index` is
      ``m.index`` if `m` was a pandas Series or DataFrame and ``None``
      otherwise.
    """
    info = DesignInfo.from_array(m, default_column_prefix)
    is_pandas_input = have_pandas and isinstance(
        m, (pandas.Series, pandas.DataFrame))
    orig_index = m.index if is_pandas_input else None

    if return_type != "dataframe":
        return (DesignMatrix(m, info), orig_index)

    frame = pandas.DataFrame(
        atleast_2d_column_default(m, preserve_pandas=True))
    frame.columns = info.column_names
    frame.design_info = info
    return (frame, orig_index)
def __init__(self, variable_names, coefs, constants=None):
    """Store and validate a set of linear constraints.

    :arg variable_names: Iterable of variable names; stored as a list.
    :arg coefs: Coefficients, converted to a 2-d float array with one
      column per variable name and one row per constraint.
    :arg constants: Optional constants, converted to a float column
      matrix with one row per constraint. Defaults to all zeros.
    :raises ValueError: If `constants` cannot be treated as a column
      matrix, if `coefs` has the wrong shape or no rows, or if `coefs` and
      `constants` disagree on the number of rows.
    """
    self.variable_names = list(variable_names)
    self.coefs = np.atleast_2d(np.asarray(coefs, dtype=float))
    if constants is None:
        constants = np.zeros(self.coefs.shape[0], dtype=float)
    constants = np.asarray(constants, dtype=float)
    self.constants = atleast_2d_column_default(constants)
    if self.constants.ndim != 2 or self.constants.shape[1] != 1:
        raise ValueError("constants is not (convertible to) a column matrix")
    # Measure the stored list, not the raw argument: the argument may be an
    # iterator, which has no len() and has already been consumed above.
    if (self.coefs.ndim != 2
            or self.coefs.shape[1] != len(self.variable_names)):
        raise ValueError("wrong shape for coefs")
    if self.coefs.shape[0] == 0:
        raise ValueError("must have at least one row in constraint matrix")
    if self.coefs.shape[0] != self.constants.shape[0]:
        raise ValueError("shape mismatch between coefs and constants")
def __new__(cls, input_array, design_info=None,
            default_column_prefix="column"):
    """Build a DesignMatrix, or cast an existing matrix to one.

    Calling::

      DesignMatrix(my_array)

    turns an arbitrary array_like object into a DesignMatrix. What comes
    back is always a two-dimensional ndarray with a real-valued floating
    point dtype, carrying a ``.design_info`` attribute that matches its
    shape.

    When no `design_info` is supplied, one is made with
    :meth:`DesignInfo.from_array` and the given `default_column_prefix`.

    Depending on the input array, the input may be passed through
    unchanged, or a view may be created.
    """
    # An existing DesignMatrix is passed straight through, but only if it
    # really has a design_info: numpy cannot be stopped from turning
    # non-design-matrix arrays into DesignMatrix instances (for example,
    # my_dm.diagonal() gives a DesignMatrix object that lacks the
    # design_info attribute).
    is_complete_design_matrix = (isinstance(input_array, DesignMatrix)
                                 and hasattr(input_array, "design_info"))
    if is_complete_design_matrix:
        return input_array

    matrix = atleast_2d_column_default(input_array).view(cls)
    # Integer data is promoted to floating point.
    if np.issubdtype(matrix.dtype, np.integer):
        matrix = np.asarray(matrix, dtype=float).view(cls)
    if matrix.ndim > 2:
        raise ValueError("DesignMatrix must be 2d")
    assert matrix.ndim == 2

    if design_info is None:
        design_info = DesignInfo.from_array(matrix, default_column_prefix)
    name_count = len(design_info.column_names)
    if name_count != matrix.shape[1]:
        raise ValueError("wrong number of column names for design matrix "
                         "(got %s, wanted %s)"
                         % (name_count, matrix.shape[1]))
    matrix.design_info = design_info

    if not np.issubdtype(matrix.dtype, np.floating):
        raise ValueError(
            "design matrix must be real-valued floating point")
    return matrix
def eval(self, data, NA_action):
    """Evaluate this numerical factor on `data`.

    :arg data: The data chunk handed to ``self.factor.eval``.
    :arg NA_action: Used to compute the missing-value mask.
    :returns: A tuple ``(result, is_NA)``, where `result` is the 2-d value
      (pandas objects are preserved) and `is_NA` is
      ``NA_action.is_numerical_NA(result)``.
    :raises PatsyError: If the number of columns differs from
      ``self._expected_columns`` or the data is not numeric.
    """
    result = self.factor.eval(self._state, data)
    result = atleast_2d_column_default(result, preserve_pandas=True)
    _max_allowed_dim(2, result, self.factor)
    if result.shape[1] != self._expected_columns:
        # Argument order matters: "got" is the observed width, "expecting"
        # is the recorded one.
        raise PatsyError(
            "when evaluating factor %s, I got %s columns "
            "instead of the %s I was expecting"
            % (self.factor.name(), result.shape[1], self._expected_columns),
            self.factor)
    # Take the dtype from an ndarray view of the result, so that this also
    # works when `result` is a pandas DataFrame (which has no single
    # `.dtype` attribute).
    result_dtype = np.asarray(result).dtype
    if not np.issubdtype(result_dtype, np.number):
        raise PatsyError(
            "when evaluating numeric factor %s, "
            "I got non-numeric data of type '%s'"
            % (self.factor.name(), result_dtype),
            self.factor)
    return result, NA_action.is_numerical_NA(result)
def eval(self, data, NA_action):
    """Evaluate this numerical factor on `data`.

    :arg data: The data chunk handed to ``self.factor.eval``.
    :arg NA_action: Used to compute the missing-value mask.
    :returns: A tuple ``(result, is_NA)``, where `result` is the 2-d value
      (pandas objects are preserved) and `is_NA` is
      ``NA_action.is_numerical_NA(result)``.
    :raises PatsyError: If the number of columns differs from
      ``self._expected_columns`` or the data is not numeric.
    """
    factor = self.factor
    result = factor.eval(self._state, data)
    result = atleast_2d_column_default(result, preserve_pandas=True)
    _max_allowed_dim(2, result, factor)
    if result.shape[1] != self._expected_columns:
        # The observed column count fills the "got" slot and the expected
        # count fills the "expecting" slot.
        raise PatsyError("when evaluating factor %s, I got %s columns "
                         "instead of the %s I was expecting"
                         % (factor.name(),
                            result.shape[1],
                            self._expected_columns),
                         factor)
    # A pandas DataFrame has no single `.dtype`, so the dtype is read from
    # an ndarray view and reused for both the check and the message.
    result_dtype = np.asarray(result).dtype
    if not np.issubdtype(result_dtype, np.number):
        raise PatsyError("when evaluating numeric factor %s, "
                         "I got non-numeric data of type '%s'"
                         % (factor.name(), result_dtype),
                         factor)
    return result, NA_action.is_numerical_NA(result)
def __init__(self, variable_names, coefs, constants=None):
    """Store and validate a set of linear constraints.

    :arg variable_names: Iterable of variable names; stored as a list.
    :arg coefs: Coefficients, converted to a 2-d float array with one
      column per variable name and one row per constraint.
    :arg constants: Optional constants, converted to a float column
      matrix with one row per constraint. Defaults to all zeros.
    :raises ValueError: If `constants` cannot be treated as a column
      matrix, if `coefs` has the wrong shape or no rows, or if `coefs` and
      `constants` disagree on the number of rows.
    """
    self.variable_names = list(variable_names)
    self.coefs = np.atleast_2d(np.asarray(coefs, dtype=float))
    n_constraints = self.coefs.shape[0]
    if constants is None:
        constants = np.zeros(n_constraints, dtype=float)
    constants = np.asarray(constants, dtype=float)
    self.constants = atleast_2d_column_default(constants)
    if self.constants.ndim != 2 or self.constants.shape[1] != 1:
        raise ValueError(
            "constants is not (convertible to) a column matrix")
    # Compare against the stored list: the raw argument may be a one-shot
    # iterator that has no len() and was already consumed by list() above.
    n_variables = len(self.variable_names)
    if self.coefs.ndim != 2 or self.coefs.shape[1] != n_variables:
        raise ValueError("wrong shape for coefs")
    if n_constraints == 0:
        raise ValueError("must have at least one row in constraint matrix")
    if n_constraints != self.constants.shape[0]:
        raise ValueError("shape mismatch between coefs and constants")
def test__ColumnBuilder():
    """Check column naming and building for ``_ColumnBuilder``.

    Covers a numeric:categorical:numeric interaction with single-column and
    two-column numeric factors, the error raised for a missing categorical
    value, and the intercept-only builder.
    """
    from patsy.contrasts import ContrastMatrix

    f1 = _MockFactor("f1")
    f2 = _MockFactor("f2")
    f3 = _MockFactor("f3")
    contrast = ContrastMatrix(np.array([[0, 0.5], [3, 0]]), ["[c1]", "[c2]"])

    cb = _ColumnBuilder([f1, f2, f3], {f1: 1, f3: 1}, {f2: contrast})
    mat = np.empty((3, 2))
    assert cb.column_names() == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]
    cb.build(
        {
            f1: atleast_2d_column_default([1, 2, 3]),
            f2: np.asarray([0, 0, 1]),
            f3: atleast_2d_column_default([7.5, 2, -12]),
        },
        mat,
    )
    assert np.allclose(mat, [[0, 0.5 * 1 * 7.5],
                             [0, 0.5 * 2 * 2],
                             [3 * 3 * -12, 0]])

    # Check that missing categorical values blow up. A plain try/except is
    # used so the test does not depend on the unmaintained nose package.
    bad_data = {
        f1: atleast_2d_column_default([1, 2, 3]),
        f2: np.asarray([0, -1, 1]),
        f3: atleast_2d_column_default([7.5, 2, -12]),
    }
    try:
        cb.build(bad_data, mat)
    except PatsyError:
        pass
    else:
        raise AssertionError("PatsyError not raised")

    cb2 = _ColumnBuilder([f1, f2, f3], {f1: 2, f3: 1}, {f2: contrast})
    mat2 = np.empty((3, 4))
    cb2.build(
        {
            f1: atleast_2d_column_default([[1, 2], [3, 4], [5, 6]]),
            f2: np.asarray([0, 0, 1]),
            f3: atleast_2d_column_default([7.5, 2, -12]),
        },
        mat2,
    )
    assert cb2.column_names() == ["f1[0]:f2[c1]:f3",
                                  "f1[1]:f2[c1]:f3",
                                  "f1[0]:f2[c2]:f3",
                                  "f1[1]:f2[c2]:f3"]
    assert np.allclose(
        mat2,
        [[0, 0, 0.5 * 1 * 7.5, 0.5 * 2 * 7.5],
         [0, 0, 0.5 * 3 * 2, 0.5 * 4 * 2],
         [3 * 5 * -12, 3 * 6 * -12, 0, 0]],
    )

    # Check intercept building:
    cb_intercept = _ColumnBuilder([], {}, {})
    assert cb_intercept.column_names() == ["Intercept"]
    mat3 = np.empty((3, 1))
    cb_intercept.build({f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]}, mat3)
    assert np.allclose(mat3, 1)
def __new__(cls, input_array, design_info=None,
            default_column_prefix="column"):
    """Make a DesignMatrix from `input_array`, or pass one through.

    A call such as::

      DesignMatrix(my_array)

    converts an arbitrary array_like object into a DesignMatrix. The
    returned object is guaranteed to be a two-dimensional ndarray with a
    real-valued floating point dtype, and its ``.design_info`` attribute
    matches its shape.

    If `design_info` is omitted, it is created through
    :meth:`DesignInfo.from_array` with the given `default_column_prefix`.

    Depending on the input array, this may return its input unchanged, or
    it may create a view.
    """
    # Existing DesignMatrixes go straight through. Checking for
    # design_info is required: numpy is sort of annoying and will happily
    # hand out DesignMatrix instances that are not real design matrices
    # (e.g. my_dm.diagonal() is a DesignMatrix object with no design_info
    # attribute).
    if isinstance(input_array, DesignMatrix):
        if hasattr(input_array, "design_info"):
            return input_array

    result = atleast_2d_column_default(input_array).view(cls)
    needs_upcast = safe_issubdtype(result.dtype, np.integer)
    if needs_upcast:
        # Integers are upcast to floating point.
        result = np.asarray(result, dtype=float).view(cls)
    if result.ndim > 2:
        raise ValueError("DesignMatrix must be 2d")
    assert result.ndim == 2

    if design_info is None:
        design_info = DesignInfo.from_array(result, default_column_prefix)
    names_given = len(design_info.column_names)
    columns_present = result.shape[1]
    if names_given != columns_present:
        raise ValueError("wrong number of column names for design matrix "
                         "(got %s, wanted %s)"
                         % (names_given, columns_present))
    result.design_info = design_info

    if not safe_issubdtype(result.dtype, np.floating):
        raise ValueError("design matrix must be real-valued floating point")
    return result
def memorize_chunk(self, *args, **kwargs):
    """Accumulate the statistics needed for ``"center"`` constraints.

    The ``constraints`` keyword argument is recorded in ``self._tmp`` the
    first time this is called. Only when it is the string ``"center"`` is
    any data processed: the row-wise tensor product of the arguments is
    formed and its row count and column sums are added to
    ``self._tmp["count"]`` and ``self._tmp["sum"]``.

    :arg args: Tensor product arguments (1-d vectors or 2-d arrays).
    :arg kwargs: May contain ``constraints``.
    :raises ValueError: If an argument is not 2-d after conversion.
    """
    constraints = self._tmp.setdefault("constraints",
                                       kwargs.get("constraints"))
    # Check the type first: `constraints` may be an array, and comparing an
    # array to a string with == can yield an elementwise result whose truth
    # value is ambiguous.
    if isinstance(constraints, str) and constraints == "center":
        args_2d = []
        for arg in args:
            arg = atleast_2d_column_default(arg)
            if arg.ndim != 2:
                raise ValueError("Each tensor product argument must be "
                                 "a 2-d array or 1-d vector.")
            args_2d.append(arg)

        tp = _row_tensor_product(args_2d)
        self._tmp.setdefault("count", 0)
        self._tmp["count"] += tp.shape[0]

        chunk_sum = np.atleast_2d(tp.sum(axis=0))
        self._tmp.setdefault("sum", np.zeros(chunk_sum.shape))
        self._tmp["sum"] += chunk_sum
def test__ColumnBuilder():
    """Check column naming and building for ``_ColumnBuilder``.

    Covers a numeric:categorical:numeric interaction with single-column and
    two-column numeric factors, the error raised for a missing categorical
    value, and the intercept-only builder.
    """
    from patsy.contrasts import ContrastMatrix

    f1 = _MockFactor("f1")
    f2 = _MockFactor("f2")
    f3 = _MockFactor("f3")
    contrast = ContrastMatrix(np.array([[0, 0.5], [3, 0]]), ["[c1]", "[c2]"])

    cb = _ColumnBuilder([f1, f2, f3], {f1: 1, f3: 1}, {f2: contrast})
    mat = np.empty((3, 2))
    assert cb.column_names() == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]
    cb.build(
        {
            f1: atleast_2d_column_default([1, 2, 3]),
            f2: np.asarray([0, 0, 1]),
            f3: atleast_2d_column_default([7.5, 2, -12])
        }, mat)
    assert np.allclose(
        mat, [[0, 0.5 * 1 * 7.5], [0, 0.5 * 2 * 2], [3 * 3 * -12, 0]])

    # Check that missing categorical values blow up. This is spelled out
    # with try/except/else so that the unmaintained nose package is not
    # needed just for assert_raises.
    try:
        cb.build(
            {
                f1: atleast_2d_column_default([1, 2, 3]),
                f2: np.asarray([0, -1, 1]),
                f3: atleast_2d_column_default([7.5, 2, -12])
            }, mat)
    except PatsyError:
        pass
    else:
        raise AssertionError("PatsyError not raised")

    cb2 = _ColumnBuilder([f1, f2, f3], {f1: 2, f3: 1}, {f2: contrast})
    mat2 = np.empty((3, 4))
    cb2.build(
        {
            f1: atleast_2d_column_default([[1, 2], [3, 4], [5, 6]]),
            f2: np.asarray([0, 0, 1]),
            f3: atleast_2d_column_default([7.5, 2, -12])
        }, mat2)
    assert cb2.column_names() == [
        "f1[0]:f2[c1]:f3", "f1[1]:f2[c1]:f3", "f1[0]:f2[c2]:f3",
        "f1[1]:f2[c2]:f3"
    ]
    assert np.allclose(
        mat2, [[0, 0, 0.5 * 1 * 7.5, 0.5 * 2 * 7.5],
               [0, 0, 0.5 * 3 * 2, 0.5 * 4 * 2],
               [3 * 5 * -12, 3 * 6 * -12, 0, 0]])

    # Check intercept building:
    cb_intercept = _ColumnBuilder([], {}, {})
    assert cb_intercept.column_names() == ["Intercept"]
    mat3 = np.empty((3, 1))
    cb_intercept.build({f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]}, mat3)
    assert np.allclose(mat3, 1)
def eval(self, data):
    """Evaluate this categorical factor on `data` and return its codes.

    The factor is evaluated, run through ``self._postprocessor`` if one is
    set, and checked to be a ``Categorical`` whose levels equal
    ``self._expected_levels``.

    :returns: ``result.int_array`` made 2-d by
      ``atleast_2d_column_default`` (pandas objects are preserved).
    :raises PatsyError: If the value is not a ``Categorical`` or its levels
      differ from the expected ones.
    """
    # returns either a 2d ndarray or a DataFrame
    value = self.factor.eval(self._state, data)
    postprocessor = self._postprocessor
    if postprocessor is not None:
        value = postprocessor.transform(value)

    if not isinstance(value, Categorical):
        # value.__class__.__name__ would read better, but it is not defined
        # for old-style classes.
        details = (self.factor.name(), value.__class__)
        msg = ("when evaluating categoric factor %r, I got a "
               "result that is not of type Categorical (but rather %s)"
               % details)
        raise PatsyError(msg, self.factor)

    if value.levels != self._expected_levels:
        details = (self.factor.name(), self._expected_levels, value.levels)
        msg = ("when evaluating categoric factor %r, I got Categorical "
               "data with unexpected levels (wanted %s, got %s)"
               % details)
        raise PatsyError(msg, self.factor)

    codes = value.int_array
    _max_allowed_dim(1, codes, self.factor)
    # Evaluators *always* hand back 2d arrays for consistency, even though
    # here there is only ever a single column.
    return atleast_2d_column_default(codes, preserve_pandas=True)
def check_stateful(cls, accepts_multicolumn, input, output, *args, **kwargs):
    """Check a stateful transform against many input shapes and chunkings.

    For each test case a fresh ``cls()`` is created, fed every chunk via
    ``memorize_chunk``, finished with ``memorize_finish``, and then asked
    to ``transform`` both the individual chunks and the re-assembled full
    input. Both results must be close to the expected output.

    :arg cls: Zero-argument callable producing the transform under test.
    :arg accepts_multicolumn: If true, two-column inputs are tested too.
    :arg input: Input values; converted with ``np.asarray``.
    :arg output: Expected output values; converted with ``np.asarray``.
    :arg args: Extra positional arguments passed to ``memorize_chunk`` and
      ``transform``.
    :arg kwargs: Extra keyword arguments passed to ``memorize_chunk`` and
      ``transform``.
    :raises AssertionError: If any check fails.
    """
    input = np.asarray(input)
    output = np.asarray(output)
    test_cases = [
        # List input, one chunk
        ([input], output),
        # Scalar input, many chunks
        (input, output),
        # List input, many chunks:
        ([[n] for n in input], output),
        # 0-d array input, many chunks:
        ([np.array(n) for n in input], output),
        # 1-d array input, one chunk:
        ([np.array(input)], output),
        # 1-d array input, many chunks:
        ([np.array([n]) for n in input], output),
        # 2-d but 1 column input, one chunk:
        ([np.array(input)[:, None]], atleast_2d_column_default(output)),
        # 2-d but 1 column input, many chunks:
        ([np.array([[n]]) for n in input], atleast_2d_column_default(output)),
    ]
    if accepts_multicolumn:
        # 2-d array input, one chunk:
        test_cases += [
            ([np.column_stack((input, input[::-1]))],
             np.column_stack((output, output[::-1]))),
            # 2-d array input, many chunks:
            ([np.array([[input[i], input[-i-1]]]) for i in range(len(input))],
             np.column_stack((output, output[::-1]))),
        ]
    from patsy.util import have_pandas
    if have_pandas:
        import pandas
        pandas_type = (pandas.Series, pandas.DataFrame)
        pandas_index = np.linspace(0, 1, num=len(input))
        # 1d and 2d here refer to the dimensionality of the input
        if output.ndim == 1:
            output_1d = pandas.Series(output, index=pandas_index)
        else:
            output_1d = pandas.DataFrame(output, index=pandas_index)
        test_cases += [
            # Series input, one chunk
            ([pandas.Series(input, index=pandas_index)], output_1d),
            # Series input, many chunks
            ([pandas.Series([x], index=[idx])
              for (x, idx) in zip(input, pandas_index)],
             output_1d),
        ]
        if accepts_multicolumn:
            input_2d_2col = np.column_stack((input, input[::-1]))
            output_2d_2col = np.column_stack((output, output[::-1]))
            output_2col_dataframe = pandas.DataFrame(output_2d_2col,
                                                     index=pandas_index)
            test_cases += [
                # DataFrame input, one chunk
                ([pandas.DataFrame(input_2d_2col, index=pandas_index)],
                 output_2col_dataframe),
                # DataFrame input, many chunks
                ([pandas.DataFrame([input_2d_2col[i, :]],
                                   index=[pandas_index[i]])
                  for i in range(len(input))],
                 output_2col_dataframe),
            ]
    for input_obj, output_obj in test_cases:
        print(input_obj)
        t = cls()
        for input_chunk in input_obj:
            t.memorize_chunk(input_chunk, *args, **kwargs)
        t.memorize_finish()
        # First pass: transform chunk by chunk and stitch the pieces
        # together.
        all_outputs = []
        for input_chunk in input_obj:
            output_chunk = t.transform(input_chunk, *args, **kwargs)
            if input.ndim == output.ndim:
                assert output_chunk.ndim == np.asarray(input_chunk).ndim
            all_outputs.append(output_chunk)
        # pandas_type only exists when have_pandas is true, hence the
        # short-circuiting `have_pandas and ...` guards below.
        if have_pandas and isinstance(all_outputs[0], pandas_type):
            all_output1 = pandas.concat(all_outputs)
            assert np.array_equal(all_output1.index, pandas_index)
        elif all_outputs[0].ndim == 0:
            all_output1 = np.array(all_outputs)
        elif all_outputs[0].ndim == 1:
            all_output1 = np.concatenate(all_outputs)
        else:
            # np.vstack replaces the deprecated alias np.row_stack.
            all_output1 = np.vstack(all_outputs)
        assert all_output1.shape[0] == len(input)
        assert np.allclose(all_output1, output_obj)
        # Second pass: re-assemble the full input and transform it in one
        # go.
        if np.asarray(input_obj[0]).ndim == 0:
            all_input = np.array(input_obj)
        elif have_pandas and isinstance(input_obj[0], pandas_type):
            # handles both Series and DataFrames
            all_input = pandas.concat(input_obj)
        elif np.asarray(input_obj[0]).ndim == 1:
            # Don't use vstack, because that would turn this into a 1xn
            # matrix:
            all_input = np.concatenate(input_obj)
        else:
            all_input = np.vstack(input_obj)
        all_output2 = t.transform(all_input, *args, **kwargs)
        if have_pandas and isinstance(input_obj[0], pandas_type):
            assert np.array_equal(all_output2.index, pandas_index)
        if input.ndim == output.ndim:
            assert all_output2.ndim == all_input.ndim
        assert np.allclose(all_output2, output_obj)
def test__subterm_column_names_iter_and__build_subterm():
    """Check subterm column naming and ``_build_subterm`` output.

    Covers a numeric:categorical:numeric subterm with one-column and
    two-column numeric factors, the error raised for a missing categorical
    value, and the intercept-only subterm.
    """
    import pytest
    from patsy.contrasts import ContrastMatrix
    from patsy.categorical import C

    as_column = atleast_2d_column_default
    f1, f2, f3 = [_MockFactor(label) for label in ("f1", "f2", "f3")]
    all_factors = [f1, f2, f3]
    contrast = ContrastMatrix(np.array([[0, 0.5], [3, 0]]), ["[c1]", "[c2]"])
    contrast_matrices = {f2: contrast}

    # --- single-column numeric factors -----------------------------------
    factor_infos1 = {}
    factor_infos1[f1] = FactorInfo(f1, "numerical", {},
                                   num_columns=1, categories=None)
    factor_infos1[f2] = FactorInfo(f2, "categorical", {},
                                   num_columns=None, categories=["a", "b"])
    factor_infos1[f3] = FactorInfo(f3, "numerical", {},
                                   num_columns=1, categories=None)
    subterm1 = SubtermInfo(all_factors, contrast_matrices, 2)
    names1 = list(_subterm_column_names_iter(factor_infos1, subterm1))
    assert names1 == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]

    mat = np.empty((3, 2))
    good_values = {f1: as_column([1, 2, 3]),
                   f2: np.asarray([0, 0, 1]),
                   f3: as_column([7.5, 2, -12])}
    _build_subterm(subterm1, factor_infos1, good_values, mat)
    expected1 = [[0, 0.5 * 1 * 7.5],
                 [0, 0.5 * 2 * 2],
                 [3 * 3 * -12, 0]]
    assert np.allclose(mat, expected1)

    # Check that missing categorical values blow up. The input is built
    # outside the `with` block so that only the build itself is expected to
    # raise.
    values_with_missing = {f1: as_column([1, 2, 3]),
                           f2: np.asarray([0, -1, 1]),
                           f3: as_column([7.5, 2, -12])}
    with pytest.raises(PatsyError):
        _build_subterm(subterm1, factor_infos1, values_with_missing, mat)

    # --- two-column f1 ---------------------------------------------------
    factor_infos2 = dict(factor_infos1)
    factor_infos2[f1] = FactorInfo(f1, "numerical", {},
                                   num_columns=2, categories=None)
    subterm2 = SubtermInfo(all_factors, contrast_matrices, 4)
    names2 = list(_subterm_column_names_iter(factor_infos2, subterm2))
    assert names2 == ["f1[0]:f2[c1]:f3",
                      "f1[1]:f2[c1]:f3",
                      "f1[0]:f2[c2]:f3",
                      "f1[1]:f2[c2]:f3"]

    mat2 = np.empty((3, 4))
    wide_values = {f1: as_column([[1, 2], [3, 4], [5, 6]]),
                   f2: np.asarray([0, 0, 1]),
                   f3: as_column([7.5, 2, -12])}
    _build_subterm(subterm2, factor_infos2, wide_values, mat2)
    expected2 = [[0, 0, 0.5 * 1 * 7.5, 0.5 * 2 * 7.5],
                 [0, 0, 0.5 * 3 * 2, 0.5 * 4 * 2],
                 [3 * 5 * -12, 3 * 6 * -12, 0, 0]]
    assert np.allclose(mat2, expected2)

    # --- intercept -------------------------------------------------------
    subterm_int = SubtermInfo([], {}, 1)
    intercept_names = list(_subterm_column_names_iter({}, subterm_int))
    assert intercept_names == ["Intercept"]

    mat3 = np.empty((3, 1))
    unused_values = {f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]}
    _build_subterm(subterm_int, {}, unused_values, mat3)
    assert np.allclose(mat3, 1)