def check_result(expect_builders, lhs, rhs, data,
                 expected_rhs_values, expected_rhs_names,
                 expected_lhs_values, expected_lhs_names):  # pragma: no cover
    """Assert that a (lhs, rhs) pair of design matrices looks as expected.

    Checks the values and ``design_info.column_names`` of ``rhs`` (and of
    ``lhs`` when it is not None; otherwise both ``expected_lhs_*`` arguments
    must be None).

    When ``expect_builders`` is true, the builders attached to the matrices'
    ``design_info`` are fed back through ``build_design_matrices`` with
    ``data`` and the rebuilt matrices must match the originals.  When it is
    false, no builder may be attached to either matrix.
    """
    assert np.allclose(rhs, expected_rhs_values)
    assert rhs.design_info.column_names == expected_rhs_names

    has_lhs = lhs is not None
    if has_lhs:
        assert np.allclose(lhs, expected_lhs_values)
        assert lhs.design_info.column_names == expected_lhs_names
    else:
        assert expected_lhs_values is None
        assert expected_lhs_names is None

    if not expect_builders:
        # Matrices that did not come from a formula carry no builder.
        assert rhs.design_info.builder is None
        if has_lhs:
            assert lhs.design_info.builder is None
        return

    # Round-trip: rebuilding from the stored builders must reproduce the
    # matrices we were handed.
    if has_lhs:
        rebuilt_lhs, rebuilt_rhs = build_design_matrices(
            [lhs.design_info.builder, rhs.design_info.builder], data)
        assert np.allclose(rebuilt_lhs, lhs)
        assert rebuilt_lhs.design_info.column_names == expected_lhs_names
    else:
        (rebuilt_rhs,) = build_design_matrices(
            [rhs.design_info.builder], data)
    assert np.allclose(rebuilt_rhs, rhs)
    assert rebuilt_rhs.design_info.column_names == expected_rhs_names
def check_result(expect_builders, lhs, rhs, data,
                 expected_rhs_values, expected_rhs_names,
                 expected_lhs_values, expected_lhs_names):  # pragma: no cover
    """Validate design matrices against expected values and column names.

    ``rhs`` is always checked.  ``lhs`` is checked when present; when it is
    None the two ``expected_lhs_*`` arguments must be None as well.

    If ``expect_builders`` is true, the matrices are rebuilt from the
    builders stored on their ``design_info`` (using ``data``) and compared
    with the originals; if it is false, the builders must be None.
    """
    lhs_missing = lhs is None

    # Predictor side first, then outcome side.
    assert np.allclose(rhs, expected_rhs_values)
    assert rhs.design_info.column_names == expected_rhs_names
    if lhs_missing:
        assert expected_lhs_values is None
        assert expected_lhs_names is None
    else:
        assert np.allclose(lhs, expected_lhs_values)
        assert lhs.design_info.column_names == expected_lhs_names

    if expect_builders:
        rhs_builder = rhs.design_info.builder
        if lhs_missing:
            (fresh_rhs,) = build_design_matrices([rhs_builder], data)
        else:
            fresh_lhs, fresh_rhs = build_design_matrices(
                [lhs.design_info.builder, rhs_builder], data)
            assert np.allclose(fresh_lhs, lhs)
            assert fresh_lhs.design_info.column_names == expected_lhs_names
        # The rebuilt predictor matrix is checked in both cases.
        assert np.allclose(fresh_rhs, rhs)
        assert fresh_rhs.design_info.column_names == expected_rhs_names
    else:
        assert rhs.design_info.builder is None
        assert lhs_missing or lhs.design_info.builder is None
def test_incremental():
    """Check incr_dbuilder(s) with stateful transforms over chunked data."""
    # incr_dbuilder(s)
    # stateful transformations
    chunks = [
        {"a": ["a2", "a2", "a2"], "x": [1, 2, 3]},
        {"a": ["a2", "a2", "a1"], "x": [4, 5, 6]},
    ]

    def data_iter_maker():
        return iter(chunks)

    # Hand-computed center(np.sin(center(x))) over all six x values from
    # both chunks; only the rows of the second chunk are compared below.
    full_x = np.asarray([1, 2, 3, 4, 5, 6])
    sin_centered = np.sin(full_x - np.mean(full_x))
    x_col = sin_centered - np.mean(sin_centered)

    second_chunk = chunks[1]
    rhs_names = ["Intercept", "a[T.a2]", "center(np.sin(center(x)))"]
    rhs_values = np.column_stack(([1, 1, 1], [1, 1, 0], x_col[3:]))
    lhs_values = [[1], [1], [1]]

    # Two-sided formula.
    builders = incr_dbuilders("1 ~ a + center(np.sin(center(x)))",
                              data_iter_maker)
    lhs, rhs = build_design_matrices(builders, second_chunk)
    assert lhs.design_info.column_names == ["Intercept"]
    assert rhs.design_info.column_names == rhs_names
    assert np.allclose(lhs, lhs_values)
    assert np.allclose(rhs, rhs_values)

    # One-sided formula.
    builder = incr_dbuilder("~ a + center(np.sin(center(x)))",
                            data_iter_maker)
    (rhs,) = build_design_matrices([builder], second_chunk)
    assert rhs.design_info.column_names == rhs_names
    # lhs here is still the matrix from the two-sided build above.
    assert np.allclose(lhs, lhs_values)
    assert np.allclose(rhs, rhs_values)

    # Each entry point rejects a formula with the wrong number of sides.
    assert_raises(PatsyError, incr_dbuilder, "x ~ x", data_iter_maker)
    assert_raises(PatsyError, incr_dbuilders, "x", data_iter_maker)
def test_0d_data():
    """Design matrices can be built from scalar (0-d) data values."""
    # Use case from statsmodels/statsmodels#1881
    data_0d = {"x1": 1.1, "x2": 1.2, "a": "a1"}
    cases = [
        ("x1 + x2", [[1, 1.1, 1.2]]),
        ("C(a, levels=('a1', 'a2')) + x1", [[1, 0, 1.1]]),
    ]

    for formula, expected in cases:
        mat = dmatrix(formula, data_0d)
        assert np.allclose(mat, expected)

        # Rebuilding from the design_info must give the same single row.
        rebuilt = build_design_matrices([mat.design_info], data_0d)[0]
        assert np.allclose(rebuilt, expected)

        if not have_pandas:
            continue

        # Same checks with the scalars wrapped in a pandas.Series.
        data_series = pandas.Series(data_0d)
        assert np.allclose(dmatrix(formula, data_series), expected)
        rebuilt = build_design_matrices([mat.design_info], data_series)[0]
        assert np.allclose(rebuilt, expected)
def test_evalfactor_reraise():
    """A PatsyError raised inside a factor gets an origin pointing at it.

    The matrix is first built from ``data``; rebuilding it with ``new_data``
    (whose ``X`` contains the value 4, absent from ``data``) is expected to
    raise a PatsyError whose ``origin`` covers the ``C(X)`` term of the
    formula.
    """
    # From issue #11:
    data = {"X": [0, 1, 2, 3], "Y": [1, 2, 3, 4]}
    formula = "C(X) + Y"
    new_data = {"X": [0, 0, 1, 2, 3, 3, 4], "Y": [1, 2, 3, 4, 5, 6, 7]}
    info = dmatrix(formula, data)
    # This will produce a PatsyError, which is originally raised within the
    # call to C() (which has no way to know where it is being called
    # from). But EvalFactor should notice this, and add a useful origin:
    try:
        build_design_matrices([info.design_info.builder], new_data)
    except PatsyError as e:
        # Characters 0-4 of the formula are "C(X)".
        assert e.origin == Origin(formula, 0, 4)
    else:
        # Without this branch the test would pass vacuously if no error
        # were raised at all.
        raise AssertionError("expected PatsyError was not raised")
def test_0d_data():
    """Scalar data values yield one-row design matrices."""
    # Use case from statsmodels/statsmodels#1881
    data_0d = {"x1": 1.1, "x2": 1.2, "a": "a1"}

    numeric_case = ("x1 + x2", [[1, 1.1, 1.2]])
    categorical_case = ("C(a, levels=('a1', 'a2')) + x1", [[1, 0, 1.1]])

    for formula, expected in (numeric_case, categorical_case):
        # Plain dict input: direct build, then rebuild from design_info.
        mat = dmatrix(formula, data_0d)
        design_infos = [mat.design_info]
        assert np.allclose(mat, expected)
        assert np.allclose(
            build_design_matrices(design_infos, data_0d)[0], expected)

        if have_pandas:
            # pandas.Series input goes through the same two paths.
            data_series = pandas.Series(data_0d)
            from_series = dmatrix(formula, data_series)
            assert np.allclose(from_series, expected)
            assert np.allclose(
                build_design_matrices(design_infos, data_series)[0],
                expected)
def test_incremental():
    """Incremental builders handle nested stateful transforms across chunks."""
    # incr_dbuilder(s)
    # stateful transformations
    first_chunk = {"a": ["a2", "a2", "a2"], "x": [1, 2, 3]}
    last_chunk = {"a": ["a2", "a2", "a1"], "x": [4, 5, 6]}
    datas = [first_chunk, last_chunk]

    def data_iter_maker():
        return iter(datas)

    # Reference column: center(np.sin(center(x))) over the x values of both
    # chunks together.
    x_all = np.asarray([1, 2, 3, 4, 5, 6])
    sin_center_x = np.sin(x_all - np.mean(x_all))
    x_col = sin_center_x - np.mean(sin_center_x)

    transform_term = "center(np.sin(center(x)))"
    expected_names = ["Intercept", "a[T.a2]", transform_term]
    ones_column = [[1], [1], [1]]

    builders = incr_dbuilders("1 ~ a + center(np.sin(center(x)))",
                              data_iter_maker)
    lhs, rhs = build_design_matrices(builders, last_chunk)
    assert lhs.design_info.column_names == ["Intercept"]
    assert rhs.design_info.column_names == expected_names
    assert np.allclose(lhs, ones_column)
    # Rows 3: of the reference column belong to the last chunk.
    assert np.allclose(
        rhs, np.column_stack(([1, 1, 1], [1, 1, 0], x_col[3:])))

    builder = incr_dbuilder("~ a + center(np.sin(center(x)))",
                            data_iter_maker)
    (rhs,) = build_design_matrices([builder], last_chunk)
    assert rhs.design_info.column_names == expected_names
    # lhs was not rebuilt; this re-checks the matrix from the first build.
    assert np.allclose(lhs, ones_column)
    assert np.allclose(
        rhs, np.column_stack(([1, 1, 1], [1, 1, 0], x_col[3:])))

    # A two-sided formula is rejected by incr_dbuilder and a one-sided one
    # by incr_dbuilders.
    assert_raises(PatsyError, incr_dbuilder, "x ~ x", data_iter_maker)
    assert_raises(PatsyError, incr_dbuilders, "x", data_iter_maker)
def t(formula_like, data, depth,
      expect_builders,
      expected_rhs_values, expected_rhs_names,
      expected_lhs_values=None, expected_lhs_names=None):  # pragma: no cover
    """Run one formula-like input through every high-level entry point.

    The incremental builders are exercised when ``formula_like`` is a kind
    they accept (otherwise they must raise PatsyError), followed by
    ``dmatrix``/``dmatrices`` and, when pandas is available, their pandas
    variants.  If ``expected_lhs_values`` is None the one-matrix functions
    must succeed and the two-matrix functions must raise PatsyError; with an
    expected lhs the roles are swapped.  Successful results are verified
    with ``check_result``.
    """
    # NOTE(review): presumably compensates for this helper's own stack frame
    # when depth is a frame count -- confirm against the depth semantics of
    # the functions called below.
    if isinstance(depth, int):
        depth += 1

    def data_iter_maker():
        return iter([data])

    expected = (expected_rhs_values, expected_rhs_names,
                expected_lhs_values, expected_lhs_names)
    rhs_only = expected_lhs_values is None

    incremental_ok = (
        isinstance(formula_like, (basestring, ModelDesc, DesignMatrixBuilder))
        or (isinstance(formula_like, tuple)
            and isinstance(formula_like[0], DesignMatrixBuilder))
        or hasattr(formula_like, "__patsy_get_model_desc__"))
    if not incremental_ok:
        assert_raises(PatsyError, incr_dbuilders,
                      formula_like, data_iter_maker)
        assert_raises(PatsyError, incr_dbuilder,
                      formula_like, data_iter_maker)
    else:
        if rhs_only:
            lhs = None
            builder = incr_dbuilder(formula_like, data_iter_maker, depth)
            (rhs,) = build_design_matrices([builder], data)
        else:
            builders = incr_dbuilders(formula_like, data_iter_maker, depth)
            lhs, rhs = build_design_matrices(builders, data)
        check_result(expect_builders, lhs, rhs, data, *expected)

    if have_pandas:
        one_mat_fs = [dmatrix, dmatrix_pandas]
        two_mat_fs = [dmatrices, dmatrices_pandas]
    else:
        one_mat_fs = [dmatrix]
        two_mat_fs = [dmatrices]

    # We inline assert_raises in the loops below to avoid complications with
    # the depth argument: every f(...) call is made directly from this
    # function's frame.
    if rhs_only:
        for f in one_mat_fs:
            rhs = f(formula_like, data, depth)
            check_result(expect_builders, None, rhs, data, *expected)
        for f in two_mat_fs:
            try:
                f(formula_like, data, depth)
            except PatsyError:
                continue
            raise AssertionError
    else:
        for f in one_mat_fs:
            try:
                f(formula_like, data, depth)
            except PatsyError:
                continue
            raise AssertionError
        for f in two_mat_fs:
            lhs, rhs = f(formula_like, data, depth)
            check_result(expect_builders, lhs, rhs, data, *expected)
def _do_highlevel_design(formula_like, data, eval_env, return_type):
    """Produce design matrices from a formula-like object or raw matrices.

    ``return_type`` must be "matrix" or "dataframe" (the latter requires
    pandas); anything else raises PatsyError.

    If ``_try_incr_builders`` yields builders for ``formula_like``, the
    result of ``build_design_matrices`` is returned directly.  Otherwise
    ``formula_like`` is treated as explicit data: either a 2-tuple
    ``(lhs, rhs)`` or a single array-like used as ``rhs``, in which case
    ``lhs`` becomes a zero-column matrix with as many rows as ``rhs``.  Both
    sides are normalised to the requested output type and returned as
    ``(lhs, rhs)``.

    Raises PatsyError for a tuple whose length is not 2, for lhs/rhs row
    count mismatches, and for pandas inputs whose indexes differ.
    """
    want_dataframe = return_type == "dataframe"
    if want_dataframe and not have_pandas:
        raise PatsyError("pandas.DataFrame was requested, but pandas "
                         "is not installed")
    if return_type not in ("matrix", "dataframe"):
        raise PatsyError("unrecognized output type %r, should be "
                         "'matrix' or 'dataframe'" % (return_type,))

    def data_iter_maker():
        return iter([data])

    builders = _try_incr_builders(formula_like, data_iter_maker, eval_env)
    if builders is not None:
        return build_design_matrices(builders, data,
                                     return_type=return_type)

    # some sort of explicit matrix or matrices were given. Currently we
    # have them in one of these forms:
    #   -- an ndarray or subclass
    #   -- a DesignMatrix
    #   -- a pandas.Series
    #   -- a pandas.DataFrame
    # and we have to produce a standard output format.
    def _regularize_matrix(matrix, default_column_prefix):
        design_info = DesignInfo.from_array(matrix, default_column_prefix)
        # Remember the index of pandas inputs so the two sides can be
        # compared and aligned afterwards.
        orig_index = None
        if have_pandas and isinstance(matrix,
                                      (pandas.Series, pandas.DataFrame)):
            orig_index = matrix.index
        if not want_dataframe:
            return (DesignMatrix(matrix, design_info), orig_index)
        frame = pandas.DataFrame(
            atleast_2d_column_default(matrix, preserve_pandas=True))
        frame.columns = design_info.column_names
        frame.design_info = design_info
        return (frame, orig_index)

    # No builders, but maybe we can still get matrices
    if not isinstance(formula_like, tuple):
        # subok=True is necessary here to allow DesignMatrixes to pass
        # through
        lhs = None
        rhs = asarray_or_pandas(formula_like, subok=True)
    elif len(formula_like) == 2:
        lhs, rhs = formula_like
    else:
        raise PatsyError("don't know what to do with a length %s "
                         "matrices tuple" % (len(formula_like),))

    rhs, rhs_orig_index = _regularize_matrix(rhs, "x")
    if lhs is None:
        # No outcome given: use an empty (zero-column) placeholder.
        lhs = np.zeros((rhs.shape[0], 0), dtype=float)
    lhs, lhs_orig_index = _regularize_matrix(lhs, "y")

    assert isinstance(getattr(lhs, "design_info", None), DesignInfo)
    assert isinstance(getattr(rhs, "design_info", None), DesignInfo)

    if lhs.shape[0] != rhs.shape[0]:
        raise PatsyError("shape mismatch: outcome matrix has %s rows, "
                         "predictor matrix has %s rows"
                         % (lhs.shape[0], rhs.shape[0]))

    rhs_indexed = rhs_orig_index is not None
    lhs_indexed = lhs_orig_index is not None
    if (rhs_indexed and lhs_indexed
            and not rhs_orig_index.equals(lhs_orig_index)):
        raise PatsyError("index mismatch: outcome and "
                         "predictor have incompatible indexes")
    if want_dataframe:
        # When only one side came with an index, copy it to the other.
        if rhs_indexed and not lhs_indexed:
            lhs.index = rhs.index
        elif lhs_indexed and not rhs_indexed:
            rhs.index = lhs.index
    return (lhs, rhs)
def t(formula_like, data, depth,
      expect_builders,
      expected_rhs_values, expected_rhs_names,
      expected_lhs_values=None, expected_lhs_names=None):  # pragma: no cover
    """Check one formula-like input against all high-level entry points.

    First the incremental builders: if ``formula_like`` is a kind they
    accept, matrices are built and verified with ``check_result``; otherwise
    both builders must raise PatsyError.  Then ``dmatrix``/``dmatrices``
    (plus their pandas variants when pandas is available): with no expected
    lhs, the one-matrix functions must succeed and the two-matrix functions
    must raise PatsyError; with an expected lhs it is the other way round.
    """
    # NOTE(review): the increment looks like an adjustment for this
    # function's own frame -- confirm against how the callees use depth.
    if isinstance(depth, int):
        depth += 1

    def data_iter_maker():
        return iter([data])

    no_lhs_expected = expected_lhs_values is None

    if (isinstance(formula_like, (basestring, ModelDesc, DesignMatrixBuilder))
            or (isinstance(formula_like, tuple)
                and isinstance(formula_like[0], DesignMatrixBuilder))
            or hasattr(formula_like, "__patsy_get_model_desc__")):
        if no_lhs_expected:
            builder = incr_dbuilder(formula_like, data_iter_maker, depth)
            lhs = None
            (rhs,) = build_design_matrices([builder], data)
        else:
            builders = incr_dbuilders(formula_like, data_iter_maker, depth)
            lhs, rhs = build_design_matrices(builders, data)
        check_result(expect_builders, lhs, rhs, data,
                     expected_rhs_values, expected_rhs_names,
                     expected_lhs_values, expected_lhs_names)
    else:
        assert_raises(PatsyError, incr_dbuilders,
                      formula_like, data_iter_maker)
        assert_raises(PatsyError, incr_dbuilder,
                      formula_like, data_iter_maker)

    one_mat_fs = [dmatrix]
    two_mat_fs = [dmatrices]
    if have_pandas:
        one_mat_fs += [dmatrix_pandas]
        two_mat_fs += [dmatrices_pandas]

    # We inline assert_raises here to avoid complications with the
    # depth argument.
    #
    # One-matrix functions run first, then two-matrix functions; which group
    # must succeed depends only on whether an lhs is expected.
    for f in one_mat_fs:
        if no_lhs_expected:
            rhs = f(formula_like, data, depth)
            check_result(expect_builders, None, rhs, data,
                         expected_rhs_values, expected_rhs_names,
                         expected_lhs_values, expected_lhs_names)
        else:
            try:
                f(formula_like, data, depth)
            except PatsyError:
                pass
            else:
                raise AssertionError

    for f in two_mat_fs:
        if no_lhs_expected:
            try:
                f(formula_like, data, depth)
            except PatsyError:
                pass
            else:
                raise AssertionError
        else:
            (lhs, rhs) = f(formula_like, data, depth)
            check_result(expect_builders, lhs, rhs, data,
                         expected_rhs_values, expected_rhs_names,
                         expected_lhs_values, expected_lhs_names)
def _do_highlevel_design(formula_like, data, eval_env, NA_action,
                         return_type):
    """Turn a formula-like object or explicit matrices into design matrices.

    ``return_type`` selects the output format, "matrix" or "dataframe";
    PatsyError is raised for any other value, or for "dataframe" when pandas
    is unavailable.  ``NA_action`` is forwarded unchanged to
    ``_try_incr_builders`` and ``build_design_matrices``.

    When builders can be obtained for ``formula_like``, the output of
    ``build_design_matrices`` is returned.  Otherwise ``formula_like`` must
    be a 2-tuple ``(lhs, rhs)`` or a single array-like (used as ``rhs``,
    with a zero-column ``lhs`` of matching row count); both sides are
    converted to the requested format and returned as ``(lhs, rhs)``.

    Raises PatsyError if a tuple does not have exactly two items, if the two
    sides have different row counts, or if both sides carry pandas indexes
    that are not equal.
    """
    if return_type == "dataframe" and not have_pandas:
        raise PatsyError("pandas.DataFrame was requested, but pandas "
                         "is not installed")
    if return_type not in ("matrix", "dataframe"):
        raise PatsyError("unrecognized output type %r, should be "
                         "'matrix' or 'dataframe'" % (return_type,))

    def data_iter_maker():
        return iter([data])

    builders = _try_incr_builders(formula_like, data_iter_maker, eval_env,
                                  NA_action)
    if builders is not None:
        return build_design_matrices(builders, data,
                                     NA_action=NA_action,
                                     return_type=return_type)

    # No builders, but maybe we can still get matrices
    if isinstance(formula_like, tuple):
        n_items = len(formula_like)
        if n_items != 2:
            raise PatsyError("don't know what to do with a length %s "
                             "matrices tuple" % (n_items,))
        lhs, rhs = formula_like
    else:
        # subok=True is necessary here to allow DesignMatrixes to pass
        # through
        lhs, rhs = None, asarray_or_pandas(formula_like, subok=True)

    # some sort of explicit matrix or matrices were given. Currently we
    # have them in one of these forms:
    #   -- an ndarray or subclass
    #   -- a DesignMatrix
    #   -- a pandas.Series
    #   -- a pandas.DataFrame
    # and we have to produce a standard output format.
    def _regularize_matrix(m, default_column_prefix):
        info = DesignInfo.from_array(m, default_column_prefix)
        is_pandas = have_pandas and isinstance(
            m, (pandas.Series, pandas.DataFrame))
        # Only pandas inputs have an index worth preserving.
        orig_index = m.index if is_pandas else None
        if return_type == "dataframe":
            m = atleast_2d_column_default(m, preserve_pandas=True)
            m = pandas.DataFrame(m)
            m.columns = info.column_names
            m.design_info = info
        else:
            m = DesignMatrix(m, info)
        return (m, orig_index)

    rhs, rhs_orig_index = _regularize_matrix(rhs, "x")
    if lhs is None:
        # Stand-in outcome matrix: same number of rows, no columns.
        lhs = np.zeros((rhs.shape[0], 0), dtype=float)
    lhs, lhs_orig_index = _regularize_matrix(lhs, "y")

    assert isinstance(getattr(lhs, "design_info", None), DesignInfo)
    assert isinstance(getattr(rhs, "design_info", None), DesignInfo)

    n_lhs_rows, n_rhs_rows = lhs.shape[0], rhs.shape[0]
    if n_lhs_rows != n_rhs_rows:
        raise PatsyError("shape mismatch: outcome matrix has %s rows, "
                         "predictor matrix has %s rows"
                         % (n_lhs_rows, n_rhs_rows))

    if rhs_orig_index is not None and lhs_orig_index is not None:
        if not rhs_orig_index.equals(lhs_orig_index):
            raise PatsyError("index mismatch: outcome and "
                             "predictor have incompatible indexes")
    elif return_type == "dataframe":
        # At most one side had an index; give the other side the same one.
        if rhs_orig_index is not None:
            lhs.index = rhs.index
        elif lhs_orig_index is not None:
            rhs.index = lhs.index
    return (lhs, rhs)