def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to interpret ``formula_like`` and build design-matrix builders.

    The input is dispatched on its kind:

    * a single ``DesignInfo`` is returned as the second element of a
      2-tuple whose first element is the first result of
      ``design_matrix_builders`` run over one empty term list;
    * a 2-tuple of ``DesignInfo`` objects is returned unchanged;
    * an object exposing ``__patsy_get_model_desc__`` is asked for a
      ``ModelDesc`` (the hook is called with ``eval_env``);
    * a ``str`` is parsed with ``ModelDesc.from_formula``;
    * a ``ModelDesc`` has its left- and right-hand term lists passed to
      ``design_matrix_builders``.

    :arg formula_like: The object to interpret.
    :arg data_iter_maker: Forwarded unchanged to ``design_matrix_builders``.
    :arg eval_env: Passed to the ``__patsy_get_model_desc__`` hook and to
      ``design_matrix_builders``. Asserted to be an ``EvalEnvironment``
      on the ``ModelDesc`` path.
    :arg NA_action: Forwarded unchanged to ``design_matrix_builders``.
    :returns: The builders described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignInfo):
        empty_side = design_matrix_builders(
            [[]], data_iter_maker, eval_env, NA_action)[0]
        return (empty_side, formula_like)
    if (isinstance(formula_like, tuple)
            and len(formula_like) == 2
            and isinstance(formula_like[0], DesignInfo)
            and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the object that owns the hook so the error
        # message names it instead of the bad value the hook returned.
        source = formula_like
        formula_like = source.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (source,))
        # fallthrough
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker,
            eval_env,
            NA_action)
    else:
        return None
def _try_incr_builders(formula_like, data_iter_maker, eval_env):
    """Try to interpret ``formula_like`` and build design-matrix builders.

    The input is dispatched on its kind:

    * a single ``DesignMatrixBuilder`` is returned as the second element
      of a 2-tuple whose first element is the first result of
      ``design_matrix_builders`` run over one empty term list;
    * a 2-tuple of ``DesignMatrixBuilder`` objects is returned unchanged;
    * an object exposing ``__patsy_get_model_desc__`` is asked for a
      ``ModelDesc`` (the hook is called with ``eval_env``);
    * a string is parsed with ``ModelDesc.from_formula`` after
      ``eval_env`` has been passed through ``_get_env``;
    * a ``ModelDesc`` has its left- and right-hand term lists passed to
      ``design_matrix_builders``.

    :arg formula_like: The object to interpret.
    :arg data_iter_maker: Forwarded unchanged to ``design_matrix_builders``.
    :arg eval_env: Passed to the ``__patsy_get_model_desc__`` hook, and
      (after ``_get_env``) to ``ModelDesc.from_formula``.
    :returns: The builders described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignMatrixBuilder):
        empty_side = design_matrix_builders([[]], data_iter_maker)[0]
        return (empty_side, formula_like)
    if (isinstance(formula_like, tuple)
            and len(formula_like) == 2
            and isinstance(formula_like[0], DesignMatrixBuilder)
            and isinstance(formula_like[1], DesignMatrixBuilder)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the object that owns the hook so the error
        # message names it instead of the bad value the hook returned.
        source = formula_like
        formula_like = source.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (source,))
        # fallthrough
    if isinstance(formula_like, basestring):
        eval_env = _get_env(eval_env)
        formula_like = ModelDesc.from_formula(formula_like, eval_env)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker)
    else:
        return None
def eval_bar(evaluator, tree):
    """Evaluate a bar-operator AST node into a random-effects term.

    ``tree`` must have exactly two arguments: the expression on the left
    of the bar and the grouping factor on the right. Each side is wrapped
    in a one-sided ``~`` parse node and turned into a ``ModelDesc``; the
    ``INTERCEPT`` term is removed from the grouping side. The two
    descriptions are combined into a ``RandomEffectsTerm``, which is
    returned wrapped in an ``IntermediateExpr``.

    ``evaluator`` is accepted for the evaluator-callback signature but is
    not used here.
    """
    assert len(tree.args) == 2
    left_of_bar, right_of_bar = tree.args

    def _one_sided_desc(node):
        # Wrap ``node`` as the sole operand of a "~" so it parses as a
        # right-hand-side-only formula, keeping the node's own origin.
        wrapped = ParseNode("~", None, [node], node.origin)
        return ModelDesc.from_formula(wrapped)

    # Model description for the expression left of the bar.
    expr_md = _one_sided_desc(left_of_bar)

    # Model description for the grouping factor right of the bar, with
    # the intercept term removed.
    factor_md = _one_sided_desc(right_of_bar)
    factor_md.rhs_termlist.remove(INTERCEPT)

    # Combine both sides in a random effects term and hand it back as the
    # corresponding intermediate expression.
    random_effect = RandomEffectsTerm(expr=expr_md, factor=factor_md)
    return IntermediateExpr(False, None, False, [random_effect])
def parse_formula(form):
    """Split a formula into response, continuous, categorical and intercept.

    The formula is parsed with patsy's ``ModelDesc.from_formula``. Every
    term is converted with ``parse_term``, and right-hand-side terms are
    labelled with ``classify_term``.

    Returns a 4-tuple ``(y, x, fe, intercept)``:

    * ``y`` -- the first left-hand-side term after ``squeeze_term``;
    * ``x`` -- squeezed right-hand-side terms classified ``'continuous'``;
    * ``fe`` -- right-hand-side terms classified ``'categorical'``, passed
      through ``strip_cat`` and then ``squeeze_term``;
    * ``intercept`` -- ``True`` if any right-hand-side term is classified
      ``'intercept'``.
    """
    # use patsy for formula parse
    desc = ModelDesc.from_formula(form)

    # convert to string lists
    lhs_parsed = [parse_term(term) for term in desc.lhs_termlist]
    rhs_parsed = [parse_term(term) for term in desc.rhs_termlist]
    rhs_kinds = [classify_term(term) for term in rhs_parsed]
    labelled = list(zip(rhs_parsed, rhs_kinds))

    # separate into components
    y = squeeze_term(lhs_parsed[0])

    x = []
    for term, kind in labelled:
        if kind == 'continuous':
            x.append(squeeze_term(term))

    fe = []
    for term, kind in labelled:
        if kind == 'categorical':
            fe.append(squeeze_term(strip_cat(term)))

    intercept = any(kind == 'intercept' for kind in rhs_kinds)
    return y, x, fe, intercept
def get_matrices(data, formula, env=0):
    """Given the data and a formula, build Z and X matrices.

    The formula is evaluated with ``evaluate_formula`` and its
    right-hand-side terms are split into random-effects terms
    (``RandomEffectsTerm`` instances) and all remaining (fixed-effects)
    terms.

    For every random-effects term, design matrices are built for its
    ``expr`` and ``factor`` parts; their column counts are recorded and
    handed to ``buildzi`` / ``buildlambdati`` / ``buildlind``.

    :arg data: Data passed to patsy's ``dmatrix`` / ``dmatrices``.
    :arg formula: Formula passed to ``evaluate_formula``.
    :arg env: Evaluation environment forwarded to patsy as ``eval_env``.
      It is now used for the fixed-effects matrices as well as the
      random-effects ones, so both parts of the model resolve names in the
      same environment.
    :returns: ``(X, Z, Lambdat, y, theta0, thfun)`` where ``X`` and ``y``
      are ``np.asarray`` of the fixed-effects design and response
      matrices, ``Z`` is the transposed ``hstack`` of the per-term
      ``buildzi`` results, ``Lambdat`` is their ``block_diag`` in
      ``'csc'`` format, ``theta0`` is the concatenation of the per-term
      initial thetas, and ``thfun(theta)`` returns ``theta[Lind]``.
    """
    model_description = evaluate_formula(formula)

    fixef_terms, randef_terms = [], []
    for term in model_description.rhs_termlist:
        if isinstance(term, RandomEffectsTerm):
            randef_terms.append(term)
        else:
            fixef_terms.append(term)

    Zis = []
    Lambdatis = []
    thetais = []
    ps = []
    ls = []
    for ret in randef_terms:
        expr_matrix = dmatrix(ret.expr, data, env)
        factor_matrix = dmatrix(ret.factor, data, env)
        # Only the column counts of the two matrices are needed below.
        _, n_expr_cols = expr_matrix.shape
        _, n_factor_cols = factor_matrix.shape
        ps.append(n_expr_cols)
        ls.append(n_factor_cols)
        Zis.append(buildzi(expr_matrix, factor_matrix))
        Lambdati, thetai = buildlambdati(n_expr_cols, n_factor_cols)
        Lambdatis.append(Lambdati)
        thetais.append(thetai)

    Lind = buildlind(ps, ls)

    def thfun(theta):
        return theta[Lind]

    Z = hstack(Zis).T
    Lambdat = block_diag(Lambdatis, format='csc')

    # Forward ``env`` here too: previously the fixed-effects matrices
    # silently ignored it while the random-effects matrices honoured it.
    fixed_desc = ModelDesc(model_description.lhs_termlist, fixef_terms)
    y, X = dmatrices(fixed_desc, data, env)
    y = np.asarray(y)
    X = np.asarray(X)

    # initial value of theta
    theta0 = np.concatenate(thetais)
    return X, Z, Lambdat, y, theta0, thfun
def parse_formula(form):
    """Parse a formula with patsy and convert it into term objects.

    patsy is imported lazily. If the import fails, a hint is printed and
    ``None`` is returned (best-effort behaviour kept from the original).

    The right-hand-side terms are converted with ``parse_term`` and
    wrapped in ``Formula``. If the formula has a left-hand side, the first
    factor of its first term is converted with ``parse_factor``.

    :arg form: The formula passed to ``ModelDesc.from_formula``.
    :returns: ``(y_terms, x_terms)`` when a left-hand side is present,
      otherwise just ``x_terms``; ``None`` if patsy cannot be imported.
    """
    try:
        from patsy.desc import ModelDesc
    except ImportError:
        # Only a failed import means "patsy is not installed"; a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        print('Please install patsy for formula parsing')
        return None

    # use patsy for formula parse
    desc = ModelDesc.from_formula(form)
    lhs, rhs = desc.lhs_termlist, desc.rhs_termlist

    # convert to string lists
    x_terms = Formula(*[parse_term(t) for t in rhs])
    if len(lhs) > 0:
        y_terms = parse_factor(lhs[0].factors[0])
        return y_terms, x_terms
    return x_terms
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to interpret ``formula_like`` and build design-matrix builders.

    The input is dispatched on its kind:

    * a single ``DesignInfo`` is returned as the second element of a
      2-tuple whose first element is the first result of
      ``design_matrix_builders`` run over one empty term list;
    * a 2-tuple of ``DesignInfo`` objects is returned unchanged;
    * an object exposing ``__patsy_get_model_desc__`` is asked for a
      ``ModelDesc`` (the hook is called with ``eval_env``);
    * on Python 2, a ``unicode`` formula is encoded to ASCII first;
    * a ``str`` is parsed with ``ModelDesc.from_formula``;
    * a ``ModelDesc`` has its left- and right-hand term lists passed to
      ``design_matrix_builders``.

    :arg formula_like: The object to interpret.
    :arg data_iter_maker: Forwarded unchanged to ``design_matrix_builders``.
    :arg eval_env: Passed to the ``__patsy_get_model_desc__`` hook and to
      ``design_matrix_builders``. Asserted to be an ``EvalEnvironment``
      on the ``ModelDesc`` path.
    :arg NA_action: Forwarded unchanged to ``design_matrix_builders``.
    :returns: The builders described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``, or if a Python 2 ``unicode`` formula
      contains non-ASCII characters.
    """
    if isinstance(formula_like, DesignInfo):
        empty_side = design_matrix_builders(
            [[]], data_iter_maker, eval_env, NA_action)[0]
        return (empty_side, formula_like)
    if (isinstance(formula_like, tuple)
            and len(formula_like) == 2
            and isinstance(formula_like[0], DesignInfo)
            and isinstance(formula_like[1], DesignInfo)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the object that owns the hook so the error
        # message names it instead of the bad value the hook returned.
        source = formula_like
        formula_like = source.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (source,))
        # fallthrough
    if not six.PY3 and isinstance(formula_like, unicode):
        # Included for the convenience of people who are using py2 with
        # __future__.unicode_literals.
        try:
            formula_like = formula_like.encode("ascii")
        except UnicodeEncodeError:
            raise PatsyError(
                "On Python 2, formula strings must be either 'str' objects, "
                "or else 'unicode' objects containing only ascii "
                "characters. You passed a unicode string with non-ascii "
                "characters. I'm afraid you'll have to either switch to "
                "ascii-only, or else upgrade to Python 3.")
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker,
            eval_env,
            NA_action)
    else:
        return None
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to interpret ``formula_like`` and build design-matrix builders.

    Recognised inputs, checked in this order:

    1. A ``DesignInfo``: returns ``(builder, formula_like)`` where
       ``builder`` is the first result of ``design_matrix_builders`` run
       over a single empty term list.
    2. A 2-tuple of ``DesignInfo`` objects: returned unchanged.
    3. Anything with a ``__patsy_get_model_desc__`` attribute: the hook is
       called with ``eval_env`` and must return a ``ModelDesc``.
    4. On Python 2 only, a ``unicode`` formula: encoded to ASCII.
    5. A ``str``: parsed with ``ModelDesc.from_formula``.
    6. A ``ModelDesc``: its ``lhs_termlist`` and ``rhs_termlist`` are
       passed to ``design_matrix_builders``.

    :arg formula_like: The object to interpret.
    :arg data_iter_maker: Forwarded unchanged to ``design_matrix_builders``.
    :arg eval_env: Passed to the ``__patsy_get_model_desc__`` hook and to
      ``design_matrix_builders``. Asserted to be an ``EvalEnvironment``
      on the ``ModelDesc`` path.
    :arg NA_action: Forwarded unchanged to ``design_matrix_builders``.
    :returns: The builders described above, or ``None`` when
      ``formula_like`` matches none of the cases.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``, or if a Python 2 ``unicode`` formula
      contains non-ASCII characters.
    """
    if isinstance(formula_like, DesignInfo):
        return (
            design_matrix_builders(
                [[]], data_iter_maker, eval_env, NA_action)[0],
            formula_like,
        )
    is_design_info_pair = (
        isinstance(formula_like, tuple)
        and len(formula_like) == 2
        and isinstance(formula_like[0], DesignInfo)
        and isinstance(formula_like[1], DesignInfo)
    )
    if is_design_info_pair:
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Remember which object supplied the hook: the error below should
        # point at it, not at the unexpected value the hook returned.
        provider = formula_like
        formula_like = provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (provider, ))
        # fallthrough
    if not six.PY3 and isinstance(formula_like, unicode):
        # Included for the convenience of people who are using py2 with
        # __future__.unicode_literals.
        try:
            formula_like = formula_like.encode("ascii")
        except UnicodeEncodeError:
            raise PatsyError(
                "On Python 2, formula strings must be either 'str' objects, "
                "or else 'unicode' objects containing only ascii "
                "characters. You passed a unicode string with non-ascii "
                "characters. I'm afraid you'll have to either switch to "
                "ascii-only, or else upgrade to Python 3.")
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker, eval_env, NA_action)
    else:
        return None
def _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action):
    """Try to interpret ``formula_like`` and build design-matrix builders.

    The input is dispatched on its kind:

    * a single ``DesignMatrixBuilder`` is returned as the second element
      of a 2-tuple whose first element is the first result of
      ``design_matrix_builders`` run over one empty term list;
    * a 2-tuple of ``DesignMatrixBuilder`` objects is returned unchanged;
    * an object exposing ``__patsy_get_model_desc__`` is asked for a
      ``ModelDesc`` (the hook is called with ``eval_env``);
    * a string is parsed with ``ModelDesc.from_formula`` using
      ``eval_env``, which is asserted to be an ``EvalEnvironment``;
    * a ``ModelDesc`` has its left- and right-hand term lists passed to
      ``design_matrix_builders``.

    :arg formula_like: The object to interpret.
    :arg data_iter_maker: Forwarded unchanged to ``design_matrix_builders``.
    :arg eval_env: Passed to the ``__patsy_get_model_desc__`` hook and to
      ``ModelDesc.from_formula``.
    :arg NA_action: Forwarded unchanged to ``design_matrix_builders``.
    :returns: The builders described above, or ``None`` when
      ``formula_like`` is none of the recognised kinds.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a ``ModelDesc``.
    """
    if isinstance(formula_like, DesignMatrixBuilder):
        empty_side = design_matrix_builders(
            [[]], data_iter_maker, NA_action)[0]
        return (empty_side, formula_like)
    if (isinstance(formula_like, tuple)
            and len(formula_like) == 2
            and isinstance(formula_like[0], DesignMatrixBuilder)
            and isinstance(formula_like[1], DesignMatrixBuilder)):
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the object that owns the hook so the error
        # message names it instead of the bad value the hook returned.
        source = formula_like
        formula_like = source.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (source, ))
        # fallthrough
    if isinstance(formula_like, basestring):
        assert isinstance(eval_env, EvalEnvironment)
        formula_like = ModelDesc.from_formula(formula_like, eval_env)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        return design_matrix_builders(
            [formula_like.lhs_termlist, formula_like.rhs_termlist],
            data_iter_maker, NA_action)
    else:
        return None
def __patsy_get_model_desc__(self, data):
    """Return a fixed ``ModelDesc`` with ``Y`` on the left and ``X`` on the right.

    Each side holds a single term made of one ``LookupFactor``. The
    ``data`` argument is accepted but not used.
    """
    response_terms = [Term([LookupFactor("Y")])]
    predictor_terms = [Term([LookupFactor("X")])]
    return ModelDesc(response_terms, predictor_terms)
def test_formula_likes():
    """Exercise every kind of "formula-like" input via ``t``/``t_invalid``.

    Covers plain array-likes, pandas objects (when ``have_pandas``),
    foreign ``__patsy_get_model_desc__`` providers, string formulas,
    ``ModelDesc`` objects, pre-made builders, and the depth /
    ``EvalEnvironment`` argument.

    NOTE(review): ``t`` and ``t_invalid`` are defined outside this block.
    From the calls below, ``t`` appears to take
    ``(formula_like, data, depth_or_env, <bool flag>, rhs_values,
    rhs_column_names[, lhs_values, lhs_column_names])`` -- confirm against
    the helper before relying on this.

    The local name ``x_in_env`` and the nesting of the helper functions at
    the end are part of what is being tested; do not rename or inline them.
    """
    # Plain array-like, rhs only
    t([[1, 2, 3], [4, 5, 6]], {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    # A DesignMatrix carries its own column prefix ("foo" instead of "x")
    dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    t(dm, {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
    t((None, dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])

    # Plain array-likes, lhs and rhs
    t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    y_dm = DesignMatrix([1, 2], default_column_prefix="bar")
    t((y_dm, x_dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"],
      [[1], [2]], ["bar0"])
    # number of rows must match
    t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0)

    # tuples must have the right size
    t_invalid(([[1, 2, 3]], ), {}, 0)
    t_invalid(([[1, 2, 3]], [[1, 2, 3]], [[1, 2, 3]]), {}, 0)

    # plain Series and DataFrames
    if have_pandas:
        # Names are extracted
        t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0,
          False,
          [[1], [2], [3]], ["x"])
        t(pandas.Series([1, 2, 3], name="asdf"), {}, 0,
          False,
          [[1], [2], [3]], ["asdf"])
        t((pandas.DataFrame({"y": [4, 5, 6] }),
           pandas.DataFrame({"x": [1, 2, 3]})), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        t((pandas.Series([4, 5, 6], name="y"),
           pandas.Series([1, 2, 3], name="x")), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        # Or invented
        t((pandas.DataFrame([[4, 5, 6]]),
           pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])), {}, 0,
          False,
          [[1, 2, 3]], ["x7", "x8", "x9"],
          [[4, 5, 6]], ["y0", "y1", "y2"])
        t(pandas.Series([1, 2, 3]), {}, 0,
          False,
          [[1], [2], [3]], ["x0"])
        # indices must match
        t_invalid((pandas.DataFrame( [[1]], index=[1]),
                   pandas.DataFrame([[1]], index=[2])),
                  {}, 0)

    # Foreign ModelDesc factories
    class ForeignModelSource(object):
        # Well-behaved provider: always describes "Y ~ X" via LookupFactors.
        def __patsy_get_model_desc__(self, data):
            return ModelDesc([Term([LookupFactor("Y")])],
                             [Term([LookupFactor("X")])])
    foreign_model = ForeignModelSource()
    t(foreign_model,
      { "Y": [1, 2], "X": [[1, 2], [3, 4]] },
      0,
      True,
      [[1, 2], [3, 4]], ["X[0]", "X[1]"],
      [[1], [2]], ["Y"])

    class BadForeignModelSource(object):
        # Misbehaving provider: hands back its argument instead of a
        # ModelDesc, which must be rejected.
        def __patsy_get_model_desc__(self, data):
            return data
    t_invalid(BadForeignModelSource(), {}, 0)

    # string formulas
    t("y ~ x", { "y": [1, 2], "x": [3, 4] }, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"],
      [[1], [2]], ["y"])
    t("~ x", { "y": [1, 2], "x": [3, 4] }, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"])
    t("x + y", { "y": [1, 2], "x": [3, 4] }, 0,
      True,
      [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

    # ModelDesc
    # (no Term([]) on the rhs -> no "Intercept" column is expected)
    desc = ModelDesc([], [Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1.5], [2.5], [3.5]], ["x"])
    # (with Term([]) on the rhs -> an "Intercept" column is expected)
    desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"])
    desc = ModelDesc([Term([LookupFactor("y")])],
                     [Term([]), Term([LookupFactor("x")])])
    t(desc, { "x": [1.5, 2.5, 3.5], "y": [10, 20, 30] }, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"],
      [[10], [20], [30]], ["y"])

    # builders
    # builders[0]: empty termlist, builders[1]: x only,
    # builders[2]: intercept + x
    termlists = ( [],
                  [Term([LookupFactor("x")])],
                  [Term([]), Term([LookupFactor("x")])],
                  )
    builders = design_matrix_builders(termlists,
                                      lambda: iter([{ "x": [1, 2, 3] }]))
    # twople but with no LHS
    t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # single DesignMatrixBuilder
    t(builders[2], {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # twople with LHS
    t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"],
      [[10], [20], [30]], ["x"])

    # check depth arguments
    x_in_env = [1, 2, 3]
    t("~ x_in_env", {}, 0,
      True,
      [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    # Values supplied in the data dict win over the local variable.
    t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"])
    # Trying to pull x_in_env out of our *caller* shouldn't work.
    t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError))
    # But then again it should, if called from one down on the stack:
    def check_nested_call():
        # This local shadows the outer list; with depth 1 the expected
        # values below are the outer [1, 2, 3], not "asdf".
        x_in_env = "asdf"
        t("~ x_in_env", {}, 1,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call()
    # passing in an explicit EvalEnvironment also works:
    e = EvalEnvironment.capture(1)
    t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError))
    e = EvalEnvironment.capture(0)
    def check_nested_call_2():
        # ``e`` was captured in the enclosing test function, so the
        # expected values are again the outer [1, 2, 3], not "asdf".
        x_in_env = "asdf"
        t("~ x_in_env", {}, e,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call_2()
def subset(self, which_terms):
    """Return a new :class:`DesignInfo` restricted to some of our terms.

    The new object describes design matrices that contain only the
    columns belonging to the requested terms. For example, if
    ``design_info`` has terms ``x``, ``y``, and ``z``, then::

      design_info2 = design_info.subset(["x", "z"])

    gives a DesignInfo for matrices holding just the ``x`` and ``z``
    columns. Assuming each of ``x``, ``y``, ``z`` produces one column,
    these two expressions will in general return the same thing::

      build_design_matrix([design_info], data)[0][:, [0, 2]]
      build_design_matrix([design_info2], data)[0]

    The important difference is that in the second case ``data`` does not
    need to contain any values for ``y``. That is very useful when
    predicting from a subset of a model, a situation in which R usually
    forces you to supply dummy values for ``y``.

    When the terms are given as a formula, remember that -- as with any
    formula -- the intercept is included by default; write ``0`` or
    ``-1`` in the formula to leave it out.

    The method can also be used to reorder the terms of a design matrix.

    Note that the result is generally *not* the same as building a new
    model from scratch. Consider::

        design1 = dmatrix("1 + C(a)", data)
        design2 = design1.subset("0 + C(a)")
        design3 = dmatrix("0 + C(a)", data)

    Both ``design2`` and ``design3`` encode ``C(a)`` without an intercept
    term, but ``design3`` uses a full-rank encoding for ``C(a)`` while
    ``design2`` keeps the reduced-rank encoding of ``design1``.

    :arg which_terms: The terms to keep in the new :class:`DesignInfo`.
      If this is a string, it is parsed as a formula and the names of the
      resulting terms are the terms to keep. If it is a list, it can
      contain a mixture of term names (as strings) and :class:`Term`
      objects.
    :raises PatsyError: If a formula string has a left-hand side.
    :raises KeyError: If a requested term is unknown (the lookup error is
      deliberately left to propagate).

    .. versionadded:: 0.2.0
       New method on the class DesignMatrixBuilder.

    .. versionchanged:: 0.4.0
       Moved from DesignMatrixBuilder to DesignInfo, as part of the
       removal of DesignMatrixBuilder.
    """
    if isinstance(which_terms, str):
        parsed = ModelDesc.from_formula(which_terms)
        if parsed.lhs_termlist:
            raise PatsyError("right-hand-side-only formula required")
        which_terms = [rhs_term.name() for rhs_term in parsed.rhs_termlist]

    if self.term_codings is None:
        # Minimal DesignInfo: only column names and name -> slice mappings
        # are used. Unknown names surface as a KeyError from the lookup.
        kept_columns = []
        for term_name in which_terms:
            kept_columns.extend(
                self.column_names[self.term_name_slices[term_name]])
        return DesignInfo(kept_columns)

    # Full DesignInfo: carry over factor infos and term codings as well.
    by_name = {term.name(): term for term in self.term_codings}

    kept_columns = []
    kept_factor_infos = {}
    kept_term_codings = OrderedDict()
    for requested in which_terms:
        # Strings are resolved to Term objects; anything not found by
        # name is used as the term itself.
        term = by_name.get(requested, requested)
        # Unknown terms surface as a KeyError from this lookup.
        columns = self.term_slices[term]
        kept_columns.extend(self.column_names[columns])
        for factor in term.factors:
            kept_factor_infos[factor] = self.factor_infos[factor]
        kept_term_codings[term] = self.term_codings[term]
    return DesignInfo(kept_columns,
                      factor_infos=kept_factor_infos,
                      term_codings=kept_term_codings)
def subset(self, which_terms):
    """Return a new :class:`DesignMatrixBuilder` limited to some of our terms.

    For example, if `builder` has terms `x`, `y`, and `z`, then::

      builder2 = builder.subset(["x", "z"])

    gives a builder that produces design matrices holding only the
    columns for `x` and `z`. Assuming each of `x`, `y`, `z` produces one
    column, these two expressions will in general return the same thing::

      build_design_matrix([builder], data)[0][:, [0, 2]]
      build_design_matrix([builder2], data)[0]

    The important difference is that in the second case `data` does not
    need to contain any values for `y`. That is very useful when
    predicting from a subset of a model, a situation in which R usually
    forces you to supply dummy values for `y`.

    When the terms are given as a formula, remember that -- as with any
    formula -- the intercept is included by default; write `0` or `-1`
    in the formula to leave it out.

    :arg which_terms: The terms to keep in the new
      :class:`DesignMatrixBuilder`. If this is a string, it is parsed as
      a formula and the names of the resulting terms are the terms to
      keep. If it is a list, it can contain a mixture of term names (as
      strings) and :class:`Term` objects.
    :raises PatsyError: If a formula string has a left-hand side, or if a
      requested term is not part of this builder.

    .. versionadded: 0.2.0
    """
    factor_to_evaluators = {
        evaluator.factor: evaluator for evaluator in self._evaluators
    }
    info = self.design_info
    term_name_to_term = dict(zip(info.term_names, info.terms))

    if isinstance(which_terms, str):
        # The EvalEnvironment is only a placeholder so that the ModelDesc
        # can be created. Terms cannot be matched with == because that
        # compares factors, which in turn compares EvalEnvironments; so
        # only the term *names* are taken from the parsed formula, and
        # those do not depend on the environment.
        placeholder_env = EvalEnvironment({})
        parsed = ModelDesc.from_formula(which_terms, placeholder_env)
        if parsed.lhs_termlist:
            raise PatsyError("right-hand-side-only formula required")
        which_terms = [rhs_term.name() for rhs_term in parsed.rhs_termlist]

    kept_terms = []
    kept_evaluators = set()
    kept_column_builders = {}
    for requested in which_terms:
        term = requested
        if isinstance(requested, six.string_types):
            # Names are resolved through the design info's term table.
            if requested not in term_name_to_term:
                raise PatsyError(
                    "requested term %r not found in this DesignMatrixBuilder"
                    % (requested,))
            term = term_name_to_term[requested]
        if term not in self._termlist:
            raise PatsyError(
                "requested term '%s' not found in this DesignMatrixBuilder"
                % (term,))
        kept_evaluators.update(
            factor_to_evaluators[factor] for factor in term.factors)
        kept_terms.append(term)
        kept_column_builders[term] = self._term_to_column_builders[term]
    return DesignMatrixBuilder(kept_terms, kept_evaluators,
                               kept_column_builders)
def subset(self, which_terms):
    """Build a :class:`DesignMatrixBuilder` covering a subset of our terms.

    For example, if `builder` has terms `x`, `y`, and `z`, then::

      builder2 = builder.subset(["x", "z"])

    gives a builder that produces design matrices holding only the
    columns for `x` and `z`. Assuming each of `x`, `y`, `z` produces one
    column, these two expressions will in general return the same thing::

      build_design_matrix([builder], data)[0][:, [0, 2]]
      build_design_matrix([builder2], data)[0]

    The important difference is that in the second case `data` does not
    need to contain any values for `y`. That is very useful when
    predicting from a subset of a model, a situation in which R usually
    forces you to supply dummy values for `y`.

    When the terms are given as a formula, remember that -- as with any
    formula -- the intercept is included by default; write `0` or `-1`
    in the formula to leave it out.

    :arg which_terms: The terms to keep in the new
      :class:`DesignMatrixBuilder`. If this is a string, it is parsed as
      a formula and the names of the resulting terms are the terms to
      keep. If it is a list, it can contain a mixture of term names (as
      strings) and :class:`Term` objects.
    :raises PatsyError: If a formula string has a left-hand side, or if a
      requested term is not part of this builder.

    .. versionadded: 0.2.0
    """
    factor_to_evaluators = {
        evaluator.factor: evaluator for evaluator in self._evaluators
    }
    info = self.design_info
    term_name_to_term = dict(zip(info.term_names, info.terms))

    if isinstance(which_terms, basestring):
        # The EvalEnvironment is only a placeholder so that the ModelDesc
        # can be created. Terms cannot be matched with == because that
        # compares factors, which in turn compares EvalEnvironments; so
        # only the term *names* are taken from the parsed formula, and
        # those do not depend on the environment.
        placeholder_env = EvalEnvironment({})
        parsed = ModelDesc.from_formula(which_terms, placeholder_env)
        if parsed.lhs_termlist:
            raise PatsyError("right-hand-side-only formula required")
        which_terms = [rhs_term.name() for rhs_term in parsed.rhs_termlist]

    kept_terms = []
    kept_evaluators = set()
    kept_column_builders = {}
    for requested in which_terms:
        term = requested
        if isinstance(requested, basestring):
            # Names are resolved through the design info's term table.
            if requested not in term_name_to_term:
                raise PatsyError(
                    "requested term %r not found in this DesignMatrixBuilder"
                    % (requested, ))
            term = term_name_to_term[requested]
        if term not in self._termlist:
            raise PatsyError(
                "requested term '%s' not found in this DesignMatrixBuilder"
                % (term, ))
        kept_evaluators.update(
            factor_to_evaluators[factor] for factor in term.factors)
        kept_terms.append(term)
        kept_column_builders[term] = self._term_to_column_builders[term]
    return DesignMatrixBuilder(kept_terms, kept_evaluators,
                               kept_column_builders)
def group_design(
    spreadsheet: Path,
    contrasts: list[dict],
    variables: list[dict],
    subjects: list[str],
) -> tuple[dict[str, list[float]], list[tuple], list[str], list[str]]:
    """Build group-level regressors and contrasts from a spreadsheet.

    The data frame from ``prepare_data_frame`` is reduced to columns with
    more than one distinct value, a patsy design matrix is built from the
    terms returned by ``_generate_rhs``, and it is checked with
    ``_check_multicollinearity``.

    Two kinds of contrast matrices are collected:

    * one per design-matrix term, from a linear constraint setting all of
      that term's columns to zero;
    * one per contrast of type ``"t"``, formed as a weighted combination
      of per-level mean rows ("lsmeans") of a reference design matrix.

    If the design has at least as many columns as rows, a warning is
    logged and ``intercept_only_design(len(subjects))`` is returned
    instead.

    Returns ``(regressor_list, contrast_list, contrast_numbers,
    contrast_names)``: the design matrix as an ordered dict of column
    lists, plus the three values produced by ``_make_contrasts_list``.
    """
    dataframe = prepare_data_frame(spreadsheet, variables, subjects)

    # Drop zero-variance columns (the distinct-value count does not
    # include NA).
    has_variance = dataframe.apply(pd.Series.nunique) > 1
    assert isinstance(has_variance, pd.Series)
    dataframe = dataframe.loc[:, has_variance]

    # The model has no left-hand side; only right-hand-side terms are
    # generated.
    rhs = _generate_rhs(contrasts, has_variance)
    dmat = dmatrix(ModelDesc([], rhs), dataframe, return_type="dataframe")
    _check_multicollinearity(dmat)

    # Reference grid for the lsmeans: numeric columns are pinned at 0.0,
    # every other column contributes each of its unique values.
    level_choices = []
    for column_name in dataframe.columns:
        column = dataframe[column_name]
        if is_numeric_dtype(column):
            level_choices.append((0.0, ))
        else:
            level_choices.append(column.unique())
    grid = pd.DataFrame(list(product(*level_choices)),
                        columns=dataframe.columns)
    design_info = dmat.design_info
    reference_dmat = dmatrix(design_info, grid, return_type="dataframe")

    contrast_matrices: list[tuple[str, pd.DataFrame]] = []

    # One contrast matrix per term of the design matrix.
    for term_name, column_slice in design_info.term_name_slices.items():
        zeroed = dict.fromkeys(design_info.column_names[column_slice], 0)
        linear = design_info.linear_constraint(zeroed)
        assert np.all(linear.variable_names == dmat.columns)
        term_matrix = pd.DataFrame(linear.coefs, columns=dmat.columns)
        # The intercept is reported in lower case; other names are kept.
        label = term_name.lower() if term_name == "Intercept" else term_name
        contrast_matrices.append((label, term_matrix))

    # One contrast matrix per user-specified t contrast.
    for contrast in contrasts:
        if contrast["type"] != "t":
            continue

        (variable, ) = contrast["variable"]
        variable_levels: list[str] = list(dataframe[variable].unique())

        # lsmeans holds one row per factor level. Each row is a contrast
        # vector for the mean of the dependent variable at that level,
        # e.g. one row for patients and one for controls.
        lsmeans = pd.DataFrame(index=variable_levels, columns=dmat.columns)
        for level in variable_levels:
            at_level = grid[variable] == level
            lsmeans.loc[level] = reference_dmat.loc[at_level].mean()

        weights = contrast["values"]
        names = [name for name in weights if name in variable_levels]
        values = [weights[name] for name in names]

        # Testing each group mean against zero would use the lsmeans rows
        # directly. A linear hypothesis such as patient-control=0,
        # written as {"patient":1, "control":-1}, becomes a contrast
        # vector by taking that linear combination of the lsmeans rows.
        weighted_rows = lsmeans.loc[names].mul(values, axis=0)
        contrast_vector = weighted_rows.sum()
        contrast_matrices.append((
            f"{contrast['name']}",
            pd.DataFrame([contrast_vector], columns=dmat.columns),
        ))

    npts, nevs = dmat.shape
    if nevs >= npts:
        logger.warning("Reverting to simple intercept only design. \n"
                       f"nevs ({nevs}) >= npts ({npts})")
        return intercept_only_design(len(subjects))

    regressor_list = dmat.to_dict(orient="list", into=OrderedDict)
    contrast_list, contrast_numbers, contrast_names = _make_contrasts_list(
        contrast_matrices)
    return regressor_list, contrast_list, contrast_numbers, contrast_names
print(data[["Label", "f1", "f2", data.columns[-1]]].head()) ################################################### # Let's train a logistic regression. formula = "Label ~ {0}".format(" + ".join(data.columns[1:])) print(formula[:50] + " + ...") from microsoftml import rx_logistic_regression try: logregml = rx_logistic_regression(formula, data=data) except Exception as e: # The error is expected because patsy cannot handle # so many features. print(e) ######################################### # Let's skip patsy's parser to manually define the formula # with object `ModelDesc <http://patsy.readthedocs.io/en/latest/API-reference.html?highlight=lookupfactor#patsy.ModelDesc>`_. from patsy.desc import ModelDesc, Term from patsy.user_util import LookupFactor patsy_features = [Term([LookupFactor(n)]) for n in data.columns[1:]][:10] model_formula = ModelDesc([Term([LookupFactor("Label")])], [Term([])] + patsy_features) print(model_formula.describe() + " + ...") logregml = rx_logistic_regression(model_formula, data=data)