def _prepare_data_from_formula(
    formula: str, data: DataFrame, portfolios: DataFrame
) -> Tuple[DataFrame, DataFrame, str]:
    """Materialize factor and portfolio matrices from a Wilkinson formula.

    Parameters
    ----------
    formula : str
        Either a factors-only formula (when ``portfolios`` is supplied) or a
        full ``"portfolios ~ factors"`` formula.
    data : DataFrame
        Data referenced by the formula terms.
    portfolios : DataFrame
        Pre-built portfolio returns, or ``None`` to extract them from the
        left-hand side of ``formula``.

    Returns
    -------
    tuple
        ``(factors, portfolios, orig_formula)`` where the first two are
        DataFrames and the last is the formula string as passed in.
    """
    orig_formula = formula
    na_action = NAAction("raise")
    if portfolios is not None:
        # Portfolios were supplied directly, so the whole formula names the
        # factors; "+ 0" suppresses the automatic intercept.
        mm = model_matrix(
            formula + " + 0",
            data,
            context=0,  # TODO: self._eval_env,
            ensure_full_rank=True,
            na_action=na_action,
        )
        factors = DataFrame(mm)
    else:
        # Split "portfolios ~ factors" and materialize each side separately.
        parts = formula.split("~")
        lhs_mm = model_matrix(
            parts[0].strip() + " + 0",
            data,
            context=0,  # TODO: self._eval_env,
            ensure_full_rank=False,
            na_action=na_action,
        )
        portfolios = DataFrame(lhs_mm)
        rhs_mm = model_matrix(
            parts[1].strip() + " + 0",
            data,
            context=0,  # TODO: self._eval_env,
            ensure_full_rank=False,
            na_action=na_action,
        )
        factors = DataFrame(rhs_mm)
    return factors, portfolios, orig_formula
def test_model_matrix(self, data):
    """Formula terms resolve callables from both the module and local scope;
    passing ``context=None`` removes the calling frame and must raise."""

    def local_test(x):
        # Name must stay `local_test`: the formula below resolves it from
        # this function's local scope.
        return x**2

    dm = model_matrix('0 + global_test(a) + local_test(b)', data)
    assert list(dm['global_test(a)']) == [1, 4, 9]
    assert list(dm['local_test(b)']) == [16, 25, 36]

    # Without an evaluation context neither callable can be found.
    with pytest.raises(NameError):
        model_matrix('0 + global_test(a) + local_test(b)', data, context=None)
def _transform(self, var, by):
    """Split ``var`` into pieces defined by the grouping variables in ``by``.

    Builds a one-column-per-group indicator matrix with formulaic and hands
    it to ``var.split``.
    """
    if not isinstance(var, SimpleVariable):
        self._densify_variables()

    # Splitting variables may live either among the regular variables or as
    # entities in the variable's index — check both places.
    by_names = listify(by)
    known = self._variables
    columns = []
    for name in by_names:
        if name in known:
            columns.append(known[name].values)
        else:
            columns.append(var.index[name].reset_index(drop=True))

    group_data = pd.concat(columns, axis=1, sort=True)
    group_data.columns = by_names

    # Build the splitting design matrix; string cast forces categorical
    # treatment of every grouping column.
    dm = model_matrix('0+' + ':'.join(by_names), data=group_data.astype(str))
    # formulaic output naming convention differs from patsy, so normalize.
    dm.columns = [c.replace(':', '.').replace('[T.', '[') for c in dm.columns]
    return var.split(dm)
def instruments(self) -> OptionalDataFrame:
    """Instruments"""
    # "+ 0" suppresses the intercept; missing values raise rather than drop.
    matrix = model_matrix(
        "0 +" + self.components["instruments"],
        self._data,
        context=self._eval_env,
        ensure_full_rank=False,
        na_action=fNAAction("raise"),
    )
    return self._empty_check(DataFrame(matrix))
def endog(self) -> OptionalDataFrame:
    """Endogenous variables"""
    # "+ 0" suppresses the intercept; missing values raise rather than drop.
    matrix = model_matrix(
        "0 +" + self.components["endog"],
        self._data,
        context=self._eval_env,
        ensure_full_rank=False,
        na_action=fNAAction("raise"),
    )
    return self._empty_check(DataFrame(matrix))
def dependent(self) -> DataFrame:
    """Dependent variable"""
    # NOTE(review): unlike `endog`/`instruments`, no "0 +" is prepended here,
    # so an intercept may appear if the component implies one — confirm this
    # asymmetry is intentional.
    matrix = model_matrix(
        self.components["dependent"],
        self._data,
        context=self._eval_env,
        ensure_full_rank=False,
        na_action=fNAAction("raise"),
    )
    return DataFrame(matrix)
def from_df(cls, df, model, metadata=None, formula=None):
    """
    Initialize a GLMMSpec instance from a BIDSVariableCollection and a
    BIDS-StatsModels JSON spec.

    Parameters
    ----------
    df : DataFrame
        A pandas DataFrame containing predictor information (i.e., the
        fixed component of the design matrix).
    model : dict
        The "Model" section from a BIDS-StatsModel specification.
    metadata: DataFrame
        Optional DataFrame containing additional columns that are not part
        of the design matrix but may have downstream informational use
        and/or contain variables needed to define random effects. Rows must
        map 1-to-1 with those in `df`.
    formula: str
        Optional Wilkinson (R-style) formula specifying the fixed (X) part
        of the design matrix. All variables referenced in the formula must
        be present as columns in `df`. Output names will follow the
        conventions specified in the `formulaic` documentation. Note that
        only the right-hand part of the formula should be passed (i.e.,
        pass "X1 * X2", not "y ~ X1 * X2"). If provided, will take
        precedence over any formula found in the `model`.

    Returns
    -------
    A GLMMSpec instance.
    """
    kwargs = {}

    # Fixed terms: an explicit `formula` argument wins over one in `model`.
    model = convert_JSON(model)
    formula = formula or model.get('formula')
    if formula is not None:
        df = model_matrix(formula, df)
    kwargs['X'] = df

    # Variance components
    vcs = model.get('variance_components', [])
    Z_list = []
    if vcs:
        # VCs can be defined by variables in either the fixed predictor
        # DF or the supplementary metadata DF, so concatenate them.
        all_vars = [df, metadata] if metadata is not None else [df]
        all_vars = pd.concat(all_vars, axis=1)

        for vc in vcs:
            # Levels can either be defined by the levels of a single
            # categorical ("LevelsFrom") or by a set of binary variables.
            if 'levels_from' in vc:
                data = all_vars[vc['levels_from']].values
                Z_list.append(pd.get_dummies(data).values)
            else:
                # Use a dedicated name; the original rebound the `df`
                # parameter here, shadowing the fixed-effects frame.
                levels_df = all_vars.loc[:, vc['levels']]
                Z_list.append(levels_df.values)

        Z = np.concatenate(Z_list, axis=1)

        # Block-diagonal indicator mapping each Z column to its component.
        groups = np.zeros((Z.shape[1], len(Z_list)))
        c = 0
        # Loop variable renamed from `vc` to avoid shadowing the spec dicts
        # iterated above and referenced again in the comprehension below.
        for i, z_block in enumerate(Z_list):
            n = z_block.shape[1]
            groups[c:(c + n), i] = 1
            c += n
        groups = pd.DataFrame(groups, columns=[vc['name'] for vc in vcs])

        kwargs['Z'] = Z
        kwargs['groups'] = groups

    # Error distribution / link function, if specified.
    error = model.get('error')
    if error:
        kwargs['family'] = error.get('family')
        kwargs['link'] = error.get('link')

    return GLMMSpec(**kwargs)