Ejemplo n.º 1
0
    def _prepare_data_from_formula(
            formula: str, data: DataFrame,
            portfolios: DataFrame) -> Tuple[DataFrame, DataFrame, str]:
        """
        Build the factor and portfolio frames described by a formula.

        Parameters
        ----------
        formula : str
            Model formula. When ``portfolios`` is None the string is split
            on "~": the first piece describes the portfolios and the second
            piece describes the factors. Otherwise the whole string
            describes the factors.
        data : DataFrame
            Data handed to ``model_matrix`` to evaluate the formula.
        portfolios : DataFrame
            Portfolio data supplied by the caller, or None to construct it
            from ``formula``.

        Returns
        -------
        factors : DataFrame
            Frame built from the factor specification.
        portfolios : DataFrame
            The frame passed in, or the one built from the formula.
        formula : str
            The formula exactly as it was passed in.
        """
        # Keyword arguments common to every model_matrix call below.
        # NOTE: the calls are kept directly in this function's frame because
        # ``context=0`` is relative to the frame that calls model_matrix.
        shared = {
            "context": 0,  # TODO: self._eval_env,
            "na_action": NAAction("raise"),
        }

        if portfolios is not None:
            # Portfolios were supplied: the whole formula describes factors.
            factors = DataFrame(
                model_matrix(
                    formula + " + 0", data, ensure_full_rank=True, **shared
                )
            )
            return factors, portfolios, formula

        # No portfolios supplied: take them from the piece before "~" and
        # the factors from the piece after it.
        pieces = formula.split("~")
        portfolios = DataFrame(
            model_matrix(
                pieces[0].strip() + " + 0",
                data,
                ensure_full_rank=False,
                **shared,
            )
        )
        factors = DataFrame(
            model_matrix(
                pieces[1].strip() + " + 0",
                data,
                ensure_full_rank=False,
                **shared,
            )
        )
        return factors, portfolios, formula
Ejemplo n.º 2
0
    def test_model_matrix(self, data):
        """Check that formula functions are resolved from the calling scope.

        With the default ``context`` both the module-level ``global_test``
        and the function-local ``local_test`` are usable inside the formula;
        with ``context=None`` the same formula raises ``NameError``.
        """
        # NOTE: ``local_test`` has to remain a local of this function, and
        # ``model_matrix`` has to be called directly from this frame, because
        # the formula refers to the function by name.
        def local_test(value):
            return value**2

        formula = '0 + global_test(a) + local_test(b)'

        matrix = model_matrix(formula, data)
        assert list(matrix['global_test(a)']) == [1, 4, 9]
        assert list(matrix['local_test(b)']) == [16, 25, 36]

        # Without a context the function names cannot be looked up.
        with pytest.raises(NameError):
            model_matrix(formula, data, context=None)
Ejemplo n.º 3
0
    def _transform(self, var, by):
        """Split ``var`` using a design matrix built from the ``by`` columns.

        Parameters
        ----------
        var : variable object
            Variable to split. Anything that is not a ``SimpleVariable``
            triggers ``self._densify_variables()`` first.
        by : str or list
            Name(s) of the grouping variables. Each name is looked up in
            ``self._variables`` first and in ``var.index`` otherwise.

        Returns
        -------
        Whatever ``var.split`` returns for the generated design matrix.
        """
        if not isinstance(var, SimpleVariable):
            self._densify_variables()

        by_names = listify(by)
        known = self._variables

        # Collect one column per grouping name. A name may refer either to a
        # regular variable or to an entity stored in the variable's index, so
        # both places are checked.
        columns = []
        for name in by_names:
            if name in known:
                columns.append(known[name].values)
            else:
                columns.append(var.index[name].reset_index(drop=True))
        group_data = pd.concat(columns, axis=1, sort=True)
        group_data.columns = by_names

        # Use formulaic to create the splitting design matrix: every column
        # is cast to str and the names are joined into one ':' term.
        group_data = group_data.astype(str)
        dm = model_matrix('0+' + ':'.join(by_names), data=group_data)

        # Rename the output columns: ':' becomes '.', and the '[T.' prefix is
        # shortened to '[' because formulaic's output naming convention
        # differs from patsy's.
        dm.columns = [
            col.replace(':', '.').replace('[T.', '[') for col in dm.columns
        ]

        return var.split(dm)
Ejemplo n.º 4
0
 def instruments(self) -> OptionalDataFrame:
     """
     Instruments

     Evaluates the "instruments" formula component, prefixed with "0 +",
     against the stored data. Missing values raise instead of being
     dropped. The resulting frame is passed through ``_empty_check``
     before being returned.
     """
     design = model_matrix(
         "0 +" + self.components["instruments"],
         self._data,
         context=self._eval_env,
         ensure_full_rank=False,
         na_action=fNAAction("raise"),
     )
     frame = DataFrame(design)
     return self._empty_check(frame)
Ejemplo n.º 5
0
 def endog(self) -> OptionalDataFrame:
     """
     Endogenous variables

     Evaluates the "endog" formula component, prefixed with "0 +", against
     the stored data. Missing values raise instead of being dropped. The
     resulting frame is passed through ``_empty_check`` before being
     returned.
     """
     design = model_matrix(
         "0 +" + self.components["endog"],
         self._data,
         context=self._eval_env,
         ensure_full_rank=False,
         na_action=fNAAction("raise"),
     )
     frame = DataFrame(design)
     return self._empty_check(frame)
Ejemplo n.º 6
0
 def dependent(self) -> DataFrame:
     """
     Dependent variable

     Evaluates the "dependent" formula component, used exactly as stored
     (no prefix is added here), against the stored data. Missing values
     raise instead of being dropped.
     """
     design = model_matrix(
         self.components["dependent"],
         self._data,
         context=self._eval_env,
         ensure_full_rank=False,
         na_action=fNAAction("raise"),
     )
     return DataFrame(design)
Ejemplo n.º 7
0
    def from_df(cls, df, model, metadata=None, formula=None):
        """ Initialize a GLMMSpec instance from a predictor DataFrame and the
        "Model" section of a BIDS-StatsModels JSON spec.

        Parameters
        ----------
        df : DataFrame
            A pandas DataFrame containing predictor information (i.e., the
            fixed component of the design matrix).
        model : dict
            The "Model" section from a BIDS-StatsModel specification. It is
            passed through `convert_JSON` before any key is read.
        metadata: DataFrame
            Optional DataFrame containing additional columns that are not part
            of the design matrix but may have downstream informational use
            and/or contain variables needed to define random effects. Rows must
            map 1-to-1 with those in `df`.
        formula: str
            Optional Wilkinson (R-style) formula specifying the fixed (X) part
            of the design matrix. All variables referenced in the formula must
            be present as columns in `df`. Output names will follow the
            conventions specified in the `formulaic` documentation. Note that
            only the right-hand part of the formula should be passed (i.e.,
            pass "X1 * X2", not "y ~ X1 * X2"). If provided, will take
            precedence over any formula found in the `model`.

        Returns
        -------
        A GLMMSpec instance.
        """

        # Fixed terms
        model = convert_JSON(model)

        # An explicit formula argument wins over the one stored in the model.
        if not formula:
            formula = model.get('formula')
        if formula is not None:
            df = model_matrix(formula, df)

        kwargs = {'X': df}

        # Variance components
        vcs = model.get('variance_components', [])

        if vcs:

            # VCs can be defined by variables in either the fixed predictor
            # DF or the supplementary metadata DF, so concatenate them.
            frames = [df]
            if metadata is not None:
                frames.append(metadata)
            all_vars = pd.concat(frames, axis=1)

            Z_list = []
            for vc in vcs:
                # Levels can either be defined by the levels of a single
                # categorical ("LevelsFrom") or by a set of binary variables.
                if 'levels_from' in vc:
                    labels = all_vars[vc['levels_from']].values
                    Z_list.append(pd.get_dummies(labels).values)
                else:
                    Z_list.append(all_vars.loc[:, vc['levels']].values)

            Z = np.concatenate(Z_list, axis=1)

            # One row per column of Z and one column per variance component;
            # a 1 marks which component each Z column belongs to.
            groups = np.zeros((Z.shape[1], len(Z_list)))
            start = 0
            for col, block in enumerate(Z_list):
                stop = start + block.shape[1]
                groups[start:stop, col] = 1
                start = stop
            groups = pd.DataFrame(groups, columns=[vc['name'] for vc in vcs])

            kwargs['Z'] = Z
            kwargs['groups'] = groups

        # Error distribution settings, when the model provides them.
        error = model.get('error')
        if error:
            kwargs.update(family=error.get('family'), link=error.get('link'))

        return GLMMSpec(**kwargs)