def test_drop_missing(data):
    """Dropping missing rows by hand must not change the estimates.

    Every 33rd observation of the portfolios is set to NaN. The model is
    fit once on the raw inputs and once on ``IVData`` containers from
    which the flagged rows were removed explicitly; both fits must produce
    the same parameters.
    """
    portfolios = data.portfolios
    # DataFrames need positional indexing; other inputs are sliced directly.
    target = portfolios.iloc if isinstance(portfolios, pd.DataFrame) else portfolios
    target[::33] = np.nan
    raw_fit = TradedFactorModel(portfolios, data.factors).fit()

    wrapped_portfolios = IVData(portfolios)
    wrapped_factors = IVData(data.factors)
    flagged = wrapped_portfolios.isnull | wrapped_factors.isnull
    for container in (wrapped_portfolios, wrapped_factors):
        container.drop(flagged)
    cleaned_fit = TradedFactorModel(wrapped_portfolios, wrapped_factors).fit()

    assert_equal(np.asarray(raw_fit.params), np.asarray(cleaned_fit.params))
class Interaction(object):
    """
    Class that simplifies specifying interactions

    Parameters
    ----------
    cat : {ndarray, Series, DataFrame, DataArray}, optional
        Variables to treat as categoricals. Best format is a Categorical
        Series or DataFrame containing Categorical Series. Other formats
        are converted to Categorical Series, column-by-column. cats has
        shape (nobs, ncat).
    cont : {ndarray, Series, DataFrame, DataArray}, optional
        Variables to treat as continuous, (nobs, ncont).
    nobs : int, optional
        Number of observations. Must be provided when both ``cat`` and
        ``cont`` are None; otherwise it is replaced by the number of rows
        found in ``cat`` and ``cont``.

    Notes
    -----
    For each variable in `cont`, computes the interaction of the variable
    and the cartesian product of the categories.

    Examples
    --------
    >>> import numpy as np
    >>> from linearmodels.iv.absorbing import Interaction
    >>> rs = np.random.RandomState(0)
    >>> n = 100000
    >>> cats = rs.randint(2, size=n)  # binary dummy
    >>> cont = rs.standard_normal((n, 3))
    >>> interact = Interaction(cats, cont)
    >>> interact.sparse.shape  # Get the shape of the dummy matrix
    (100000, 6)

    >>> rs = np.random.RandomState(0)
    >>> import pandas as pd
    >>> cats_df = pd.concat([pd.Series(pd.Categorical(rs.randint(5,size=n)))
    ...                      for _ in range(4)], axis=1)
    >>> cats_df.describe()
                 0       1       2       3
    count   100000  100000  100000  100000
    unique       5       5       5       5
    top          3       3       0       4
    freq     20251   20195   20331   20158

    >>> interact = Interaction(cats_df, cont)
    >>> interact.sparse.shape  # Cart product of all cats, 5**4, times ncont, 3
    (100000, 1875)
    """

    # Shared placeholder used until _check_data installs the real containers
    _iv_data = IVData(None, "none", 1)

    def __init__(
        self,
        cat: OptionalArrayLike = None,
        cont: OptionalArrayLike = None,
        nobs: Optional[int] = None,
    ) -> None:
        self._cat = cat
        self._cont = cont
        self._cat_data = self._iv_data
        self._cont_data = self._iv_data
        self._nobs = nobs
        self._check_data()

    @property
    def nobs(self) -> int:
        """Number of observations"""
        assert self._nobs is not None
        return self._nobs

    def _check_data(self) -> None:
        """
        Wrap cat and cont in IVData containers and validate them

        Raises
        ------
        ValueError
            If cat and cont are both None and nobs was not provided, or if
            both containers have zero columns.
        """
        cat, cont = self._cat, self._cont
        # Inputs without a shape attribute (e.g. None) contribute 0 rows
        cat_nobs = getattr(cat, "shape", (0,))[0]
        cont_nobs = getattr(cont, "shape", (0,))[0]
        nobs = max(cat_nobs, cont_nobs)
        if cat is None and cont is None:
            if self._nobs is None:
                raise ValueError(
                    "nobs must be provided when cat and cont are None")
            self._cont_data = self._cat_data = IVData(None, "none",
                                                      nobs=self._nobs)
            return
        self._nobs = nobs

        self._cat_data = IVData(cat, "cat", nobs=nobs, convert_dummies=False)
        self._cont_data = IVData(cont, "cont", nobs=nobs,
                                 convert_dummies=False)
        if self._cat_data.shape[1] == self._cont_data.shape[1] == 0:
            raise ValueError("Both cat and cont are empty arrays")
        cat_data = self._cat_data.pandas
        convert = [
            col for col in cat_data
            if not (is_categorical_dtype(cat_data[col]))
        ]
        if convert:
            # At least one column is not categorical: rebuild the frame with
            # every column cast to category and wrap it again
            cat_data = DataFrame(
                {col: cat_data[col].astype("category") for col in cat_data})
            self._cat_data = IVData(cat_data, "cat", convert_dummies=False)

    @property
    def cat(self) -> DataFrame:
        """Categorical Variables"""
        return self._cat_data.pandas

    @property
    def cont(self) -> DataFrame:
        """Continuous Variables"""
        return self._cont_data.pandas

    @property
    def isnull(self) -> Series:
        """Flag rows where any categorical or continuous value is null"""
        # axis is passed by keyword; recent pandas does not accept it
        # positionally
        return self.cat.isnull().any(axis=1) | self.cont.isnull().any(axis=1)

    def drop(self, locs: BoolArray) -> None:
        """
        Forward locs to the drop method of both data containers

        Parameters
        ----------
        locs : ndarray
            Boolean indicator passed to ``drop`` of the categorical and
            continuous containers.
        """
        self._cat_data.drop(locs)
        self._cont_data.drop(locs)

    @property
    def sparse(self) -> sp.csc_matrix:
        r"""
        Construct a sparse interaction matrix

        Returns
        -------
        csc_matrix
            Dummy interaction constructed from the cartesian product of
            the categories and each of the continuous variables.

        Notes
        -----
        The number of columns in `dummy_interact` is

        .. math::

            ncont \times \prod_{i=1}^{ncat} |c_i|

        where :math:`|c_i|` is the number distinct categories in column i.
        """
        cat, cont = self.cat, self.cont
        if cat.shape[1] and cont.shape[1]:
            out = [
                category_continuous_interaction(cat, cont[col],
                                                precondition=False)
                for col in cont
            ]
            return sp.hstack(out, format="csc")
        elif cat.shape[1]:
            return category_interaction(category_product(cat),
                                        precondition=False)
        elif cont.shape[1]:
            return sp.csc_matrix(self._cont_data.ndarray)
        else:  # empty interaction
            return sp.csc_matrix(empty((self._cat_data.shape[0], 0)))

    @property
    def hash(self) -> List[Tuple[str, ...]]:
        """
        Construct a hash that will be invariant for any permutation of
        inputs that produce the same fit when used as regressors
        """
        # Sorted hashes of any categoricals
        hasher = hash_func()
        cat_hashes = []
        cat = self.cat
        for col in cat:
            codes = cat[col].cat.codes.to_numpy()
            hasher.update(ascontiguousarray(codes).data)
            cat_hashes.append(hasher.hexdigest())
            hasher = _reset(hasher)
        sorted_hashes = tuple(sorted(cat_hashes))

        # One entry per continuous column: the sorted categorical hashes
        # followed by the hash of that column
        hashes = []
        cont = self.cont
        for col in cont:
            hasher.update(ascontiguousarray(cont[col].to_numpy()).data)
            hashes.append(sorted_hashes + (hasher.hexdigest(),))
            hasher = _reset(hasher)

        return sorted(hashes)

    @staticmethod
    def from_frame(frame: DataFrame) -> "Interaction":
        """
        Convenience function the simplifies using a DataFrame

        Parameters
        ----------
        frame : DataFrame
            Frame containing categorical and continuous variables. All
            categorical variables are passed to `cat` and all other
            variables are passed as `cont`.

        Returns
        -------
        Interaction
            Instance using the columns of frame

        Examples
        --------
        >>> import numpy as np
        >>> from linearmodels.iv.absorbing import Interaction
        >>> import pandas as pd
        >>> rs = np.random.RandomState(0)
        >>> n = 100000
        >>> cats = pd.concat([pd.Series(pd.Categorical(rs.randint(i+2,size=n)))
        ...                  for i in range(4)], axis=1)
        >>> cats.columns = ['cat{0}'.format(i) for i in range(4)]
        >>> columns = ['cont{0}'.format(i) for i in range(6)]
        >>> cont = pd.DataFrame(rs.standard_normal((n, 6)), columns=columns)
        >>> frame = pd.concat([cats, cont], axis=1)
        >>> interact = Interaction.from_frame(frame)
        >>> interact.sparse.shape  # Cart product of all cats, 5!, times ncont, 6
        (100000, 720)
        """
        cat_cols = [col for col in frame if is_categorical_dtype(frame[col])]
        cont_cols = [col for col in frame if col not in cat_cols]
        return Interaction(frame[cat_cols], frame[cont_cols],
                           nobs=frame.shape[0])
class IVLIML(object):
    r"""
    Limited information ML and k-class estimation of IV models

    Parameters
    ----------
    dependent : array_like
        Endogenous variables (nobs by 1)
    exog : array_like
        Exogenous regressors  (nobs by nexog)
    endog : array_like
        Endogenous regressors (nobs by nendog)
    instruments : array_like
        Instrumental variables (nobs by ninstr)
    weights : array_like, default None
        Observation weights used in estimation
    fuller : float, default 0
        Fuller's alpha to modify LIML estimator. Default returns unmodified
        LIML estimator.
    kappa : float, default None
        Parameter value for k-class estimation.  If None, computed to
        produce LIML parameter estimate.

    Notes
    -----
    ``kappa`` and ``fuller`` should not be used simultaneously since Fuller's
    alpha applies an adjustment to ``kappa``, and so the same result can be
    computed using only ``kappa``. Fuller's alpha is used to adjust the
    LIML estimate of :math:`\kappa`, which is computed whenever ``kappa``
    is not provided.

    The LIML estimator is defined as

    .. math::

      \hat{\beta}_{\kappa} & =(X(I-\kappa M_{z})X)^{-1}X(I-\kappa M_{z})Y\\
      M_{z} & =I-P_{z}\\
      P_{z} & =Z(Z'Z)^{-1}Z'

    where :math:`Z` contains both the exogenous regressors and the instruments.
    :math:`\kappa` is estimated as part of the LIML estimator.

    When using Fuller's :math:`\alpha`, the value used is modified to

    .. math::

      \kappa-\alpha/(n-n_{instr})

    .. todo::

      * VCV: bootstrap

    See Also
    --------
    IV2SLS, IVGMM, IVGMMCUE
    """

    def __init__(
        self,
        dependent: IVDataLike,
        exog: Optional[IVDataLike],
        endog: Optional[IVDataLike],
        instruments: Optional[IVDataLike],
        *,
        weights: Optional[IVDataLike] = None,
        fuller: Numeric = 0,
        kappa: OptionalNumeric = None,
    ):
        self.dependent = IVData(dependent, var_name="dependent")
        nobs: int = self.dependent.shape[0]
        self.exog = IVData(exog, var_name="exog", nobs=nobs)
        self.endog = IVData(endog, var_name="endog", nobs=nobs)
        self.instruments = IVData(instruments, var_name="instruments",
                                  nobs=nobs)
        self._original_index = self.dependent.pandas.index
        if weights is None:
            weights = ones(self.dependent.shape)
        weights = IVData(weights).ndarray
        if any(weights <= 0):
            raise ValueError("weights must be strictly positive.")
        # Rescale so that the weights average to 1 (NaNs are ignored here
        # and handled by _drop_missing)
        weights = weights / nanmean(weights)
        self.weights = IVData(weights, var_name="weights", nobs=nobs)

        self._drop_locs = self._drop_missing()
        # dependent variable
        w = sqrt(self.weights.ndarray)
        self._y = self.dependent.ndarray
        self._wy = self._y * w
        # model regressors
        self._x = c_[self.exog.ndarray, self.endog.ndarray]
        self._wx = self._x * w
        # first-stage regressors
        self._z = c_[self.exog.ndarray, self.instruments.ndarray]
        self._wz = self._z * w

        self._has_constant = False
        self._regressor_is_exog = array([True] * self.exog.shape[1] +
                                        [False] * self.endog.shape[1])
        self._columns = self.exog.cols + self.endog.cols
        self._instr_columns = self.exog.cols + self.instruments.cols
        self._index = self.dependent.rows

        self._validate_inputs()

        # Validate kappa and fuller before they are compared or formatted
        # below so that non-scalar inputs produce these messages.
        if kappa is not None and not isscalar(kappa):
            raise ValueError("kappa must be None or a scalar")
        if not isscalar(fuller):
            raise ValueError("fuller must be None or a scalar")

        # Only set the method label when one has not already been set on
        # the instance
        if not hasattr(self, "_method"):
            self._method = "IV-LIML"
            additional = []
            if fuller != 0:
                additional.append("fuller(alpha={0})".format(fuller))
            if kappa is not None:
                additional.append("kappa={0}".format(kappa))
            if additional:
                self._method += "(" + ", ".join(additional) + ")"

        self._result_container: IVResultType = IVResults

        self._kappa = kappa
        self._fuller = fuller
        if kappa is not None and fuller != 0:
            import warnings

            warnings.warn(
                "kappa and fuller should not normally be used "
                "simultaneously.  Identical results can be computed "
                "using kappa only",
                UserWarning,
            )
        if endog is None and instruments is None:
            # No endogenous regressors or instruments: report as OLS
            self._result_container = OLSResults
            self._method = "OLS"
        self._formula = ""

    @staticmethod
    def from_formula(
        formula: str,
        data: DataFrame,
        *,
        weights: Optional[IVDataLike] = None,
        fuller: float = 0,
        kappa: OptionalNumeric = None,
    ) -> "IVLIML":
        """
        Parameters
        ----------
        formula : str
            Patsy formula modified for the IV syntax described in the notes
            section
        data : DataFrame
            DataFrame containing the variables used in the formula
        weights : array_like, default None
            Observation weights used in estimation
        fuller : float, default 0
            Fuller's alpha to modify LIML estimator. Default returns unmodified
            LIML estimator.
        kappa : float, default None
            Parameter value for k-class estimation.  If not provided, computed to
            produce LIML parameter estimate.

        Returns
        -------
        IVLIML
            Model instance

        Notes
        -----
        The IV formula modifies the standard Patsy formula to include a
        block of the form [endog ~ instruments] which is used to indicate
        the list of endogenous variables and instruments.  The general
        structure is `dependent ~ exog [endog ~ instruments]` and it must
        be the case that the formula expressions constructed from blocks
        `dependent ~ exog endog` and `dependent ~ exog instruments` are both
        valid Patsy formulas.

        A constant must be explicitly included using '1 +' if required.

        Examples
        --------
        >>> import numpy as np
        >>> from linearmodels.datasets import wage
        >>> from linearmodels.iv import IVLIML
        >>> data = wage.load()
        >>> formula = 'np.log(wage) ~ 1 + exper + exper ** 2 + brthord + [educ ~ sibs]'
        >>> mod = IVLIML.from_formula(formula, data)
        """
        parser = IVFormulaParser(formula, data)
        dep, exog, endog, instr = parser.data
        mod: "IVLIML" = IVLIML(dep, exog, endog, instr, weights=weights,
                               fuller=fuller, kappa=kappa)
        mod.formula = formula
        return mod

    def predict(
        self,
        params: ArrayLike,
        *,
        exog: Optional[IVDataLike] = None,
        endog: Optional[IVDataLike] = None,
        data: Optional[DataFrame] = None,
        eval_env: int = 4,
    ) -> DataFrame:
        """
        Predict values for additional data

        Parameters
        ----------
        params : array_like
            Model parameters (nvar by 1)
        exog : array_like
            Exogenous regressors (nobs by nexog)
        endog : array_like
            Endogenous regressors (nobs by nendog)
        data : DataFrame
            Values to use when making predictions from a model constructed
            from a formula
        eval_env : int
            Depth of use when evaluating formulas using Patsy.

        Returns
        -------
        DataFrame
            Fitted values from supplied data and parameters

        Raises
        ------
        ValueError
            If ``data`` is used with a model that has no formula, if
            ``data`` is combined with ``exog`` or ``endog``, or if none of
            ``exog``, ``endog`` and ``data`` is provided.

        Notes
        -----
        The number of parameters must satisfy nvar = nexog + nendog.

        When using `exog` and `endog`, regressor matrix is constructed as
        `[exog, endog]` and so parameters must be aligned to this structure.
        This is the same structure used in model estimation.

        If `data` is not none, then `exog` and `endog` must be none.
        Predictions from models constructed using formulas can
        be computed using either `exog` and `endog`, which will treat these
        as arrays of values corresponding to the formula-processed data, or
        using `data` which will be processed using the formula used to
        construct the values corresponding to the original model
        specification.
        """
        # The formula defaults to an empty string, so test truthiness rather
        # than identity with None
        if data is not None and not self.formula:
            raise ValueError("Unable to use data when the model was not "
                             "created using a formula.")
        if data is not None and (exog is not None or endog is not None):
            raise ValueError("Predictions can only be constructed using one "
                             "of exog/endog or data, but not both.")
        if exog is not None or endog is not None:
            exog = IVData(exog).pandas
            endog = IVData(endog).pandas
        elif data is not None:
            parser = IVFormulaParser(self.formula, data, eval_env=eval_env)
            exog = parser.exog
            endog = parser.endog
        else:
            raise ValueError("One of exog/endog or data must be provided.")
        # axis is passed by keyword; recent pandas does not accept it
        # positionally
        exog_endog = concat([exog, endog], axis=1)
        x = asarray(exog_endog)
        params = atleast_2d(asarray(params))
        if params.shape[0] == 1:
            # Row vector input: transpose to a column
            params = params.T
        pred = DataFrame(x @ params, index=exog_endog.index,
                         columns=["predictions"])

        return pred

    @property
    def formula(self) -> str:
        """Formula used to create the model"""
        return self._formula

    @formula.setter
    def formula(self, value: str) -> None:
        """Formula used to create the model"""
        self._formula = value

    def _validate_inputs(self) -> None:
        """
        Check regressor and instrument dimensions and rank

        Raises
        ------
        ValueError
            If there are no regressors, fewer instruments than endogenous
            regressors, or if [exog endog] or [exog instruments] does not
            have full column rank.
        """
        x, z = self._x, self._z
        if x.shape[1] == 0:
            raise ValueError("Model must contain at least one regressor.")
        if self.instruments.shape[1] < self.endog.shape[1]:
            raise ValueError(
                "The number of instruments ({0}) must be at least "
                "as large as the number of endogenous regressors"
                " ({1}).".format(self.instruments.shape[1],
                                 self.endog.shape[1]))
        if matrix_rank(x) < x.shape[1]:
            raise ValueError("regressors [exog endog] do not have full "
                             "column rank")
        if matrix_rank(z) < z.shape[1]:
            raise ValueError("instruments [exog instruments]  do not have "
                             "full column rank")
        self._has_constant, self._const_loc = has_constant(x)

    def _drop_missing(self) -> NDArray:
        """
        Drop observations flagged as null in any of the model inputs

        Returns
        -------
        ndarray
            Indicator of the observations flagged as missing

        Raises
        ------
        ValueError
            If every observation is flagged as missing.
        """
        data = (self.dependent, self.exog, self.endog, self.instruments,
                self.weights)
        missing: NDArray = any(c_[[dh.isnull for dh in data]], 0)
        if any(missing):
            if npall(missing):
                raise ValueError("All observations contain missing data. "
                                 "Model cannot be estimated.")
            self.dependent.drop(missing)
            self.exog.drop(missing)
            self.endog.drop(missing)
            self.instruments.drop(missing)
            self.weights.drop(missing)
            missing_warning(missing)
        return missing

    @staticmethod
    def estimate_parameters(x: NDArray, y: NDArray, z: NDArray,
                            kappa: Numeric) -> NDArray:
        """
        Parameter estimation without error checking

        Parameters
        ----------
        x : ndarray
            Regressor matrix (nobs by nvar)
        y : ndarray
            Regressand matrix (nobs by 1)
        z : ndarray
            Instrument matrix (nobs by ninstr)
        kappa : scalar
            Parameter value for k-class estimator

        Returns
        -------
        ndarray
            Estimated parameters (nvar by 1)

        Notes
        -----
        Exposed as a static method to facilitate estimation with other data,
        e.g., bootstrapped samples.  Performs no error checking.
        """
        pinvz = pinv(z)
        p1 = (x.T @ x) * (1 - kappa) + kappa * ((x.T @ z) @ (pinvz @ x))
        p2 = (x.T @ y) * (1 - kappa) + kappa * ((x.T @ z) @ (pinvz @ y))
        return inv(p1) @ p2

    def _estimate_kappa(self) -> float:
        """Compute kappa as the smallest eigenvalue of the LIML problem"""
        y, x, z = self._wy, self._wx, self._wz
        is_exog = self._regressor_is_exog
        e = c_[y, x[:, ~is_exog]]
        x1 = x[:, is_exog]

        # Residuals of [y, endog] after projecting on z
        ez = e - z @ (pinv(z) @ e)
        if x1.shape[1] == 0:  # No exogenous regressors
            ex1 = e
        else:
            # Residuals of [y, endog] after projecting on exog only
            ex1 = e - x1 @ (pinv(x1) @ e)

        vpmzv_sqinv = inv_sqrth(ez.T @ ez)
        q = vpmzv_sqinv @ (ex1.T @ ex1) @ vpmzv_sqinv
        return min(eigvalsh(q))

    def fit(self, *, cov_type: str = "robust", debiased: bool = False,
            **cov_config: Any) -> Union[OLSResults, IVResults]:
        """
        Estimate model parameters

        Parameters
        ----------
        cov_type : str, default "robust"
            Name of covariance estimator to use. Supported covariance
            estimators are:

            * 'unadjusted', 'homoskedastic' - Classic homoskedastic inference
            * 'robust', 'heteroskedastic' - Heteroskedasticity robust inference
            * 'kernel' - Heteroskedasticity and autocorrelation robust
              inference
            * 'cluster' - One-way cluster dependent inference.
              Heteroskedasticity robust

        debiased : bool, default False
            Flag indicating whether to debias the covariance estimator using
            a degree of freedom adjustment.
        **cov_config
            Additional parameters to pass to covariance estimator. The list
            of optional parameters differ according to ``cov_type``. See
            the documentation of the alternative covariance estimators for
            the complete list of available commands.

        Returns
        -------
        IVResults
            Results container

        Notes
        -----
        Additional covariance parameters depend on specific covariance used.
        See the docstring of specific covariance estimator for a list of
        supported options. Defaults are used if no covariance configuration
        is provided.

        See also
        --------
        linearmodels.iv.covariance.HomoskedasticCovariance
        linearmodels.iv.covariance.HeteroskedasticCovariance
        linearmodels.iv.covariance.KernelCovariance
        linearmodels.iv.covariance.ClusteredCovariance
        """
        wy, wx, wz = self._wy, self._wx, self._wz
        # The LIML kappa is always computed since it is reported in results
        liml_kappa = self._estimate_kappa()
        kappa = self._kappa
        if kappa is not None:
            est_kappa = kappa
        else:
            est_kappa = liml_kappa

        if self._fuller != 0:
            nobs, ninstr = wz.shape
            est_kappa -= self._fuller / (nobs - ninstr)

        params = self.estimate_parameters(wx, wy, wz, est_kappa)

        cov_estimator = COVARIANCE_ESTIMATORS[cov_type]
        cov_config["debiased"] = debiased
        cov_config["kappa"] = est_kappa
        # "center" is removed before the options reach the estimator
        cov_config_copy = dict(cov_config)
        cov_config_copy.pop("center", None)
        cov_estimator_inst = cov_estimator(wx, wy, wz, params,
                                           **cov_config_copy)

        results = {"kappa": est_kappa, "liml_kappa": liml_kappa}
        pe = self._post_estimation(params, cov_estimator_inst, cov_type)
        results.update(pe)

        return self._result_container(results, self)

    def wresids(self, params: NDArray) -> NDArray:
        """
        Compute weighted model residuals

        Parameters
        ----------
        params : ndarray
            Model parameters (nvar by 1)

        Returns
        -------
        ndarray
            Weighted model residuals

        Notes
        -----
        Uses weighted versions of data instead of raw data.  Identical to
        resids if all weights are unity.
        """
        return self._wy - self._wx @ params

    def resids(self, params: NDArray) -> NDArray:
        """
        Compute model residuals

        Parameters
        ----------
        params : ndarray
            Model parameters (nvar by 1)

        Returns
        -------
        ndarray
            Model residuals
        """
        return self._y - self._x @ params

    @property
    def has_constant(self) -> bool:
        """Flag indicating the model includes a constant or equivalent"""
        return self._has_constant

    @property
    def isnull(self) -> NDArray:
        """Locations of observations with missing values"""
        return self._drop_locs

    @property
    def notnull(self) -> NDArray:
        """Locations of observations included in estimation"""
        return logical_not(self._drop_locs)

    def _f_statistic(
            self, params: NDArray, cov: NDArray,
            debiased: bool) -> Union[WaldTestStatistic, InvalidTestStatistic]:
        """Compute the model F-statistic from params and their covariance"""
        const_loc = find_constant(self._x)
        nobs, nvar = self._x.shape
        return f_statistic(params, cov, debiased, nobs - nvar, const_loc)

    def _post_estimation(self, params: NDArray,
                         cov_estimator: CovarianceEstimator,
                         cov_type: str) -> Dict[str, Any]:
        """
        Collect the values stored in the results container

        Parameters
        ----------
        params : ndarray
            Estimated parameters (nvar by 1)
        cov_estimator : CovarianceEstimator
            Covariance estimator instance; its ``cov``, ``debiased``,
            ``s2`` and ``config`` attributes are read
        cov_type : str
            Name of the covariance estimator

        Returns
        -------
        dict
            Values keyed by name
        """
        columns = self._columns
        index = self._index
        eps = self.resids(params)
        y = self.dependent.pandas
        fitted = DataFrame(asarray(y) - eps, y.index, ["fitted_values"])
        weps = self.wresids(params)
        cov = cov_estimator.cov
        debiased = cov_estimator.debiased

        # Kept as an array until after the division below so that a zero
        # total sum of squares follows NumPy rather than Python semantics
        residual_ss = weps.T @ weps

        w = self.weights.ndarray
        e = self._wy
        if self.has_constant:
            # Remove the weighted mean when the model contains a constant
            e = e - sqrt(self.weights.ndarray) * average(self._y, weights=w)

        # squeeze before float(): converting arrays with ndim > 0 to a
        # Python scalar is deprecated in recent NumPy
        total_ss = float((e.T @ e).squeeze())
        r2 = 1 - residual_ss / total_ss

        fstat = self._f_statistic(params, cov, debiased)
        out = {
            "params": Series(params.squeeze(), columns, name="parameter"),
            "eps": Series(eps.squeeze(), index=index, name="residual"),
            "weps": Series(weps.squeeze(), index=index,
                           name="weighted residual"),
            "cov": DataFrame(cov, columns=columns, index=columns),
            "s2": float(cov_estimator.s2),
            "debiased": debiased,
            "residual_ss": float(residual_ss.squeeze()),
            "total_ss": total_ss,
            "r2": float(r2.squeeze()),
            "fstat": fstat,
            "vars": columns,
            "instruments": self._instr_columns,
            "cov_config": cov_estimator.config,
            "cov_type": cov_type,
            "method": self._method,
            "cov_estimator": cov_estimator,
            "fitted": fitted,
            "original_index": self._original_index,
        }

        return out
class TradedFactorModel(object):
    r"""Linear factor models estimator applicable to traded factors

    Parameters
    ----------
    portfolios : array-like
        Test portfolio returns (nobs by nportfolio)
    factors : array-like
        Priced factor returns (nobs by nfactor)

    Notes
    -----
    Implements both time-series estimators of risk premia, factor loadings
    and zero-alpha tests.

    The model estimated is

    .. math::

        r_{it}^e = \alpha_i + f_t \beta_i + \epsilon_{it}

    where :math:`r_{it}^e` is the excess return on test portfolio i and
    :math:`f_t` are the traded factor returns.  The model is directly
    tested using the estimated values :math:`\hat{\alpha}_i`. Risk premia,
    :math:`\lambda_i` are estimated using the sample averages of the factors,
    which must be excess returns on traded portfolios.
    """

    def __init__(self, portfolios, factors):
        self.portfolios = IVData(portfolios, var_name='portfolio')
        self.factors = IVData(factors, var_name='factor')
        self._name = self.__class__.__name__
        self._formula = None
        self._validate_data()

    def __str__(self):
        out = self.__class__.__name__
        f, p = self.factors.shape[1], self.portfolios.shape[1]
        out += ' with {0} factors, {1} test portfolios'.format(f, p)
        return out

    def __repr__(self):
        return self.__str__() + '\nid: {0}'.format(hex(id(self)))

    def _drop_missing(self):
        """
        Drop observations flagged as null in portfolios or factors

        Returns
        -------
        missing : ndarray
            Indicator of the observations flagged as missing

        Raises
        ------
        ValueError
            If every observation is flagged as missing.
        """
        data = (self.portfolios, self.factors)
        missing = np.any(np.c_[[dh.isnull for dh in data]], 0)
        if any(missing):
            if all(missing):
                raise ValueError('All observations contain missing data. '
                                 'Model cannot be estimated.')
            self.portfolios.drop(missing)
            self.factors.drop(missing)
        missing_warning(missing)
        return missing

    def _validate_data(self):
        """
        Check the inputs and drop missing observations

        Raises
        ------
        ValueError
            If portfolios and factors have different numbers of
            observations, if either contains a constant or equivalent, or
            if either does not have full column rank.
        """
        p = self.portfolios.ndarray
        f = self.factors.ndarray
        if p.shape[0] != f.shape[0]:
            raise ValueError('The number of observations in portfolios and '
                             'factors is not the same.')
        self._drop_missing()

        # Re-read the arrays after missing observations have been dropped
        p = self.portfolios.ndarray
        f = self.factors.ndarray
        if has_constant(p)[0]:
            raise ValueError(
                'portfolios must not contains a constant or equivalent.')
        if has_constant(f)[0]:
            raise ValueError(
                'factors must not contain a constant or equivalent.')
        if matrix_rank(f) < f.shape[1]:
            raise ValueError(
                'Model cannot be estimated. factors do not have full column rank.'
            )
        if matrix_rank(p) < p.shape[1]:
            raise ValueError(
                'Model cannot be estimated. portfolios do not have full column rank.'
            )

    @property
    def formula(self):
        """Formula used to create the model, or None"""
        return self._formula

    @formula.setter
    def formula(self, value):
        self._formula = value

    @staticmethod
    def _prepare_data_from_formula(formula, data, portfolios):
        """
        Build factor and portfolio design matrices from a formula

        Parameters
        ----------
        formula : str
            Either a factor-only formula (when ``portfolios`` is given) or
            a formula of the form ``portfolios ~ factors``
        data : DataFrame
            Data containing the variables used in the formula
        portfolios : array-like or None
            Portfolios to use; when None they are built from the part of
            the formula before ``~``

        Returns
        -------
        factors : DataFrame
            Factor design matrix, built without an intercept
        portfolios : array-like
            The input portfolios, or the design matrix built from the
            formula
        formula : str
            The formula as passed in
        """
        na_action = NAAction(on_NA='raise', NA_types=[])
        if portfolios is not None:
            factors = dmatrix(formula + ' + 0', data, return_type='dataframe',
                              NA_action=na_action)
        else:
            # Use a separate name for the split pieces so the formula
            # argument keeps its original value
            components = formula.split('~')
            portfolios = dmatrix(components[0].strip() + ' + 0', data,
                                 return_type='dataframe',
                                 NA_action=na_action)
            factors = dmatrix(components[1].strip() + ' + 0', data,
                              return_type='dataframe', NA_action=na_action)

        return factors, portfolios, formula

    @classmethod
    def from_formula(cls, formula, data, *, portfolios=None):
        """
        Parameters
        ----------
        formula : str
            Patsy formula modified for the syntax described in the notes
        data : DataFrame
            DataFrame containing the variables used in the formula
        portfolios : array-like, optional
            Portfolios to be used in the model

        Returns
        -------
        model : TradedFactorModel
            Model instance

        Notes
        -----
        The formula can be used in one of two ways.  The first specified only the
        factors and uses the data provided in ``portfolios`` as the test portfolios.
        The second specified the portfolio using ``+`` to separate the test portfolios
        and ``~`` to separate the test portfolios from the factors.

        Examples
        --------
        >>> from linearmodels.datasets import french
        >>> from linearmodels.asset_pricing import TradedFactorModel
        >>> data = french.load()
        >>> formula = 'S1M1 + S1M5 + S3M3 + S5M1 + S5M5 ~ MktRF + SMB + HML'
        >>> mod = TradedFactorModel.from_formula(formula, data)

        Using only factors

        >>> portfolios = data[['S1M1', 'S1M5', 'S3M1', 'S3M5', 'S5M1', 'S5M5']]
        >>> formula = 'MktRF + SMB + HML'
        >>> mod = TradedFactorModel.from_formula(formula, data, portfolios=portfolios)
        """
        factors, portfolios, formula = cls._prepare_data_from_formula(
            formula, data, portfolios)
        mod = cls(portfolios, factors)
        mod.formula = formula
        return mod

    def fit(self, cov_type='robust', debiased=True, **cov_config):
        """
        Estimate model parameters

        Parameters
        ----------
        cov_type : str, optional
            Name of covariance estimator
        debiased : bool, optional
            Flag indicating whether to debias the covariance estimator using
            a degree of freedom adjustment
        **cov_config : dict
            Additional covariance-specific options.  See Notes.

        Returns
        -------
        results : LinearFactorModelResults
            Results class with parameter estimates, covariance and test statistics

        Raises
        ------
        ValueError
            If ``cov_type`` is not one of the supported names.

        Notes
        -----
        Supported covariance estimators are:

        * 'robust', 'heteroskedastic' - Heteroskedasticity-robust covariance
          estimator
        * 'kernel' - Heteroskedasticity and Autocorrelation consistent (HAC)
          covariance estimator

        The kernel covariance estimator takes the optional arguments
        ``kernel``, one of 'bartlett', 'parzen' or 'qs' (quadratic spectral)
        and ``bandwidth`` (a positive integer).
        """
        p = self.portfolios.ndarray
        f = self.factors.ndarray
        nportfolio = p.shape[1]
        nobs, nfactor = f.shape
        fc = np.c_[np.ones((nobs, 1)), f]
        rp = f.mean(0)[:, None]
        fe = f - f.mean(0)
        b = pinv(fc) @ p
        eps = p - fc @ b
        # The first row of b holds the intercepts
        alphas = b[:1].T

        nloading = (nfactor + 1) * nportfolio
        xpxi = np.eye(nloading + nfactor)
        xpxi[:nloading, :nloading] = np.kron(np.eye(nportfolio),
                                             pinv(fc.T @ fc / nobs))
        f_rep = np.tile(fc, (1, nportfolio))
        eps_rep = np.tile(eps, (nfactor + 1, 1))  # 1 2 3 ... 25 1 2 3 ...
        eps_rep = eps_rep.ravel(order='F')
        eps_rep = np.reshape(eps_rep, (nobs, (nfactor + 1) * nportfolio),
                             order='F')
        xe = f_rep * eps_rep
        xe = np.c_[xe, fe]
        if cov_type in ('robust', 'heteroskedastic'):
            cov_est = HeteroskedasticCovariance(xe, inv_jacobian=xpxi,
                                                center=False,
                                                debiased=debiased,
                                                df=fc.shape[1])
            rp_cov_est = HeteroskedasticCovariance(fe,
                                                   jacobian=np.eye(f.shape[1]),
                                                   center=False,
                                                   debiased=debiased, df=1)
        elif cov_type == 'kernel':
            cov_est = KernelCovariance(xe, inv_jacobian=xpxi, center=False,
                                       debiased=debiased, df=fc.shape[1],
                                       **cov_config)
            # Reuse the bandwidth of the first estimator for the second
            bw = cov_est.bandwidth
            _cov_config = dict(cov_config)
            _cov_config['bandwidth'] = bw
            rp_cov_est = KernelCovariance(fe, jacobian=np.eye(f.shape[1]),
                                          center=False, debiased=debiased,
                                          df=1, **_cov_config)
        else:
            raise ValueError('Unknown cov_type: {0}'.format(cov_type))
        full_vcv = cov_est.cov
        rp_cov = rp_cov_est.cov
        vcv = full_vcv[:nloading, :nloading]

        # Rearrange VCV
        order = np.reshape(np.arange((nfactor + 1) * nportfolio),
                           (nportfolio, nfactor + 1))
        order = order.T.ravel()
        vcv = vcv[order][:, order]

        # Return values
        alpha_vcv = vcv[:nportfolio, :nportfolio]
        # squeeze before float(): converting arrays with ndim > 0 to a
        # Python scalar is deprecated in recent NumPy
        stat = float(np.squeeze(alphas.T @ pinv(alpha_vcv) @ alphas))
        jstat = WaldTestStatistic(stat, 'All alphas are 0', nportfolio,
                                  name='J-statistic')
        params = b.T
        betas = b[1:].T
        residual_ss = (eps**2).sum()
        e = p - p.mean(0)[None, :]
        total_ss = (e**2).sum()
        r2 = 1 - residual_ss / total_ss
        param_names = []
        for portfolio in self.portfolios.cols:
            param_names.append('alpha-{0}'.format(portfolio))
            for factor in self.factors.cols:
                param_names.append('beta-{0}-{1}'.format(portfolio, factor))
        for factor in self.factors.cols:
            param_names.append('lambda-{0}'.format(factor))

        res = AttrDict(params=params, cov=full_vcv, betas=betas, rp=rp,
                       rp_cov=rp_cov, alphas=alphas, alpha_vcv=alpha_vcv,
                       jstat=jstat, rsquared=r2, total_ss=total_ss,
                       residual_ss=residual_ss, param_names=param_names,
                       portfolio_names=self.portfolios.cols,
                       factor_names=self.factors.cols, name=self._name,
                       cov_type=cov_type, model=self, nobs=nobs,
                       rp_names=self.factors.cols, cov_est=cov_est)

        return LinearFactorModelResults(res)
class _FactorModelBase(object):
    r"""
    Common functionality shared by the factor model estimators.

    Parameters
    ----------
    portfolios : array_like
        Test portfolio returns (nobs by nportfolio)
    factors : array_like
        Priced factor returns (nobs by nfactor)
    """

    def __init__(self, portfolios: IVDataLike, factors: IVDataLike):
        self.portfolios = IVData(portfolios, var_name="portfolio")
        self.factors = IVData(factors, var_name="factor")
        self._name = self.__class__.__name__
        self._formula: Optional[str] = None
        self._validate_data()

    def __str__(self) -> str:
        nfactor = self.factors.shape[1]
        nportfolio = self.portfolios.shape[1]
        description = " with {0} factors, {1} test portfolios".format(
            nfactor, nportfolio)
        return self.__class__.__name__ + description

    def __repr__(self) -> str:
        identity = "\nid: {0}".format(hex(id(self)))
        return self.__str__() + identity

    def _drop_missing(self) -> NDArray:
        """
        Remove observations flagged as null in either input.

        Returns
        -------
        ndarray
            Indicator of the observations flagged as missing

        Raises
        ------
        ValueError
            If every observation is flagged as missing.
        """
        containers = (self.portfolios, self.factors)
        null_flags = np.c_[[container.isnull for container in containers]]
        missing = cast(NDArray, np.any(null_flags, 0))
        if any(missing):
            if all(missing):
                raise ValueError("All observations contain missing data. "
                                 "Model cannot be estimated.")
            for container in containers:
                container.drop(missing)
        missing_warning(missing)
        return missing

    def _validate_data(self) -> None:
        """
        Validate the inputs, dropping missing observations along the way.

        Raises
        ------
        ValueError
            If the inputs have different numbers of observations, if
            either contains a constant or equivalent, if factors do not
            have full column rank, or if there are fewer observations than
            the number of factors plus one.
        """
        nobs_portfolios = self.portfolios.ndarray.shape[0]
        nobs_factors = self.factors.ndarray.shape[0]
        if nobs_portfolios != nobs_factors:
            raise ValueError("The number of observations in portfolios and "
                             "factors is not the same.")
        self._drop_missing()

        # The checks below use the arrays as they are after dropping
        port_arr = self.portfolios.ndarray
        fact_arr = self.factors.ndarray
        nfactor = fact_arr.shape[1]
        if has_constant(port_arr)[0]:
            raise ValueError("portfolios must not contains a constant or "
                             "equivalent and must not have rank\n"
                             "less than the dimension of the smaller shape.")
        if has_constant(fact_arr)[0]:
            raise ValueError(
                "factors must not contain a constant or equivalent.")
        if np.linalg.matrix_rank(fact_arr) < nfactor:
            raise ValueError(
                "Model cannot be estimated. factors do not have full column rank."
            )
        if port_arr.shape[0] < (nfactor + 1):
            raise ValueError(
                "Model cannot be estimated. portfolios must have factors + 1 or "
                "more returns to\nestimate the model parameters.")

    @property
    def formula(self) -> Optional[str]:
        """Formula attached to the model, or None when none has been set"""
        return self._formula

    @formula.setter
    def formula(self, value: Optional[str]) -> None:
        self._formula = value

    @staticmethod
    def _prepare_data_from_formula(
            formula: str, data: DataFrame,
            portfolios: DataFrame) -> Tuple[DataFrame, DataFrame, str]:
        """
        Turn a formula into factor and portfolio design matrices.

        Parameters
        ----------
        formula : str
            Factor-only formula when ``portfolios`` is given, otherwise a
            formula of the form ``portfolios ~ factors``
        data : DataFrame
            Data containing the variables used in the formula
        portfolios : DataFrame
            Portfolios to use. When None, they are built from the part of
            the formula before ``~``

        Returns
        -------
        factors : DataFrame
            Factor design matrix, built without an intercept
        portfolios : DataFrame
            The input portfolios, or the design matrix built from the
            formula
        formula : str
            The formula as passed in
        """
        na_action = NAAction(on_NA="raise", NA_types=[])

        def design(expression: str) -> DataFrame:
            # "+ 0" removes the intercept from the design matrix
            return dmatrix(expression + " + 0", data,
                           return_type="dataframe", NA_action=na_action)

        if portfolios is None:
            pieces = formula.split("~")
            portfolios = design(pieces[0].strip())
            factors = design(pieces[1].strip())
        else:
            factors = design(formula)

        return factors, portfolios, formula
class IVLIML(object):
    r"""
    Limited information ML and k-class estimation of IV models

    Parameters
    ----------
    dependent : array-like
        Endogenous variables (nobs by 1)
    exog : array-like
        Exogenous regressors (nobs by nexog)
    endog : array-like
        Endogenous regressors (nobs by nendog)
    instruments : array-like
        Instrumental variables (nobs by ninstr)
    weights : array-like, optional
        Observation weights used in estimation
    fuller : float, optional
        Fuller's alpha to modify LIML estimator. Default returns unmodified
        LIML estimator.
    kappa : float, optional
        Parameter value for k-class estimation. If not provided, computed to
        produce LIML parameter estimate.

    Notes
    -----
    ``kappa`` and ``fuller`` should not be used simultaneously since Fuller's
    alpha applies an adjustment to ``kappa``, and so the same result can be
    computed using only ``kappa``. Fuller's alpha is used to adjust the
    LIML estimate of :math:`\kappa`, which is computed whenever ``kappa``
    is not provided.

    The LIML estimator is defined as

    .. math::

      \hat{\beta}_{\kappa} & =(X'(I-\kappa M_{z})X)^{-1}X'(I-\kappa M_{z})Y\\
      M_{z} & =I-P_{z}\\
      P_{z} & =Z(Z'Z)^{-1}Z'

    where :math:`Z` contains both the exogenous regressors and the
    instruments. :math:`\kappa` is estimated as part of the LIML estimator.

    When using Fuller's :math:`\alpha`, the value used is modified to

    .. math::

      \kappa-\alpha/(n-n_{instr})

    .. todo::

      * VCV: bootstrap

    See Also
    --------
    IV2SLS, IVGMM, IVGMMCUE
    """

    def __init__(self, dependent: ArrayLike, exog: OptionalArrayLike,
                 endog: OptionalArrayLike, instruments: OptionalArrayLike, *,
                 weights: OptionalArrayLike = None, fuller: Numeric = 0,
                 kappa: OptionalNumeric = None):
        self.dependent = IVData(dependent, var_name='dependent')
        nobs = self.dependent.shape[0]  # type: int
        self.exog = IVData(exog, var_name='exog', nobs=nobs)
        self.endog = IVData(endog, var_name='endog', nobs=nobs)
        self.instruments = IVData(instruments, var_name='instruments',
                                  nobs=nobs)
        self._original_index = self.dependent.pandas.index
        if weights is None:
            weights = ones(self.dependent.shape)
        weights = IVData(weights).ndarray
        if any(weights <= 0):
            raise ValueError('weights must be strictly positive.')
        # Normalize so that the weights average to one (ignoring NaNs)
        weights = weights / nanmean(weights)
        self.weights = IVData(weights, var_name='weights', nobs=nobs)

        self._drop_locs = self._drop_missing()
        # dependent variable
        w = sqrt(self.weights.ndarray)
        self._y = self.dependent.ndarray
        self._wy = self._y * w
        # model regressors
        self._x = c_[self.exog.ndarray, self.endog.ndarray]
        self._wx = self._x * w
        # first-stage regressors
        self._z = c_[self.exog.ndarray, self.instruments.ndarray]
        self._wz = self._z * w

        self._has_constant = False
        self._regressor_is_exog = array([True] * self.exog.shape[1] +
                                        [False] * self.endog.shape[1])
        self._columns = self.exog.cols + self.endog.cols
        self._instr_columns = self.exog.cols + self.instruments.cols
        self._index = self.dependent.rows

        self._validate_inputs()

        # Validate before kappa/fuller are used in comparisons below so that
        # array inputs produce these messages rather than an ambiguous
        # truth-value error.
        if kappa is not None and not isscalar(kappa):
            raise ValueError('kappa must be None or a scalar')
        if not isscalar(fuller):
            raise ValueError('fuller must be a scalar')

        # Subclasses may set _method/_result_container before calling this
        # initializer; only fill in defaults when they have not.
        if not hasattr(self, '_method'):
            self._method = 'IV-LIML'
            additional = []
            if fuller != 0:
                additional.append('fuller(alpha={0})'.format(fuller))
            if kappa is not None:
                additional.append('kappa={0}'.format(kappa))
            if additional:
                self._method += '(' + ', '.join(additional) + ')'
        if not hasattr(self, '_result_container'):
            self._result_container = IVResults

        self._kappa = kappa
        self._fuller = fuller
        if kappa is not None and fuller != 0:
            import warnings
            warnings.warn(
                'kappa and fuller should not normally be used '
                'simultaneously. Identical results can be computed '
                'using kappa only', UserWarning)
        if endog is None and instruments is None:
            self._result_container = OLSResults
            self._method = 'OLS'
        self._formula = None

    @staticmethod
    def from_formula(formula, data, *, weights=None, fuller=0, kappa=None):
        """
        Create a model from a formula

        Parameters
        ----------
        formula : str
            Patsy formula modified for the IV syntax described in the notes
            section
        data : DataFrame
            DataFrame containing the variables used in the formula
        weights : array-like, optional
            Observation weights used in estimation
        fuller : float, optional
            Fuller's alpha to modify LIML estimator. Default returns
            unmodified LIML estimator.
        kappa : float, optional
            Parameter value for k-class estimation. If not provided,
            computed to produce LIML parameter estimate.

        Returns
        -------
        model : IVLIML
            Model instance

        Notes
        -----
        The IV formula modifies the standard Patsy formula to include a
        block of the form [endog ~ instruments] which is used to indicate
        the list of endogenous variables and instruments. The general
        structure is `dependent ~ exog [endog ~ instruments]` and it must
        be the case that the formula expressions constructed from blocks
        `dependent ~ exog endog` and `dependent ~ exog instruments` are both
        valid Patsy formulas.

        A constant must be explicitly included using '1 +' if required.

        Examples
        --------
        >>> import numpy as np
        >>> from linearmodels.datasets import wage
        >>> from linearmodels.iv import IVLIML
        >>> data = wage.load()
        >>> formula = 'np.log(wage) ~ 1 + exper + exper ** 2 + brthord + [educ ~ sibs]'
        >>> mod = IVLIML.from_formula(formula, data)
        """
        parser = IVFormulaParser(formula, data)
        dep, exog, endog, instr = parser.data
        mod = IVLIML(dep, exog, endog, instr, weights=weights,
                     fuller=fuller, kappa=kappa)
        mod.formula = formula
        return mod

    def predict(self, params, *, exog=None, endog=None, data=None,
                eval_env=4):
        """
        Predict values for additional data

        Parameters
        ----------
        params : array-like
            Model parameters (nvar by 1)
        exog : array-like
            Exogenous regressors (nobs by nexog)
        endog : array-like
            Endogenous regressors (nobs by nendog)
        data : DataFrame
            Values to use when making predictions from a model constructed
            from a formula
        eval_env : int
            Depth of use when evaluating formulas using Patsy.

        Returns
        -------
        predictions : DataFrame
            Fitted values from supplied data and parameters

        Raises
        ------
        ValueError
            If ``data`` is provided but the model has no formula, or if
            ``data`` is combined with ``exog`` or ``endog``.

        Notes
        -----
        The number of parameters must satisfy nvar = nexog + nendog.

        When using `exog` and `endog`, regressor matrix is constructed as
        `[exog, endog]` and so parameters must be aligned to this structure.
        This is the same structure used in model estimation.

        If `data` is not none, then `exog` and `endog` must be none.
        Predictions from models constructed using formulas can
        be computed using either `exog` and `endog`, which will treat these
        as arrays of values corresponding to the formula-processed data, or
        using `data` which will be processed using the formula used to
        construct the values corresponding to the original model
        specification.
        """
        if data is not None and self.formula is None:
            raise ValueError('Unable to use data when the model was not '
                             'created using a formula.')
        if data is not None and (exog is not None or endog is not None):
            raise ValueError('Predictions can only be constructed using one '
                             'of exog/endog or data, but not both.')
        if exog is not None or endog is not None:
            exog = IVData(exog).pandas
            endog = IVData(endog).pandas
        else:
            parser = IVFormulaParser(self.formula, data, eval_env=eval_env)
            exog = parser.exog
            endog = parser.endog
        # axis must be passed by keyword: pandas >= 2.0 rejects a positional
        # axis argument to concat
        exog_endog = concat([exog, endog], axis=1)
        x = asarray(exog_endog)
        params = atleast_2d(asarray(params))
        # A 1-d input becomes a row after atleast_2d; use a column instead
        if params.shape[0] == 1:
            params = params.T
        pred = DataFrame(x @ params, index=exog_endog.index,
                         columns=['predictions'])

        return pred

    @property
    def formula(self):
        """Formula used to create the model"""
        return self._formula

    @formula.setter
    def formula(self, value):
        """Formula used to create the model"""
        self._formula = value

    def _validate_inputs(self):
        """
        Check regressor and instrument dimensions and rank

        Raises
        ------
        ValueError
            If there are no regressors, fewer instruments than endogenous
            regressors, or either [exog endog] or [exog instruments] does
            not have full column rank.

        Notes
        -----
        Also records whether the regressors contain a constant, and its
        location, using ``has_constant``.
        """
        x, z = self._x, self._z
        if x.shape[1] == 0:
            raise ValueError('Model must contain at least one regressor.')
        if self.instruments.shape[1] < self.endog.shape[1]:
            raise ValueError(
                'The number of instruments ({0}) must be at least '
                'as large as the number of endogenous regressors'
                ' ({1}).'.format(self.instruments.shape[1],
                                 self.endog.shape[1]))
        if matrix_rank(x) < x.shape[1]:
            raise ValueError('regressors [exog endog] do not have full '
                             'column rank')
        if matrix_rank(z) < z.shape[1]:
            raise ValueError('instruments [exog instruments] do not have '
                             'full column rank')
        self._has_constant, self._const_loc = has_constant(x)

    def _drop_missing(self):
        """
        Drop observations with missing values from all model data

        Returns
        -------
        missing : ndarray
            Indicator that is True for observations flagged as null in any
            of the dependent, exog, endog, instruments or weights.

        Raises
        ------
        ValueError
            If every observation is flagged as missing.
        """
        data = (self.dependent, self.exog, self.endog, self.instruments,
                self.weights)
        # Stack the null flags of each container and reduce across them
        missing = any(c_[[dh.isnull for dh in data]], 0)
        if any(missing):
            if all(missing):
                raise ValueError('All observations contain missing data. '
                                 'Model cannot be estimated.')
            self.dependent.drop(missing)
            self.exog.drop(missing)
            self.endog.drop(missing)
            self.instruments.drop(missing)
            self.weights.drop(missing)
            # Only reached when something was dropped
            missing_warning(missing)
        return missing

    @staticmethod
    def estimate_parameters(x, y, z, kappa):
        """
        Parameter estimation without error checking

        Parameters
        ----------
        x : ndarray
            Regressor matrix (nobs by nvar)
        y : ndarray
            Regressand matrix (nobs by 1)
        z : ndarray
            Instrument matrix (nobs by ninstr)
        kappa : scalar
            Parameter value for k-class estimator

        Returns
        -------
        params : ndarray
            Estimated parameters (nvar by 1)

        Notes
        -----
        Exposed as a static method to facilitate estimation with other data,
        e.g., bootstrapped samples. Performs no error checking.
        """
        pinvz = pinv(z)
        # z @ pinv(z) is the projection onto the columns of z; it is applied
        # as two thin products so the nobs by nobs matrix is never formed.
        p1 = (x.T @ x) * (1 - kappa) + kappa * ((x.T @ z) @ (pinvz @ x))
        p2 = (x.T @ y) * (1 - kappa) + kappa * ((x.T @ z) @ (pinvz @ y))
        return inv(p1) @ p2

    def _estimate_kappa(self):
        """
        Estimate the LIML value of kappa from the weighted data

        Returns
        -------
        kappa : float
            Smallest eigenvalue of the matrix built from the residuals of
            [y, endog] after projecting on the exogenous regressors and
            after projecting on the full set of first-stage regressors.
        """
        y, x, z = self._wy, self._wx, self._wz
        is_exog = self._regressor_is_exog
        e = c_[y, x[:, ~is_exog]]
        x1 = x[:, is_exog]

        # Residuals after projecting on all first-stage regressors
        ez = e - z @ (pinv(z) @ e)
        if x1.shape[1] == 0:  # No exogenous regressors
            ex1 = e
        else:
            ex1 = e - x1 @ (pinv(x1) @ e)

        vpmzv_sqinv = inv_sqrth(ez.T @ ez)
        q = vpmzv_sqinv @ (ex1.T @ ex1) @ vpmzv_sqinv
        return min(eigvalsh(q))

    def fit(self, *, cov_type='robust', debiased=False, **cov_config):
        """
        Estimate model parameters

        Parameters
        ----------
        cov_type : str, optional
            Name of covariance estimator to use. Supported covariance
            estimators are:

            * 'unadjusted', 'homoskedastic' - Classic homoskedastic inference
            * 'robust', 'heteroskedastic' - Heteroskedasticity robust
              inference
            * 'kernel' - Heteroskedasticity and autocorrelation robust
              inference
            * 'cluster' - One-way cluster dependent inference.
              Heteroskedasticity robust

        debiased : bool, optional
            Flag indicating whether to debias the covariance estimator using
            a degree of freedom adjustment.
        **cov_config
            Additional parameters to pass to covariance estimator. The list
            of optional parameters differ according to ``cov_type``. See
            the documentation of the alternative covariance estimators for
            the complete list of available commands.

        Returns
        -------
        results : IVResults
            Results container

        Notes
        -----
        Additional covariance parameters depend on specific covariance used.
        See the docstring of specific covariance estimator for a list of
        supported options. Defaults are used if no covariance configuration
        is provided.

        See also
        --------
        linearmodels.iv.covariance.HomoskedasticCovariance
        linearmodels.iv.covariance.HeteroskedasticCovariance
        linearmodels.iv.covariance.KernelCovariance
        linearmodels.iv.covariance.ClusteredCovariance
        """
        wy, wx, wz = self._wy, self._wx, self._wz
        # The LIML kappa is always computed since it is reported in results
        liml_kappa = self._estimate_kappa()
        kappa = self._kappa
        if kappa is None:
            kappa = liml_kappa

        if self._fuller != 0:
            nobs, ninstr = wz.shape
            kappa -= self._fuller / (nobs - ninstr)

        params = self.estimate_parameters(wx, wy, wz, kappa)

        cov_estimator = COVARIANCE_ESTIMATORS[cov_type]
        cov_config['debiased'] = debiased
        cov_config['kappa'] = kappa
        # 'center' is removed before the options reach the estimator
        cov_config_copy = dict(cov_config)
        cov_config_copy.pop('center', None)
        cov_estimator = cov_estimator(wx, wy, wz, params, **cov_config_copy)

        results = {'kappa': kappa,
                   'liml_kappa': liml_kappa}
        pe = self._post_estimation(params, cov_estimator, cov_type)
        results.update(pe)

        return self._result_container(results, self)

    def wresids(self, params):
        """
        Compute weighted model residuals

        Parameters
        ----------
        params : ndarray
            Model parameters (nvar by 1)

        Returns
        -------
        wresids : ndarray
            Weighted model residuals

        Notes
        -----
        Uses weighted versions of data instead of raw data. Identical to
        resids if all weights are unity.
        """
        return self._wy - self._wx @ params

    def resids(self, params):
        """
        Compute model residuals

        Parameters
        ----------
        params : ndarray
            Model parameters (nvar by 1)

        Returns
        -------
        resids : ndarray
            Model residuals
        """
        return self._y - self._x @ params

    @property
    def has_constant(self):
        """Flag indicating the model includes a constant or equivalent"""
        return self._has_constant

    @property
    def isnull(self):
        """Locations of observations with missing values"""
        return self._drop_locs

    @property
    def notnull(self):
        """Locations of observations included in estimation"""
        return logical_not(self._drop_locs)

    def _f_statistic(self, params, cov, debiased):
        """
        Wald test that all parameters on non-constant columns are zero

        Parameters
        ----------
        params : ndarray
            Model parameters (nvar by 1)
        cov : ndarray
            Parameter covariance (nvar by nvar)
        debiased : bool
            If True, the statistic is divided by the number of restrictions
            and a denominator degree of freedom of nobs - nvar is passed to
            ``WaldTestStatistic``.

        Returns
        -------
        wald : WaldTestStatistic
            Test statistic container
        """
        # Columns with zero range are constant and excluded from the test
        non_const = ~(ptp(self._x, 0) == 0)
        test_params = params[non_const]
        test_cov = cov[non_const][:, non_const]
        test_stat = test_params.T @ inv(test_cov) @ test_params
        # Squeeze to 0-d first: float() on an array with ndim > 0 is
        # deprecated in NumPy >= 1.25
        test_stat = float(asarray(test_stat).squeeze())
        nobs, nvar = self._x.shape
        null = 'All parameters ex. constant are zero'
        name = 'Model F-statistic'
        df = test_params.shape[0]
        if debiased:
            wald = WaldTestStatistic(test_stat / df, null, df, nobs - nvar,
                                     name=name)
        else:
            wald = WaldTestStatistic(test_stat, null, df, name=name)

        return wald

    def _post_estimation(self, params, cov_estimator, cov_type):
        """
        Collect estimation output used to build a results container

        Parameters
        ----------
        params : ndarray
            Estimated parameters (nvar by 1)
        cov_estimator : object
            Covariance estimator instance; its ``cov``, ``debiased``, ``s2``
            and ``config`` attributes are read.
        cov_type : str
            Name of the covariance estimator

        Returns
        -------
        out : dict
            Parameters, residuals, covariance, sums of squares, R-squared,
            F-statistic and model metadata keyed by name.
        """
        columns = self._columns
        index = self._index
        eps = self.resids(params)
        y = self.dependent.pandas
        fitted = DataFrame(asarray(y) - eps, y.index, ['fitted_values'])
        weps = self.wresids(params)
        cov = cov_estimator.cov
        debiased = cov_estimator.debiased

        # Squeeze (1, 1) products to 0-d before float(): converting arrays
        # with ndim > 0 to a scalar is deprecated in NumPy >= 1.25
        residual_ss = float(asarray(weps.T @ weps).squeeze())

        w = self.weights.ndarray
        e = self._wy
        if self.has_constant:
            # Center using the weighted average of y when a constant is
            # included
            e = e - sqrt(self.weights.ndarray) * average(self._y, weights=w)

        total_ss = float(asarray(e.T @ e).squeeze())
        r2 = 1 - residual_ss / total_ss

        fstat = self._f_statistic(params, cov, debiased)
        out = {'params': Series(params.squeeze(), columns, name='parameter'),
               'eps': Series(eps.squeeze(), index=index, name='residual'),
               'weps': Series(weps.squeeze(), index=index,
                              name='weighted residual'),
               'cov': DataFrame(cov, columns=columns, index=columns),
               's2': float(cov_estimator.s2),
               'debiased': debiased,
               'residual_ss': float(residual_ss),
               'total_ss': float(total_ss),
               'r2': float(r2),
               'fstat': fstat,
               'vars': columns,
               'instruments': self._instr_columns,
               'cov_config': cov_estimator.config,
               'cov_type': cov_type,
               'method': self._method,
               'cov_estimator': cov_estimator,
               'fitted': fitted,
               'original_index': self._original_index}

        return out