def test_preconditioner_subclass():
    """Check that ``preconditioner`` keeps the ndarray subclass of its input.

    Exercised for both ``copy=True`` and ``copy=False``.
    """

    class SubArray(np.ndarray):
        pass

    rs = np.random.RandomState(0)
    values = rs.standard_normal((100, 10))
    values = values.view(SubArray)

    val_cond, cond = preconditioner(values, copy=True)
    # The second return value must match the column norms of the input
    assert_allclose(np.sqrt((values**2).sum(0)), cond)
    # Identity comparison of the types: the exact subclass must be returned
    assert type(val_cond) is type(values)

    # Test in-place
    val_cond, cond = preconditioner(values, copy=False)
    # With copy=False, ``values`` itself must now have unit column norms
    assert_allclose(np.sqrt((values**2).sum(0)), np.ones(10))
    assert type(val_cond) is type(values)
def category_continuous_interaction(
    cat: AnyPandas, cont: AnyPandas, precondition: bool = True
) -> sp.csc_matrix:
    """
    Build the sparse interaction of a categorical and a continuous variable

    Parameters
    ----------
    cat : Series
        Categorical series to convert to dummy variables
    cont : {Series, DataFrame}
        Continuous variable values to use in the dummy interaction
    precondition : bool
        Flag whether dummies should be preconditioned

    Returns
    -------
    csc_matrix
        Sparse matrix of dummy interactions. When ``precondition`` is True
        the matrix is the first element returned by ``preconditioner``.
    """
    codes = category_product(cat).cat.codes
    # One entry per observation: row i, column codes[i], value cont[i]
    entries = cont.to_numpy().flat
    row_index = arange(codes.shape[0])
    interaction = sp.csc_matrix((entries, (row_index, codes)))
    if precondition:
        conditioned = preconditioner(interaction)[0]
        assert isinstance(conditioned, sp.csc_matrix)
        return conditioned
    return interaction
def test_preconditioner_sparse():
    """Check ``preconditioner`` with ``copy=True`` on a sparse CSC input.

    The returned scale must equal the column norms, the result must be a new
    object, and the input must be left unchanged.
    """
    rs = np.random.RandomState(0)
    values = scipy.sparse.csc_matrix(rs.standard_normal((100, 10)))
    orig = values.copy()

    val_cond, cond = preconditioner(values, copy=True)

    # asarray + ravel flattens the column sums whether ``sum`` returns a
    # np.matrix or an ndarray, avoiding the matrix-only ``.A1`` shorthand
    col_sq_sums = np.asarray(values.multiply(values).sum(0)).ravel()
    assert_allclose(np.sqrt(col_sq_sums), cond)
    assert val_cond is not values
    # ``toarray`` is the portable spelling of the matrix-only ``.A``
    assert_array_equal(orig.toarray(), values.toarray())
def test_preconditioner_copy():
    """Check that ``preconditioner`` with ``copy=True`` leaves a dense input intact."""
    rng = np.random.RandomState(0)
    values = rng.standard_normal((100, 10))
    snapshot = values.copy()

    val_cond, cond = preconditioner(values, copy=True)

    # The returned scale must equal the column norms of the original data
    expected_norms = np.sqrt((snapshot ** 2).sum(0))
    assert_allclose(expected_norms, cond)
    # A new object must be returned and the input must be unchanged
    assert val_cond is not values
    assert_array_equal(snapshot, values)
def _first_time_fit(
    self, use_cache: bool, lsmr_options: Optional[Dict[str, Union[float, bool]]]
) -> None:
    """
    Annihilate the absorbed regressors from the dependent and exog data

    Parameters
    ----------
    use_cache : bool
        Forwarded to ``lsmr_annihilate`` together with the regressor hash
    lsmr_options : {dict, None}
        Extra keyword arguments forwarded to ``lsmr_annihilate``. ``None``
        is treated as no extra options

    Notes
    -----
    Stores the residualized data in ``_absorbed_dependent`` and
    ``_absorbed_exog``, and updates the parameter count and the
    constant-related flags on the instance.
    """
    if self._is_weighted:
        obs_weights = self.weights.ndarray
    else:
        obs_weights = None

    absorber = AbsorbingRegressor(
        cat=self._absorb_inter.cat,
        cont=self._absorb_inter.cont,
        interactions=self._interaction_list,
        weights=obs_weights,
    )
    absorbs_constant = absorber.has_constant
    conditioned = preconditioner(absorber.regressors)
    self._regressors = conditioned[0]
    self._num_params += absorber.approx_rank
    # Do not double count intercept-like terms
    self._has_constant = self._has_constant_exog or absorbs_constant
    self._num_params -= min(self._has_constant_exog, absorbs_constant)
    self._regressors_hash = absorber.hash
    self._constant_absorbed = self._has_constant_exog and absorbs_constant

    raw_dep = self._dependent.ndarray
    raw_exog = self._exog.ndarray

    # Scale the data by the square root of the weights
    root_w = sqrt(self._weight_data.ndarray)
    w_dep = root_w * raw_dep
    w_exog = root_w * raw_exog
    root_w_t = root_w.T
    scale = root_w_t @ root_w
    dep_mean = (root_w_t @ w_dep) / scale
    exog_mean = (root_w_t @ w_exog) / scale

    solver_options = lsmr_options
    if solver_options is None:
        solver_options = {}

    assert isinstance(self._regressors, csc_matrix)
    if self._regressors.shape[1] > 0:
        dep_resid, exog_resid = [
            lsmr_annihilate(
                self._regressors,
                target,
                use_cache,
                self._regressors_hash,
                **solver_options,
            )
            for target in (w_dep, w_exog)
        ]
    else:
        # Nothing to absorb, the weighted data are already the residuals
        dep_resid, exog_resid = w_dep, w_exog

    if self._constant_absorbed:
        # Add the weighted averages back when the constant was absorbed
        dep_resid += root_w * dep_mean
        exog_resid += root_w * exog_mean

    dep_frame = self._dependent.pandas
    self._absorbed_dependent = DataFrame(
        dep_resid, index=dep_frame.index, columns=self._dependent.pandas.columns
    )
    exog_frame = self._exog.pandas
    self._absorbed_exog = DataFrame(
        exog_resid, index=exog_frame.index, columns=self._exog.pandas.columns
    )
def _first_time_fit(
    self,
    use_cache: bool,
    absorb_options: Optional[
        Dict[str, Union[bool, str, ArrayLike, None, Dict[str, Any]]]
    ],
    method: str,
) -> None:
    """
    Residualize the dependent and exog data against the absorbed regressors

    Parameters
    ----------
    use_cache : bool
        Forwarded to ``lsmr_annihilate`` when the LSMR path is used
    absorb_options : {dict, None}
        Keyword arguments forwarded to ``pyhdfe.create`` (HDFE path) or to
        ``lsmr_annihilate`` (LSMR path). The dictionary passed by the caller
        is never modified
    method : str
        "auto" or "hdfe" allow the HDFE path, which is only taken when the
        model is unweighted and there are no continuous absorbed variables
        and no interactions. Otherwise the LSMR path is used

    Raises
    ------
    RuntimeError
        If ``method`` is "hdfe" but the HDFE path cannot be used
    ValueError
        If absorbed columns are being dropped and no exog column is retained

    Notes
    -----
    Stores the residualized data in ``_absorbed_dependent`` and
    ``_absorbed_exog``. When absorbed columns are dropped, ``_columns`` is
    reduced to the retained columns and an ``AbsorbingEffectWarning`` lists
    those dropped.
    """
    weights = (
        cast(Float64Array, self.weights.ndarray) if self._is_weighted else None
    )

    use_hdfe = weights is None and method in ("auto", "hdfe")
    use_hdfe = use_hdfe and not self._absorb_inter.cont.shape[1]
    use_hdfe = use_hdfe and not self._interaction_list

    if not use_hdfe and method == "hdfe":
        raise RuntimeError(
            "HDFE has been set as the method but the model cannot be estimated "
            "using HDFE. HDFE requires that the model is unweighted and that the "
            "absorbed regressors include only fixed effects (dummy variables)."
        )

    areg = AbsorbingRegressor(
        cat=self._absorb_inter.cat,
        cont=self._absorb_inter.cont,
        interactions=self._interaction_list,
        weights=weights,
    )
    areg_constant = areg.has_constant
    self._regressors = areg.regressors
    self._num_params += areg.approx_rank
    # Do not double count intercept-like terms
    self._has_constant = self._has_constant_exog or areg_constant
    self._num_params -= min(self._has_constant_exog, areg_constant)
    self._regressors_hash = areg.hash
    self._constant_absorbed = self._has_constant_exog and areg_constant

    dep = self._dependent.ndarray
    exog = cast(Float64Array, self._exog.ndarray)

    # Scale the data by the square root of the weights
    root_w = sqrt(self._weight_data.ndarray)
    dep = root_w * dep
    exog = root_w * exog
    denom = root_w.T @ root_w
    mu_dep = (root_w.T @ dep) / denom
    mu_exog = (root_w.T @ exog) / denom

    # Work on a shallow copy: the HDFE path inserts "drop_singletons" below,
    # and that key must not leak into the dictionary owned by the caller
    absorb_options = {} if absorb_options is None else dict(absorb_options)
    assert isinstance(self._regressors, sp.csc_matrix)
    if self._regressors.shape[1] > 0:
        if use_hdfe:
            # Imported here so pyhdfe is only needed when HDFE is used
            from pyhdfe import create

            absorb_options["drop_singletons"] = False
            algo = create(self._absorb_inter.cat, **absorb_options)
            dep_exog = column_stack((dep, exog))
            resids = algo.residualize(dep_exog)
            dep_resid = resids[:, :1]
            exog_resid = resids[:, 1:]
        else:
            self._regressors = preconditioner(self._regressors)[0]
            dep_exog = column_stack((dep, exog))
            resid = lsmr_annihilate(
                self._regressors,
                dep_exog,
                use_cache,
                self._regressors_hash,
                **absorb_options,
            )
            dep_resid = resid[:, :1]
            exog_resid = resid[:, 1:]
    else:
        # Nothing to absorb, the weighted data are already the residuals
        dep_resid = dep
        exog_resid = exog

    if self._constant_absorbed:
        # Add the weighted averages back when the constant was absorbed
        dep_resid += root_w * mu_dep
        exog_resid += root_w * mu_exog

    if not self._drop_absorbed:
        check_absorbed(exog_resid, self.exog.cols, exog)
    else:
        ncol = exog_resid.shape[1]
        retain = not_absorbed(exog_resid)
        if not retain:
            raise ValueError(
                "All columns in exog have been fully absorbed by the "
                "included effects. This model cannot be estimated."
            )
        elif len(retain) < ncol:
            drop = set(range(ncol)).difference(retain)
            dropped = ", ".join([str(self.exog.cols[i]) for i in drop])
            warnings.warn(
                absorbing_warn_msg.format(absorbed_variables=dropped),
                AbsorbingEffectWarning,
            )

        exog_resid = exog_resid[:, retain]
        self._columns = [self._columns[i] for i in retain]

    self._absorbed_dependent = DataFrame(
        dep_resid,
        index=self._dependent.pandas.index,
        columns=self._dependent.pandas.columns,
    )
    self._absorbed_exog = DataFrame(
        exog_resid, index=self._exog.pandas.index, columns=self._columns
    )
def _first_time_fit(
    self, use_cache: bool, lsmr_options: Optional[Dict[str, Union[float, bool]]]
) -> None:
    """
    Annihilate the absorbed regressors and handle fully absorbed exog columns

    Parameters
    ----------
    use_cache : bool
        Forwarded to ``lsmr_annihilate`` together with the regressor hash
    lsmr_options : {dict, None}
        Extra keyword arguments forwarded to ``lsmr_annihilate``. ``None``
        is treated as no extra options

    Raises
    ------
    ValueError
        If absorbed columns are being dropped and no exog column is retained

    Notes
    -----
    Stores the residualized data in ``_absorbed_dependent`` and
    ``_absorbed_exog``. When absorbed columns are dropped, ``_columns`` is
    reduced to the retained columns and an ``AbsorbingEffectWarning`` lists
    those dropped.
    """
    obs_weights = None
    if self._is_weighted:
        obs_weights = self.weights.ndarray

    absorber = AbsorbingRegressor(
        cat=self._absorb_inter.cat,
        cont=self._absorb_inter.cont,
        interactions=self._interaction_list,
        weights=obs_weights,
    )
    absorbs_constant = absorber.has_constant
    self._regressors = preconditioner(absorber.regressors)[0]
    self._num_params += absorber.approx_rank
    # Do not double count intercept-like terms
    self._has_constant = self._has_constant_exog or absorbs_constant
    self._num_params -= min(self._has_constant_exog, absorbs_constant)
    self._regressors_hash = absorber.hash
    self._constant_absorbed = self._has_constant_exog and absorbs_constant

    raw_dep = self._dependent.ndarray
    raw_exog = self._exog.ndarray

    # Scale the data by the square root of the weights
    root_w = sqrt(self._weight_data.ndarray)
    w_dep = root_w * raw_dep
    w_exog = root_w * raw_exog
    scale = root_w.T @ root_w
    dep_mean = (root_w.T @ w_dep) / scale
    exog_mean = (root_w.T @ w_exog) / scale

    solver_options = {} if lsmr_options is None else lsmr_options
    assert isinstance(self._regressors, sp.csc_matrix)

    def _annihilate(target):
        # Residualize a single block of data against the absorbed regressors
        return lsmr_annihilate(
            self._regressors,
            target,
            use_cache,
            self._regressors_hash,
            **solver_options,
        )

    if self._regressors.shape[1] > 0:
        dep_resid = _annihilate(w_dep)
        exog_resid = _annihilate(w_exog)
    else:
        # Nothing to absorb, the weighted data are already the residuals
        dep_resid = w_dep
        exog_resid = w_exog

    if self._constant_absorbed:
        # Add the weighted averages back when the constant was absorbed
        dep_resid += root_w * dep_mean
        exog_resid += root_w * exog_mean

    if self._drop_absorbed:
        n_exog = exog_resid.shape[1]
        kept = not_absorbed(exog_resid)
        if not kept:
            raise ValueError(
                "All columns in exog have been fully absorbed by the "
                "included effects. This model cannot be estimated."
            )
        if len(kept) < n_exog:
            removed = set(range(n_exog)).difference(kept)
            removed_names = [str(self.exog.cols[i]) for i in removed]
            message = absorbing_warn_msg.format(
                absorbed_variables=", ".join(removed_names)
            )
            warnings.warn(message, AbsorbingEffectWarning)

        exog_resid = exog_resid[:, kept]
        self._columns = [self._columns[i] for i in kept]
    else:
        check_absorbed(exog_resid, self.exog.cols, w_exog)

    self._absorbed_dependent = DataFrame(
        dep_resid,
        index=self._dependent.pandas.index,
        columns=self._dependent.pandas.columns,
    )
    self._absorbed_exog = DataFrame(
        exog_resid, index=self._exog.pandas.index, columns=self._columns
    )