def _build_absorb(self, ids: Array) -> 'Absorb':
    """Build a function used to absorb fixed effects defined by columns of IDs."""
    import pyhdfe
    return Absorb(pyhdfe.create(
        ids, drop_singletons=False, compute_degrees=False,
        residualize_method=self._absorb_method, options=self._absorb_options))
def __init__(self, df, target, predictors, ids, cluster_ids=[], drop_singletons=True):
    """
    Args:
        df (pandas DataFrame): dataframe containing the referenced data, which
            includes the target, predictors, and ids.
        target (string): name of the target variable.
        predictors (list of strings): names of the predictors.
        ids (list of strings): names of the variables to be absorbed.
        cluster_ids (list of strings): names of the variables to cluster on.
        drop_singletons (bool): whether to drop singleton groups. Default is True.
    """
    self.df = df
    self.algo = pyhdfe.create(ids=get_np_columns(df, ids),
                              cluster_ids=get_np_columns(df, cluster_ids),
                              drop_singletons=drop_singletons,
                              degrees_method='pairwise')
    self.all_names = [target] + predictors
    self.residualized = self.algo.residualize(
        get_np_columns(df, [target] + predictors + cluster_ids))

    # Build a patsy formula so that each coefficient keeps its variable name.
    self.formula = target + '~' + predictors[0]
    for name in predictors[1:]:
        self.formula = self.formula + '+' + name
    # The intercept is redundant once the fixed effects have been absorbed.
    self.formula = self.formula + '-1'

    df_residualized = pd.DataFrame()
    for i, name in enumerate(self.all_names):
        df_residualized[name] = self.residualized[:, i]
    y, X = dmatrices(self.formula, data=df_residualized, return_type='dataframe')
    self.model = sm.OLS(y, X)

    if cluster_ids == []:
        # Degrees of freedom: observations minus predictors minus absorbed degrees.
        self.model.df_resid = self.residualized.shape[0] - len(predictors) - self.algo.degrees
    else:
        # With clustering, degrees of freedom are based on the smallest number of clusters.
        clusters = get_np_columns(df, cluster_ids)[~self.algo._singleton_indices]
        min_cluster_count = np.unique(clusters[:, 0]).shape[0]
        for i in range(1, clusters.shape[1]):
            current_count = np.unique(clusters[:, i]).shape[0]
            if current_count < min_cluster_count:
                min_cluster_count = current_count
        self.model.df_resid = min_cluster_count - len(predictors)
def _build_absorb(self, ids: Array) -> Callable[[Array], Tuple[Array, List[Error]]]:
    """Build a function used to absorb fixed effects defined by columns of IDs."""
    import pyhdfe

    # initialize the algorithm for repeated absorption
    algorithm = pyhdfe.create(
        ids, drop_singletons=False, compute_degrees=False,
        residualize_method=self._absorb_method, options=self._absorb_options)

    def absorb(matrix: Array) -> Tuple[Array, List[Error]]:
        """Handle any absorption errors."""
        errors: List[Error] = []
        try:
            matrix = algorithm.residualize(matrix)
        except Exception as exception:
            errors.append(exceptions.AbsorptionError(exception))
        return matrix, errors

    return absorb
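# A minimal standalone sketch of how such a closure might be used, dropping the
# pyblp-specific typing and error classes; the toy `ids` and `matrix` arrays below
# are made up for illustration.
import numpy as np
import pyhdfe

def build_absorb(ids):
    """Return a function that residualizes a matrix against the fixed effects in ids."""
    algorithm = pyhdfe.create(ids, drop_singletons=False, compute_degrees=False)

    def absorb(matrix):
        errors = []
        try:
            matrix = algorithm.residualize(matrix)
        except Exception as exception:
            errors.append(exception)
        return matrix, errors

    return absorb

# two fixed effect dimensions, three variables to demean
ids = np.random.randint(0, 10, size=(1000, 2))
matrix = np.random.standard_normal((1000, 3))
absorb = build_absorb(ids)
residualized, errors = absorb(matrix)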
#del df_kappas, df_firm

# %%
#fe_var_cs = variables[['quarter']]
variables = df_merge_nas[[
    'kappa', 'retail_share', 'lcap', 'marginsq', 'normalized_l2', 'big3',
    'blackrock', 'vanguard', 'statestreet'
]]
fe_var_cs = df_merge_nas[['quarter_fe']]
fe_var_ts = df_merge_nas[['pair']]
fe_var_pa = df_merge_nas[['pair', 'quarter_fe']]

# keep even singletons; deal with them in the regression
tick()
alg_cs = pyhdfe.create(fe_var_cs, drop_singletons=False)
tock()
alg_ts = pyhdfe.create(fe_var_ts, drop_singletons=False)
tock()
alg_pa = pyhdfe.create(fe_var_pa, drop_singletons=False)
tock()

tick()
resid_cs = alg_cs.residualize(variables)
tock()
resid_ts = alg_ts.residualize(variables)
tock()
resid_pa = alg_pa.residualize(variables)
tock()
# %%
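# The tick()/tock() timing helpers are not defined in this snippet; a minimal
# sketch of what they might look like, assuming they simply report wall-clock
# time elapsed since the last tick().
import time

_t0 = None

def tick():
    """Start (or restart) the wall-clock timer."""
    global _t0
    _t0 = time.time()

def tock():
    """Print the seconds elapsed since the last tick()."""
    print(f'elapsed: {time.time() - _t0:.1f}s')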
# million dates
df['pair_fe'] = df.groupby(['from', 'to']).ngroup()
df['quarter_fe'] = df.groupby(['quarter']).ngroup()

# Regressions!
# We will need to absorb: do that first
# This is comically slow and uses 30+GB
var_list = [
    'kappa', 'retail_share', 'lcap', 'marginsq', 'normalized_l2', 'big3',
    'blackrock', 'vanguard', 'statestreet'
]

# Drop any missings
df2 = df[var_list + ['pair_fe', 'quarter_fe']].dropna()
alg_pa = pyhdfe.create(df2[['pair_fe', 'quarter_fe']].values, drop_singletons=False)
resid_pa = alg_pa.residualize(df2[var_list].values)

# Perform Regressions
# no need for fixed effects because we've already residualized everything
# drop rows containing NAs
pd_vars = pd.DataFrame(resid_pa, columns=var_list)

reg1 = smf.ols(formula='kappa ~ retail_share + lcap + marginsq + big3',
               data=pd_vars).fit()
reg2 = smf.ols(
def _first_time_fit(
    self,
    use_cache: bool,
    absorb_options: Optional[Dict[str, Union[bool, str, ArrayLike, None, Dict[str, Any]]]],
    method: str,
) -> None:
    weights = (cast(Float64Array, self.weights.ndarray) if self._is_weighted else None)

    use_hdfe = weights is None and method in ("auto", "hdfe")
    use_hdfe = use_hdfe and not self._absorb_inter.cont.shape[1]
    use_hdfe = use_hdfe and not self._interaction_list
    if not use_hdfe and method == "hdfe":
        raise RuntimeError(
            "HDFE has been set as the method but the model cannot be estimated "
            "using HDFE. HDFE requires that the model is unweighted and that the "
            "absorbed regressors include only fixed effects (dummy variables)."
        )

    areg = AbsorbingRegressor(
        cat=self._absorb_inter.cat,
        cont=self._absorb_inter.cont,
        interactions=self._interaction_list,
        weights=weights,
    )
    areg_constant = areg.has_constant
    self._regressors = areg.regressors
    self._num_params += areg.approx_rank
    # Do not double count intercept-like terms
    self._has_constant = self._has_constant_exog or areg_constant
    self._num_params -= min(self._has_constant_exog, areg_constant)
    self._regressors_hash = areg.hash
    self._constant_absorbed = self._has_constant_exog and areg_constant

    dep = self._dependent.ndarray
    exog = cast(Float64Array, self._exog.ndarray)

    root_w = sqrt(self._weight_data.ndarray)
    dep = root_w * dep
    exog = root_w * exog
    denom = root_w.T @ root_w
    mu_dep = (root_w.T @ dep) / denom
    mu_exog = (root_w.T @ exog) / denom

    absorb_options = {} if absorb_options is None else absorb_options

    assert isinstance(self._regressors, sp.csc_matrix)
    if self._regressors.shape[1] > 0:
        if use_hdfe:
            from pyhdfe import create

            absorb_options["drop_singletons"] = False
            algo = create(self._absorb_inter.cat, **absorb_options)
            dep_exog = column_stack((dep, exog))
            resids = algo.residualize(dep_exog)
            dep_resid = resids[:, :1]
            exog_resid = resids[:, 1:]
        else:
            self._regressors = preconditioner(self._regressors)[0]
            dep_exog = column_stack((dep, exog))
            resid = lsmr_annihilate(
                self._regressors,
                dep_exog,
                use_cache,
                self._regressors_hash,
                **absorb_options,
            )
            dep_resid = resid[:, :1]
            exog_resid = resid[:, 1:]
    else:
        dep_resid = dep
        exog_resid = exog

    if self._constant_absorbed:
        dep_resid += root_w * mu_dep
        exog_resid += root_w * mu_exog

    if not self._drop_absorbed:
        check_absorbed(exog_resid, self.exog.cols, exog)
    else:
        ncol = exog_resid.shape[1]
        retain = not_absorbed(exog_resid)
        if not retain:
            raise ValueError(
                "All columns in exog have been fully absorbed by the "
                "included effects. This model cannot be estimated.")
        elif len(retain) < ncol:
            drop = set(range(ncol)).difference(retain)
            dropped = ", ".join([str(self.exog.cols[i]) for i in drop])
            warnings.warn(
                absorbing_warn_msg.format(absorbed_variables=dropped),
                AbsorbingEffectWarning,
            )
            exog_resid = exog_resid[:, retain]
            self._columns = [self._columns[i] for i in retain]

    self._absorbed_dependent = DataFrame(
        dep_resid,
        index=self._dependent.pandas.index,
        columns=self._dependent.pandas.columns,
    )
    self._absorbed_exog = DataFrame(exog_resid,
                                    index=self._exog.pandas.index,
                                    columns=self._columns)
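# For context, a minimal sketch of the public call that eventually reaches
# _first_time_fit. It assumes a linearmodels version whose AbsorbingLS.fit()
# accepts the method/absorb_options arguments seen above; the toy data are made up.
import numpy as np
import pandas as pd
from linearmodels.iv.absorbing import AbsorbingLS

n = 1000
rng = np.random.default_rng(0)
absorb = pd.DataFrame({
    'firm': pd.Categorical(rng.integers(0, 50, n)),
    'year': pd.Categorical(rng.integers(2000, 2010, n)),
})
exog = pd.DataFrame({'x1': rng.standard_normal(n), 'x2': rng.standard_normal(n)})
dep = exog['x1'] - exog['x2'] + rng.standard_normal(n)

mod = AbsorbingLS(dep, exog, absorb=absorb)
# method='hdfe' forces the pyhdfe branch above; 'auto' picks it when possible
res = mod.fit(method='hdfe')
print(res.summary)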
# ######################################################################### PYHDFE TEST
# for a in list(enumerate(list(df))):
#     print(a)
df_np = df.to_numpy()

# just a sanity check of a straightforward regression
#print("ln_wage ~ hours_log")
#model = sm.OLS(get_np_columns(df, ['ln_wage'], False), get_np_columns(df, ['hours_log']))
#results = model.fit()
#print(results.summary())

algo = pyhdfe.create(get_np_columns(df, ['idcode', 'year'], False),
                     degrees_method='pairwise')
residualized = algo.residualize(get_np_columns(df, ['ln_wage', 'hours_log'], False))
print(algo.degrees)

import pdb; pdb.set_trace()

#model = sm.OLS(residualized[:,0], np.ones((residualized.shape[0], 1)))
model = sm.OLS(residualized[:, 0], add_intercept(residualized[:, 1]))
#print(add_intercept(residualized[:,1])[:10])
ids = get_np_columns(df, ['idcode', 'year'], False)
df_kappas2['irat'] = df_kappas2['kappa'] / df_kappas2['cosine']

# Fixed Effects
df_kappas2['pair_fe'] = df_kappas2.groupby(['from', 'to']).ngroup()
df_kappas2['quarter_fe'] = df_kappas2.groupby(['quarter']).ngroup()

# Report the size of everything
print("N of Overall Dataframe:", len(df_kappas2))
print("N Quarter FE:", len(df_kappas2.quarter_fe.unique()))
print("N Pair FE:", len(df_kappas2.pair_fe.unique()))

# Take the logs of everything and get a NumPy array
variables = np.log(df_kappas2[['kappa', 'cosine', 'irat']]).values

# Use pyhdfe for high-dimensional fixed effects absorption
# This takes 13min on my iMac
resid_cs = pyhdfe.create(df_kappas2[['quarter_fe']].values).residualize(variables)
resid_ts = pyhdfe.create(df_kappas2[['pair_fe']].values).residualize(variables)
resid_pa = pyhdfe.create(df_kappas2[['pair_fe', 'quarter_fe']].values).residualize(variables)

# Do the Variance Decomposition for each case
tab_mat = np.vstack([
    do_decomp(variables),
    do_decomp(resid_cs),
    do_decomp(resid_ts),
    do_decomp(resid_pa)
]) * 100.0

table3 = pd.DataFrame(
    tab_mat,
    index=['Raw', 'Cross-Section', 'Time-Series', 'Panel'],
    columns=['Overlapping Ownership', 'Relative IHHI', 'Covariance'])
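# do_decomp is not defined in this snippet. Given that the three columns are
# log(kappa), log(cosine), and log(irat) = log(kappa) - log(cosine), one consistent
# reading is a decomposition of var(log kappa) into the share from overlapping
# ownership (cosine), the share from relative IHHI (irat), and twice their
# covariance; a sketch under that assumption (the three shares sum to one).
def do_decomp(v):
    """Return the shares of var(log kappa) due to log cosine, log irat, and 2*cov."""
    # v columns: 0 = log kappa, 1 = log cosine, 2 = log irat
    total = np.var(v[:, 0])
    return np.array([
        np.var(v[:, 1]) / total,
        np.var(v[:, 2]) / total,
        2.0 * np.cov(v[:, 1], v[:, 2], ddof=0)[0, 1] / total,
    ])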
def __init__(self,
             df,
             target,
             predictors,
             absorb_ids=[],
             cluster_ids=[],
             drop_singletons=True,
             intercept=False):
    """Regression wrapper for PyHDFE.

    Args:
        df (pandas DataFrame): dataframe containing the referenced data, which
            includes the target, predictors, and absorb and cluster variables.
        target (string): name of the target variable - the y in y = X*b + e.
        predictors (string or list of strings): names of the predictors,
            the X in y = X*b + e.
        absorb_ids (string or list of strings): names of the variables to be
            absorbed as fixed effects.
        cluster_ids (string or list of strings): names of the variables to be
            clustered on.
        drop_singletons (bool): whether to drop singleton groups. Default is
            True, the same as Stata. Setting it to False is equivalent to
            passing keepsingletons to reghdfe.
        intercept (bool): whether to include an intercept when nothing is absorbed.
    """
    self.df = df

    # in case the user has not wrapped singular strings in a list
    if isinstance(predictors, str):
        predictors = [predictors]
    if isinstance(absorb_ids, str):
        absorb_ids = [absorb_ids]
    if isinstance(cluster_ids, str):
        cluster_ids = [cluster_ids]

    self.target = target
    self.predictors = predictors
    self.absorb_ids = absorb_ids
    self.cluster_ids = cluster_ids
    self.drop_singletons = drop_singletons

    # names of all features involved in the regression
    self.all_names = [target] + predictors

    # We construct a formula here to feed into OLS. We do this to make the
    # output prettier and give each coefficient a meaningful name (otherwise
    # the regression coefficients are named x1, x2, etc.).
    self.formula = target + '~' + predictors[0]
    for name in predictors[1:]:
        self.formula = self.formula + '+' + name

    # if there's stuff to be absorbed
    if absorb_ids:
        # The intercept term is redundant with fixed effects.
        self.formula = self.formula + '-1'
        self.algo = pyhdfe.create(ids=get_np_columns(df, absorb_ids),
                                  cluster_ids=get_np_columns(df, cluster_ids),
                                  drop_singletons=drop_singletons,
                                  degrees_method='pairwise')
        # self.data contains features adjusted for fixed effects
        # (i.e. means subtracted, singleton groups dropped, etc.)
        self.data = self.algo.residualize(
            get_np_columns(df, [target] + predictors))
    else:
        # otherwise just get the NumPy columns as-is
        self.data = get_np_columns(df, self.all_names)
        if not intercept:
            self.formula = self.formula + '-1'

    df_data = pd.DataFrame()
    for i, name in enumerate(self.all_names):
        df_data[name] = self.data[:, i]
    y, X = dmatrices(self.formula, data=df_data, return_type='dataframe')
    self.model = sm.OLS(y, X)

    # Now we prepare the cluster groups if they exist. We do this here rather
    # than later in fit() since it can be convenient to have access to these
    # groups as early as possible.
    # if not empty
    if bool(self.cluster_ids):
        # numpy array of group data
        self.groups_np = get_np_columns(self.df, self.cluster_ids)
        if bool(self.absorb_ids):
            # number of groups - already calculated by pyhdfe so we're just retrieving the value
            n_groups = self.algo._groups_list[0].group_count
            # remove singleton groups from the cluster data (if fixed effects were used)
            self.groups_np = self.groups_np[~self.algo._singleton_indices]
        # clustered degrees of freedom are based on the smallest number of clusters
        min_cluster_count = np.unique(self.groups_np[:, 0]).shape[0]
        for i in range(1, self.groups_np.shape[1]):
            current_count = np.unique(self.groups_np[:, i]).shape[0]
            if current_count < min_cluster_count:
                min_cluster_count = current_count
        self.min_cluster_count = min_cluster_count
        self.model.df_resid = min_cluster_count - len(self.predictors) + 1
        self.model._df_resid = self.model.df_resid
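# A minimal usage sketch. The class name FixedEffectModel is hypothetical (only
# __init__ is shown above), the column names mirror the NLSW example from the
# other snippets, and since the wrapper's own fit() is not shown here, the sketch
# calls the underlying statsmodels model directly with clustered standard errors.
model = FixedEffectModel(df,
                         target='ln_wage',
                         predictors=['hours_log'],
                         absorb_ids=['idcode', 'year'],
                         cluster_ids=['idcode'])
results = model.model.fit(cov_type='cluster',
                          cov_kwds={'groups': model.groups_np[:, 0]})
print(results.summary())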
##print(res.summary)

# ######################################################################### PYHDFE TEST
for a in list(enumerate(list(df))):
    print(a)
df_np = df.to_numpy()

# just a sanity check of a straightforward regression
#print("ln_wage ~ hours_log")
#model = sm.OLS(get_np_columns(df, ['ln_wage'], False), get_np_columns(df, ['hours_log']))
#results = model.fit()
#print(results.summary())

algo = pyhdfe.create(get_np_columns(df, ['idcode', 'year'], False))
residualized = algo.residualize(
    get_np_columns(df, ['ln_wage', 'hours_log'], False))

#model = sm.OLS(residualized[:,0], add_intercept(residualized[:,1]))

# ln_wage ~ hours_log, absorb(year)
#                             OLS Regression Results
# ==============================================================================
# Dep. Variable:                      y   R-squared:                       0.005
# Model:                            OLS   Adj. R-squared:                  0.005
# Method:                 Least Squares   F-statistic:                     69.95
# Date:                Sat, 05 Dec 2020   Prob (F-statistic):           6.67e-17
# Time:                        13:04:23   Log-Likelihood:                -8305.1
# No. Observations:               13452   AIC:                         1.661e+04
# Df Residuals:                   13450   BIC:                         1.663e+04
# Df Model:                           1
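# The OLS call itself is commented out above; a sketch of how the residualized
# regression might be completed, mirroring the df_resid adjustment used in the
# wrapper snippets (algo.degrees is available here because compute_degrees was
# left at its default).
import statsmodels.api as sm

# regress residualized ln_wage on residualized hours_log; the intercept is
# absorbed by the fixed effects, so none is added
model = sm.OLS(residualized[:, 0], residualized[:, 1])
# correct the residual degrees of freedom for the absorbed fixed effects
model.df_resid = residualized.shape[0] - 1 - algo.degrees
print(model.fit().summary())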
import pandas as pd
import pyhdfe
from utils import add_intercept, get_np_columns
from sklearn.datasets import load_boston

# details about the dataset can be found at https://www.kaggle.com/crawford/80-cereals
df = pd.read_csv('/home/abom/Downloads/dataset_cereal/cereal.csv')
print(list(df))

#results = smf.ols(formula='rating ~ fat + protein + carbo + sugars', data=df).fit()
#print(results.summary())
print(get_np_columns(df, ['cups'], False)[:10])

algo = pyhdfe.create(get_np_columns(df, ['shelf'], False))

#                             OLS Regression Results
# ==============================================================================
# Dep. Variable:                      y   R-squared:                       0.759
# Model:                            OLS   Adj. R-squared:                  0.745
# Method:                 Least Squares   F-statistic:                     56.55
# Date:                Mon, 07 Dec 2020   Prob (F-statistic):           1.71e-21
# Time:                        09:15:25   Log-Likelihood:                -252.82
# No. Observations:                  77   AIC:                             515.6
# Df Residuals:                      72   BIC:                             527.4
# Df Model:                           4
# Covariance Type:            nonrobust
# ==============================================================================
#                  coef    std err          t      P>|t|      [0.025      0.975]
# ------------------------------------------------------------------------------
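# The snippet stops right after creating the algorithm. A sketch of the likely
# continuation, assuming the commented-out formula (rating on fat, protein,
# carbo, and sugars) is the regression being reproduced, following the same
# residualize-then-OLS pattern as the other snippets.
import statsmodels.api as sm

residualized = algo.residualize(
    get_np_columns(df, ['rating', 'fat', 'protein', 'carbo', 'sugars'], False))
# no intercept: the shelf fixed effects absorb it
model = sm.OLS(residualized[:, 0], residualized[:, 1:])
# adjust the residual degrees of freedom for the absorbed shelf effects
model.df_resid = residualized.shape[0] - 4 - algo.degrees
print(model.fit().summary())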