Example #1
 def _build_absorb(self, ids: Array) -> 'Absorb':
     """Build a function used to absorb fixed effects defined by columns of IDs."""
     import pyhdfe
     return Absorb(
         pyhdfe.create(ids,
                       drop_singletons=False,
                       compute_degrees=False,
                       residualize_method=self._absorb_method,
                       options=self._absorb_options))
Example #2
    def __init__(self,
                 df,
                 target,
                 predictors,
                 ids,
                 cluster_ids=[],
                 drop_singletons=True):
        """
        Args:
            target (string): name of target variable
            predictors (list of strings): names of predictors
            ids (list of strings): names of variables to be absorbed
            df (pandas Dataframe): dataframe containing referenced data
                                    which includes target, predictors and ids
        """
        self.df = df
        self.algo = pyhdfe.create(ids=get_np_columns(df, ids),
                                  cluster_ids=get_np_columns(df, cluster_ids),
                                  drop_singletons=drop_singletons,
                                  degrees_method='pairwise')
        self.all_names = [target] + predictors
        self.residualized = self.algo.residualize(
            get_np_columns(df, [target] + predictors + cluster_ids))
        self.formula = target + '~' + predictors[0]
        for name in predictors[1:]:
            self.formula = self.formula + '+' + name
        self.formula = self.formula + '-1'
        df_residualized = pd.DataFrame()
        for i, name in enumerate(self.all_names):
            df_residualized[name] = self.residualized[:, i]

        y, X = dmatrices(self.formula,
                         data=df_residualized,
                         return_type='dataframe')
        self.model = sm.OLS(y, X)
        if cluster_ids == []:
            self.model.df_resid = self.residualized.shape[0] - len(
                predictors) - self.algo.degrees
        else:
            clusters = get_np_columns(
                df, cluster_ids)[~self.algo._singleton_indices]
            min_cluster_count = np.unique(clusters[:, 0]).shape[0]
            for i in range(1, clusters.shape[1]):
                current_count = np.unique(clusters[:, i]).shape[0]
                if current_count < min_cluster_count:
                    min_cluster_count = current_count

            self.model.df_resid = min_cluster_count - len(predictors)
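A minimal usage sketch for the wrapper above (the class name AbsorbingRegression is hypothetical, since the snippet only shows __init__; the column names follow the NLSW data used in the later examples, and df is assumed to be the corresponding pandas DataFrame):

reg = AbsorbingRegression(df,
                          target='ln_wage',
                          predictors=['hours_log'],
                          ids=['idcode', 'year'])
results = reg.model.fit()
print(results.summary())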
Example #3
    def _build_absorb(
            self, ids: Array) -> Callable[[Array], Tuple[Array, List[Error]]]:
        """Build a function used to absorb fixed effects defined by columns of IDs."""
        import pyhdfe

        # initialize the algorithm for repeated absorption
        algorithm = pyhdfe.create(ids,
                                  drop_singletons=False,
                                  compute_degrees=False,
                                  residualize_method=self._absorb_method,
                                  options=self._absorb_options)

        def absorb(matrix: Array) -> Tuple[Array, List[Error]]:
            """Handle any absorption errors."""
            errors: List[Error] = []
            try:
                matrix = algorithm.residualize(matrix)
            except Exception as exception:
                errors.append(exceptions.AbsorptionError(exception))
            return matrix, errors

        return absorb
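Within the owning class, the returned closure would later be applied to whatever matrices need the fixed effects removed; a rough sketch of the calling pattern (ids and matrix are placeholder arrays):

absorb = self._build_absorb(ids)
residualized, errors = absorb(matrix)  # errors is a (possibly empty) list
if errors:
    # handle or propagate the collected absorption errors here
    ...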
Example #4
#del df_kappas, df_firm

# %%
#fe_var_cs = variables[['quarter']]
variables = df_merge_nas[[
    'kappa', 'retail_share', 'lcap', 'marginsq', 'normalized_l2', 'big3',
    'blackrock', 'vanguard', 'statestreet'
]]

fe_var_cs = df_merge_nas[['quarter_fe']]
fe_var_ts = df_merge_nas[['pair']]
fe_var_pa = df_merge_nas[['pair', 'quarter_fe']]
# keep even singletons; deal with them in the regression

tick()
alg_cs = pyhdfe.create(fe_var_cs, drop_singletons=False)
tock()
alg_ts = pyhdfe.create(fe_var_ts, drop_singletons=False)
tock()
alg_pa = pyhdfe.create(fe_var_pa, drop_singletons=False)
tock()

tick()
resid_cs = alg_cs.residualize(variables)
tock()
resid_ts = alg_ts.residualize(variables)
tock()
resid_pa = alg_pa.residualize(variables)
tock()

# %%
Example #5
# million dates
df['pair_fe'] = df.groupby(['from', 'to']).ngroup()
df['quarter_fe'] = df.groupby(['quarter']).ngroup()

# Regressions!
# We will need to absorb: do that first
# This is comically slow and uses 30+GB
var_list = [
    'kappa', 'retail_share', 'lcap', 'marginsq', 'normalized_l2', 'big3',
    'blackrock', 'vanguard', 'statestreet'
]

# Drop any missings
df2 = df[var_list + ['pair_fe', 'quarter_fe']].dropna()

alg_pa = pyhdfe.create(df2[['pair_fe', 'quarter_fe']].values,
                       drop_singletons=False)
resid_pa = alg_pa.residualize(df2[var_list].values)

# Perform regressions
# No need for fixed effects here: everything has already been residualized,
# and rows containing NAs were dropped above before absorbing
pd_vars = pd.DataFrame(resid_pa, columns=var_list)

reg1 = smf.ols(formula='kappa ~ retail_share + lcap + marginsq + big3',
               data=pd_vars).fit()
reg2 = smf.ols(
Example #6
    def _first_time_fit(
        self,
        use_cache: bool,
        absorb_options: Optional[Dict[str, Union[bool, str, ArrayLike, None,
                                                 Dict[str, Any]]]],
        method: str,
    ) -> None:
        weights = (cast(Float64Array, self.weights.ndarray)
                   if self._is_weighted else None)

        use_hdfe = weights is None and method in ("auto", "hdfe")
        use_hdfe = use_hdfe and not self._absorb_inter.cont.shape[1]
        use_hdfe = use_hdfe and not self._interaction_list

        if not use_hdfe and method == "hdfe":
            raise RuntimeError(
                "HDFE has been set as the method but the model cannot be estimated "
                "using HDFE. HDFE requires that the model is unweighted and that the "
                "absorbed regressors include only fixed effects (dummy variables)."
            )
        areg = AbsorbingRegressor(
            cat=self._absorb_inter.cat,
            cont=self._absorb_inter.cont,
            interactions=self._interaction_list,
            weights=weights,
        )
        areg_constant = areg.has_constant
        self._regressors = areg.regressors
        self._num_params += areg.approx_rank
        # Do not double count intercept-like terms
        self._has_constant = self._has_constant_exog or areg_constant
        self._num_params -= min(self._has_constant_exog, areg_constant)
        self._regressors_hash = areg.hash
        self._constant_absorbed = self._has_constant_exog and areg_constant

        dep = self._dependent.ndarray
        exog = cast(Float64Array, self._exog.ndarray)

        root_w = sqrt(self._weight_data.ndarray)
        dep = root_w * dep
        exog = root_w * exog
        denom = root_w.T @ root_w
        mu_dep = (root_w.T @ dep) / denom
        mu_exog = (root_w.T @ exog) / denom

        absorb_options = {} if absorb_options is None else absorb_options
        assert isinstance(self._regressors, sp.csc_matrix)
        if self._regressors.shape[1] > 0:
            if use_hdfe:
                from pyhdfe import create

                absorb_options["drop_singletons"] = False
                algo = create(self._absorb_inter.cat, **absorb_options)
                dep_exog = column_stack((dep, exog))
                resids = algo.residualize(dep_exog)
                dep_resid = resids[:, :1]
                exog_resid = resids[:, 1:]
            else:
                self._regressors = preconditioner(self._regressors)[0]
                dep_exog = column_stack((dep, exog))
                resid = lsmr_annihilate(
                    self._regressors,
                    dep_exog,
                    use_cache,
                    self._regressors_hash,
                    **absorb_options,
                )
                dep_resid = resid[:, :1]
                exog_resid = resid[:, 1:]
        else:
            dep_resid = dep
            exog_resid = exog

        if self._constant_absorbed:
            dep_resid += root_w * mu_dep
            exog_resid += root_w * mu_exog

        if not self._drop_absorbed:
            check_absorbed(exog_resid, self.exog.cols, exog)
        else:
            ncol = exog_resid.shape[1]
            retain = not_absorbed(exog_resid)
            if not retain:
                raise ValueError(
                    "All columns in exog have been fully absorbed by the "
                    "included effects. This model cannot be estimated.")
            elif len(retain) < ncol:
                drop = set(range(ncol)).difference(retain)
                dropped = ", ".join([str(self.exog.cols[i]) for i in drop])
                warnings.warn(
                    absorbing_warn_msg.format(absorbed_variables=dropped),
                    AbsorbingEffectWarning,
                )

            exog_resid = exog_resid[:, retain]
            self._columns = [self._columns[i] for i in retain]

        self._absorbed_dependent = DataFrame(
            dep_resid,
            index=self._dependent.pandas.index,
            columns=self._dependent.pandas.columns,
        )
        self._absorbed_exog = DataFrame(exog_resid,
                                        index=self._exog.pandas.index,
                                        columns=self._columns)
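For context, the private _first_time_fit above is reached through linearmodels' public AbsorbingLS estimator. A hedged sketch of that route (the toy data and names are assumptions; method and absorb_options are the arguments the code above receives, with absorb_options forwarded to pyhdfe.create on the HDFE branch; check the linearmodels docs for the exact signature):

import numpy as np
import pandas as pd
from linearmodels.iv import AbsorbingLS

rng = np.random.default_rng(0)
n = 1000
# two purely categorical fixed effects, so the HDFE branch is allowed
absorb = pd.DataFrame({
    'firm': pd.Categorical(rng.integers(0, 50, n)),
    'year': pd.Categorical(rng.integers(0, 10, n)),
})
exog = pd.DataFrame(rng.standard_normal((n, 2)), columns=['x1', 'x2'])
dep = (exog['x1'] - 0.5 * exog['x2'] + rng.standard_normal(n)).rename('y')

mod = AbsorbingLS(dep, exog, absorb=absorb)
# method='hdfe' forces the pyhdfe path shown above; degrees_method is a
# pyhdfe.create option passed through via absorb_options
res = mod.fit(method='hdfe', absorb_options={'degrees_method': 'pairwise'})
print(res.summary)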
Example #7
#
######################################################################### PYHDFE TEST

# for a in list(enumerate(list(df))):
#     print(a)

df_np = df.to_numpy()

# just a sanity check of a straightforward regression
#print("ln_wage ~ hours_log")
#model = sm.OLS(get_np_columns(df, ['ln_wage'], False), get_np_columns(df, ['hours_log']))
#results = model.fit()
#print(results.summary())


algo = pyhdfe.create(get_np_columns(df, ['idcode', 'year'], False),
                     degrees_method='pairwise')
residualized = algo.residualize(get_np_columns(df, ['ln_wage', 'hours_log'], False))

print(algo.degrees)

import pdb; pdb.set_trace()



#model = sm.OLS(residualized[:,0], np.ones((residualized.shape[0], 1)))
model = sm.OLS(residualized[:,0], add_intercept(residualized[:, 1]))

#print(add_intercept(residualized[:,1])[:10])

ids = get_np_columns(df, ['idcode', 'year'], False)
Example #8
df_kappas2['irat'] = df_kappas2['kappa'] / df_kappas2['cosine']
# Fixed Effects
df_kappas2['pair_fe'] = df_kappas2.groupby(['from', 'to']).ngroup()
df_kappas2['quarter_fe'] = df_kappas2.groupby(['quarter']).ngroup()

# Report the size of everything
print("N of Overall Dataframe:", len(df_kappas2))
print("N Quarter FE:", len(df_kappas2.quarter_fe.unique()))
print("N Pair FE:", len(df_kappas2.pair_fe.unique()))

# Take the logs of everything and get a NumPy array
variables = np.log(df_kappas2[['kappa', 'cosine', 'irat']]).values

# Use pyhdfe for high-dimensional fixed effects absorption
# This takes 13min on my iMac
resid_cs = pyhdfe.create(df_kappas2[['quarter_fe']].values).residualize(variables)
resid_ts = pyhdfe.create(df_kappas2[['pair_fe']].values).residualize(variables)
resid_pa = pyhdfe.create(df_kappas2[['pair_fe', 'quarter_fe']].values).residualize(variables)

# Do the Variance Decomposition for each case
tab_mat = np.vstack([
    do_decomp(variables),
    do_decomp(resid_cs),
    do_decomp(resid_ts),
    do_decomp(resid_pa)
]) * 100.0
table3 = pd.DataFrame(
    tab_mat,
    index=['Raw', 'Cross-Section', 'Time-Series', 'Panel'],
    columns=['Overlapping Ownership', 'Relative IHHI', 'Covariance'])
Example #9
    def __init__(self,
                 df,
                 target,
                 predictors,
                 absorb_ids=[],
                 cluster_ids=[],
                 drop_singletons=True,
                 intercept=False):
        """Regression wrapper for PyHDFE.

        Args:
            df (pandas Dataframe): dataframe containing referenced data
                    which includes target, predictors and absorb and cluster.
            target (string): name of target variable - the y in y = X*b + e.
            predictors (string or list of strings): names of predictors, the X in y = X*b + e.
            absorb_ids (string or list of strings): names of variables to be absorbed for fixed effects.
            cluster_ids (string or list of strings): names of variables to be clustered on.
            drop_singletons (bool): whether to drop singleton groups. Default is True, the same as Stata's reghdfe; setting it to False is equivalent to passing keepsingletons to reghdfe.
            intercept (bool): whether to include an intercept term when nothing is absorbed. Defaults to False.
        """
        self.df = df
        # in case the user has not wrapped single strings in a list
        if isinstance(predictors, str):
            predictors = [predictors]
        if isinstance(absorb_ids, str):
            absorb_ids = [absorb_ids]
        if isinstance(cluster_ids, str):
            cluster_ids = [cluster_ids]

        self.target = target
        self.predictors = predictors
        self.absorb_ids = absorb_ids
        self.cluster_ids = cluster_ids
        self.drop_singletons = drop_singletons
        # names of all features involved in regression
        self.all_names = [target] + predictors

        # We construct a formula here to feed it into OLS
        # We do this to make the output prettier and give each coefficient
        # meaningful names (otherwise the regression coefficients are named x1, x2 etc.)
        self.formula = target + '~' + predictors[0]
        for name in predictors[1:]:
            self.formula = self.formula + '+' + name

        # if there's stuff to be absorbed
        if absorb_ids:
            # Intercept term is redundant in fixed effects
            self.formula = self.formula + '-1'
            self.algo = pyhdfe.create(ids=get_np_columns(df, absorb_ids),
                                      cluster_ids=get_np_columns(
                                          df, cluster_ids),
                                      drop_singletons=drop_singletons,
                                      degrees_method='pairwise')
            # self.residualized contains features adjusted for fixed effects
            # (i.e. means subtracted, singleton groups dropped etc.)
            self.data = self.algo.residualize(
                get_np_columns(df, [target] + predictors))
        else:
            # otherwise just get np columns as is
            self.data = get_np_columns(df, self.all_names)
            if not intercept:
                self.formula = self.formula + '-1'

        df_data = pd.DataFrame()
        for i, name in enumerate(self.all_names):
            df_data[name] = self.data[:, i]
        y, X = dmatrices(self.formula, data=df_data, return_type='dataframe')
        # Prepare the cluster groups now (if any) rather than later in fit(),
        # since it can be convenient to have access to these groups as early as possible

        self.model = sm.OLS(y, X)
        if bool(self.cluster_ids):
            # numpy array of group data
            self.groups_np = get_np_columns(self.df, self.cluster_ids)
            if bool(self.absorb_ids):
                # number of groups - already calculated by pyhdfe, so we just retrieve the value
                n_groups = self.algo._groups_list[0].group_count
                # get numpy representation of cluster groups
                # and remove singleton groups (if fixed effects were used)
                self.groups_np = self.groups_np[~self.algo._singleton_indices]
                min_cluster_count = np.unique(self.groups_np[:, 0]).shape[0]
                for i in range(1, self.groups_np.shape[1]):
                    current_count = np.unique(self.groups_np[:, i]).shape[0]
                    if current_count < min_cluster_count:
                        min_cluster_count = current_count

                self.min_cluster_count = min_cluster_count
                self.model.df_resid = min_cluster_count - len(
                    self.predictors) + 1
                self.model._df_resid = self.model.df_resid
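A usage sketch with clustering for the wrapper above (the class name RegressionHDFE is hypothetical, and the column names again follow the NLSW example on this page); because groups_np is already trimmed to match the singleton-dropped data, it can be passed straight to statsmodels' cluster-robust covariance:

reg = RegressionHDFE(df,
                     target='ln_wage',
                     predictors='hours_log',  # single strings are wrapped into lists automatically
                     absorb_ids=['idcode', 'year'],
                     cluster_ids='idcode')
results = reg.model.fit(cov_type='cluster',
                        cov_kwds={'groups': reg.groups_np[:, 0]})
print(results.summary())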
Example #10
##print(res.summary)
#
######################################################################### PYHDFE TEST

for a in list(enumerate(list(df))):
    print(a)

df_np = df.to_numpy()

# just a sanity check of a straightforward regression
#print("ln_wage ~ hours_log")
#model = sm.OLS(get_np_columns(df, ['ln_wage'], False), get_np_columns(df, ['hours_log']))
#results = model.fit()
#print(results.summary())

algo = pyhdfe.create(get_np_columns(df, ['idcode', 'year'], False))
residualized = algo.residualize(
    get_np_columns(df, ['ln_wage', 'hours_log'], False))

#model = sm.OLS(residualized[:,0], add_intercept(residualized[:,1]))
#ln_wage ~ hours_log, absorb(year)
#                            OLS Regression Results
#==============================================================================
#Dep. Variable:                      y   R-squared:                       0.005
#Model:                            OLS   Adj. R-squared:                  0.005
#Method:                 Least Squares   F-statistic:                     69.95
#Date:                Sat, 05 Dec 2020   Prob (F-statistic):           6.67e-17
#Time:                        13:04:23   Log-Likelihood:                -8305.1
#No. Observations:               13452   AIC:                         1.661e+04
#Df Residuals:                   13450   BIC:                         1.663e+04
#Df Model:                           1
Example #11
import pandas as pd
import pyhdfe
from utils import add_intercept, get_np_columns

from sklearn.datasets import load_boston

# details about dataset can be found at https://www.kaggle.com/crawford/80-cereals
df = pd.read_csv('/home/abom/Downloads/dataset_cereal/cereal.csv')

print(list(df))

#results = smf.ols(formula='rating ~ fat + protein + carbo + sugars', data=df).fit()
#print(results.summary())

print(get_np_columns(df, ['cups'], False)[:10])

algo = pyhdfe.create(get_np_columns(df, ['shelf'], False))

#                             OLS Regression Results
# ==============================================================================
# Dep. Variable:                      y   R-squared:                       0.759
# Model:                            OLS   Adj. R-squared:                  0.745
# Method:                 Least Squares   F-statistic:                     56.55
# Date:                Mon, 07 Dec 2020   Prob (F-statistic):           1.71e-21
# Time:                        09:15:25   Log-Likelihood:                -252.82
# No. Observations:                  77   AIC:                             515.6
# Df Residuals:                      72   BIC:                             527.4
# Df Model:                           4
# Covariance Type:            nonrobust
# ==============================================================================
#                  coef    std err          t      P>|t|      [0.025      0.975]
# ------------------------------------------------------------------------------