def hausman_fe_re(panel_data, inef_formula, weights=None, cov="unadjusted", level=0.05):
    """
    Run a Hausman test comparing a fixed effects and a random effects fit and print the verdict.

    H0: there is no correlation between the unobserved effects and the independent variables.
    The same formula is estimated twice: once with ``RandomEffects`` and once with ``PanelOLS``
    after appending ``EntityEffects``. The statistic is ``d' inv(V_fe - V_re) d`` where ``d`` is
    the difference between the two coefficient vectors.
    It is not necessary to assign the function to an object! But remember to include an intercept
    in the formula, because the degrees of freedom are taken as the number of compared
    coefficients minus one.

    :param panel_data : dataframe (which must be in a panel structure)
    :param inef_formula : formula of the model WITHOUT ``EntityEffects`` (it is appended here to
        build the fixed effects specification)
    :param weights : N x 1 Series or vector containing weights to be used in estimation; defaults to None
        Use is recommended when analyzing survey data, passing on the weight available in the survey
    :param cov : str, forwarded as ``cov_type`` to both fits
        unadjusted: common standard errors
        robust: robust standard errors
        kernel: robust to heteroskedacity AND serial autocorrelation
    :param level : significance level for the test. Defaults to 5%.
    :raises ValueError : when the two fits share too few coefficients to leave at least one
        degree of freedom
    :return : None (the result is printed)
    """
    # Forward ``weights`` only when supplied, so both estimators keep their own default otherwise.
    extra = {} if weights is None else {"weights": weights}

    ## Random Effects
    random = RandomEffects.from_formula(formula=inef_formula, data=panel_data, **extra).fit(cov_type=cov)

    ## Fixed Effects
    formula_fe = inef_formula + ' + EntityEffects'
    fixed = PanelOLS.from_formula(formula=formula_fe, data=panel_data,
                                  drop_absorbed=True, **extra).fit(cov_type=cov)

    ## Computing the Hausman statistic
    # drop_absorbed=True may remove regressors from the fixed effects fit. Subtracting the raw
    # pandas objects would then align on the union of labels and fill NaN, which makes the
    # statistic NaN and the test silently report "not rejected". Compare shared coefficients only.
    shared = fixed.params.index.intersection(random.params.index)

    # Degrees of freedom (one is subtracted for the intercept, as in the documented contract)
    freedom = len(shared) - 1
    if freedom < 1:
        raise ValueError(
            "The fixed and random effects fits share too few coefficients to run a Hausman test."
        )

    # Difference between asymptotic variances
    var_assin = (fixed.cov.loc[shared, shared] - random.cov.loc[shared, shared]).to_numpy()
    # Difference between parameters
    d = (fixed.params.loc[shared] - random.params.loc[shared]).to_numpy()
    # Calculating H (statistic)
    H = d @ np.linalg.inv(var_assin) @ d

    # Calculating p-value using chi2 survival function (sf, 1 - cumulative distribution function)
    p = stats.chi2(freedom).sf(H)

    print(f"The value of H is {round(H, 6)} with {freedom} degrees of freedom in the chi-squared distribution.")
    if p < level:
        print(f"The p-value of the test is {round(p, 6)} and, therefore, H0 is REJECTED and fixed effects is preferred")
    else:
        print(f"The p-value of the test is {round(p, 6)} and H0 is NOT REJECTED and random effects is preferred.")
def fixed_effects(panel_data, formula, weights=None, time_effects=False, cov="unadjusted"):
    """
    Estimate a fixed effects panel regression, print its summary and return the fitted result.

    ``EntityEffects`` (and, optionally, ``TimeEffects``) are appended to the supplied formula,
    and regressors absorbed by the effects are dropped (``drop_absorbed=True``).
    Remember to include an intercept in the formula ('y ~ 1 + x1 + ...') and to assign the
    return value to an object!

    :param panel_data : dataframe (which must be in a panel structure)
    :param formula : patsy/R formula (without EntityEffects, will be added inside the function)
    :param weights : N x 1 Series or vector containing weights to be used in estimation; defaults to None
        Use is recommended when analyzing survey data, passing on the weight available in the survey
    :param time_effects : bool, defaults to False
        Whether to include time effects alongside entity effects (two-way fixed effects)
    :param cov : str
        unadjusted: common standard errors
        robust: robust standard errors
        kernel: robust to heteroskedacity AND serial autocorrelation
        clustered: clustered standard errors by the entity column
    :return : fitted linearmodels result (the object returned by ``PanelOLS.fit``)
    """
    # Decide which effects are appended to the user formula
    if time_effects:
        effects_suffix = ' + EntityEffects + TimeEffects'
    else:
        effects_suffix = ' + EntityEffects'
    full_formula = formula + effects_suffix

    # Weights are passed on only when the caller supplied them
    optional = {} if weights is None else {'weights': weights}
    estimator = PanelOLS.from_formula(formula=full_formula, data=panel_data,
                                      drop_absorbed=True, **optional)

    # 'clustered' additionally needs the entity-cluster switch; every other value goes through as is
    if cov == 'clustered':
        fitted = estimator.fit(cov_type='clustered', cluster_entity=True)
    else:
        fitted = estimator.fit(cov_type=cov)

    print(fitted.summary)
    return fitted
Beispiel #3
0
#%%
import numpy as np
from statsmodels.datasets import grunfeld

# Load the Grunfeld data, store the year as int64 and index the frame as
# MultiIndex, entity - time
data = (
    grunfeld.load_pandas()
    .data.astype({"year": np.int64})
    .set_index(["firm", "year"])
)
from linearmodels import PanelOLS

# Array interface: dependent variable, regressors, entity effects switched on
mod = PanelOLS(data["invest"], data[["value", "capital"]], entity_effects=True)
res = mod.fit(cov_type="clustered", cluster_entity=True)

#%%
from linearmodels import PanelOLS

# Same specification written with the formula interface
mod = PanelOLS.from_formula("invest ~ value + capital + EntityEffects", data)
res = mod.fit(cov_type="clustered", cluster_entity=True)

#%%
# Set vars
## Y
y = 'log_min_distance'

## X
x_list = [
    'ls_num', 'lti', 'ln_loanamout', 'ln_appincome', 'subprime', 'secured',
    'cb', 'ln_ta', 'ln_emp', 'num_branch', 'ln_pop', 'density', 'hhi', 'ln_mfi',
    'mean_distance',
]
x = ' + '.join(x_list)

#------------------------------------------------------------
# Run regressions
#------------------------------------------------------------
# NOTE(review): the formula below contains no EntityEffects/TimeEffects term;
# presumably the dummies named in the comments are handled when df_msat /
# df_msabank are built -- confirm.

# Run Bank + msat dummies
res_msat = PanelOLS.from_formula(f'{y} ~ {x}', data=df_msat).fit(
    cov_type='clustered', cluster_entity=True)

## Save output to txt
# A context manager closes the file even if building or writing the summary raises
with open("Results/Results_baseline_msat.txt", "w") as text_file:
    text_file.write(res_msat.summary.as_text())

# Run Bankmsa + t dummies
res_msabank = PanelOLS.from_formula(f'{y} ~ {x}', data=df_msabank).fit(
    cov_type='clustered', cluster_entity=True)

## Save output to txt
with open("Results/Results_baseline_msabank.txt", "w") as text_file:
    text_file.write(res_msabank.summary.as_text())

# Run Bank + t dummies
# Build the two 0/1 indicators used by the Difference in Difference models:
# PostConflict flags years from 2014 on, Treated flags intensity above 1
conflict["PostConflict"] = conflict["Year"].apply(lambda year: int(year >= 2014))
conflict["Treated"] = conflict["intensity"].apply(lambda level: int(level > 1))

# Baseline Difference in Difference (treatment, period and their interaction)
BaseModel = smf.ols(formula="Pop_percent_change ~ Treated * PostConflict ", data=conflict).fit()
print(BaseModel.summary())

# Same model with the confounding factors added as controls
CFModel = smf.ols(
    formula="Pop_percent_change ~ Treated * PostConflict + Hospitals + Population_Percent_Child + Population_Percent_Female + Poverty_Rate + Airport",
    data=conflict,
).fit()
print(CFModel.summary())

# Same model with one categorical term per county
CountyModel = smf.ols(
    formula="Pop_percent_change ~ C(County) + Treated * PostConflict", data=conflict
).fit()
print(CountyModel.summary())

# Panel OLS: index by (County, Year) and let EntityEffects absorb the county level
conflict = conflict.set_index(["County", "Year"])
PanelModel = PanelOLS.from_formula(
    formula="Pop_percent_change ~ Treated * PostConflict + EntityEffects",
    data=conflict,
    drop_absorbed=True,
)
# The fit result is not stored here; presumably it is displayed in an interactive session
PanelModel.fit(cov_type="clustered", cluster_entity=True)
Beispiel #6
0
def run_regression(df):
    """Fit ``chips_sold`` on ``post_tv`` with entity and time effects and return the fit.

    ``df`` must contain ``county_id`` and ``year`` columns; they become the
    (entity, time) index before the model is built.
    """
    indexed = df.set_index(['county_id', 'year'])
    spec = 'chips_sold ~ 1 + post_tv + EntityEffects + TimeEffects'
    return PanelOLS.from_formula(spec, data=indexed).fit()
# BETWEEN ESTIMATOR
BetweenModel = BetweenOLS.from_formula('fcs ~ rev_percap + month_Decembre',
                                       data=data,
                                       weights=w)
# The fit result is not stored; presumably it is displayed in an interactive session
BetweenModel.fit(cov_type='robust', reweight=True)

# RANDOM EFFECTS
RandomEffectsModel = RandomEffects.from_formula(
    'fcs ~ rev_percap + year + month_Decembre', data=data, weights=w)
REModFit = RandomEffectsModel.fit(cov_type='robust')
# Bare expressions below only show output in an interactive session
REModFit
REModFit.variance_decomposition
REModFit.theta

# BASIC PANEL
PanelModel = PanelOLS.from_formula(
    'fcs ~ 1 + rev_percap + month_Decembre + EntityEffects',
    data=data,
    weights=w)
PanelModel.fit(cov_type='robust')

# INTERPRETATION : TO BE FULLY CHECKED
# an increase of 1000 in income relative to its mean over the period
# raises the fcs score by X relative to its mean over the period

#
# ESTIMATION EXCLUDING DECEMBER
#

# Keep only the 'Juin' rows and bring the index levels back as columns
datajun = data[data['month'].isin(['Juin'])].reset_index(drop=False)
datajun = datajun.drop(columns=['time'])
time_df = datajun[['year', 'month']].drop_duplicates()
# One multi-key sort (year ascending, month descending within a year) instead of
# two chained single-key sorts: the default sort kind is not guaranteed stable,
# so the second sort could discard the ordering produced by the first
time_df = time_df.sort_values(['year', 'month'], ascending=[True, False])
Beispiel #8
0
def panel_data(train, years_ahead=1):
    """
    Impute missing predictors with MissForest, then fit one entity-effects panel model per target.

    The wide ``train`` table is melted to long form and pivoted into a
    (Country Name, year) x Series Code panel. Series from column position 9 on
    are treated as predictors and imputed; the first nine series are renamed
    ``y1``..``y9`` and each is regressed with ``PanelOLS`` on the rows with
    ``year < 2007`` and predicted for ``year == 2007``. The RMSE between the
    predictions and the observed 2007 targets is printed.

    NOTE(review): the renaming and reshaping hard-code 9 targets, 1296
    predictor series and 214 countries; other input sizes will presumably
    fail or misalign -- confirm against the caller's data.

    Args:
      train: wide DataFrame containing 'Country Name' and 'Series Code' within
        its first 38 columns; its first 36 columns are melted as yearly
        values (presumably one column per year whose label starts with the
        4-digit year -- TODO confirm).
      years_ahead: number of rows the imputed predictor table is shifted by
        before it is merged with the targets.
    Returns:
      Ypred: DataFrame with columns ``y1``..``y9`` holding the predictions for
        the 2007 rows.
    """
    # Wide -> long: one row per (country, series, year column)
    train_melt = pd.melt(train.iloc[:, 0:38],
                         id_vars=['Country Name', 'Series Code'],
                         value_vars=train.columns[0:36],
                         var_name='year',
                         value_name='value')
    # The year is the first four characters of the original column label
    train_melt['year'] = train_melt['year'].str[:4].astype(int)
    # Long -> panel: rows are (country, year), columns are series codes
    panel = train_melt.groupby(['Country Name', 'year',
                                'Series Code'])['value'].mean().unstack()

    # For every series from position 9 on, count missing values per country and
    # keep the series whose worst country has at most 18 missing entries.
    # NOTE(review): the original note here said "at least one observed value
    # across 36 years", which does not match the <= 18 threshold -- confirm intent.
    left_feature = panel.iloc[:, 9:].isna().groupby('Country Name').sum().max(
        axis=0) <= 18
    pred = panel.iloc[:, 9:].iloc[:, left_feature.values]

    # construct matrix of features across countries:
    # one (year x series) block per country, in sorted country order, side by side
    df = []
    ct_list = list(set(pred.index.get_level_values(0)))
    ct_list = sorted(ct_list)
    for i in ct_list:
        df.append(pred.loc[i])
    predictors = pd.concat(df, axis=1)

    # random forest imputation
    imputer = MissForest()
    predictors_imputed = imputer.fit_transform(predictors)

    # Flatten the panel index and rename columns positionally: 9 targets, 1296 predictors
    panel.reset_index(inplace=True)
    panel.columns = ['Country Name', 'year'] + [
        'y' + str(i) for i in range(1, 10)
    ] + ['x' + str(i) for i in range(1, 1297)]
    # Undo the side-by-side layout: cut the imputed matrix into per-country
    # blocks (214 countries assumed) and stack them vertically
    nfeature = int(predictors.shape[1] / 214)
    split = list(range(nfeature, predictors_imputed.shape[1], nfeature))
    _ = np.split(predictors_imputed, split, 1)
    predictors_new = pd.DataFrame(np.vstack(_))
    # Keys are attached by row position; presumably the stacked blocks follow
    # the same (country, year) order as ``panel`` -- TODO confirm
    predictors_new['year'] = panel.year
    predictors_new['Country Name'] = panel['Country Name']
    predictors_new.columns = [
        'x' + str(i) for i in range(1, pred.shape[1] + 1)
    ] + ['year', 'Country Name']

    # combine the updated feature matrix and responses
    # Keep only columns without any remaining missing value
    feature = predictors_new.isna().sum() <= 0  # change to 1
    panel_left = predictors_new.iloc[:, feature.values]
    # NOTE(review): shift() moves every column of panel_left, including 'year'
    # and 'Country Name', and merge() is called without ``on`` so it joins on
    # the shared column names. The predictors therefore look like they stay
    # paired with their own keys rather than being lagged -- confirm.
    panel_comb = pd.merge(panel.iloc[:, 0:11], panel_left.shift(years_ahead))

    # Split prediction and target
    panel_train = panel_comb.loc[panel_comb.year < 2007]
    panel_train = panel_train.set_index(['Country Name', 'year'])
    panel_test = panel_comb.loc[panel_comb.year == 2007]
    panel_test = panel_test.set_index(['Country Name', 'year'])

    # panel data model
    # All warnings raised while fitting/predicting are suppressed inside this block
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        Ypred = pd.DataFrame()
        for i in range(1, 10):
            # NOTE(review): after set_index the two key columns are in the
            # index, so the frame starts with the 9 targets; columns[11:]
            # appears to skip the first two predictor columns -- confirm
            # whether columns[9:] was intended.
            formula = 'y' + str(i) + '~1+' + '+'.join(
                panel_train.columns[11:].values) + '+EntityEffects'
            mod = PanelOLS.from_formula(formula, panel_train)
            res = mod.fit(cov_type='clustered', cluster_entity=True)
            Ypred['y' + str(i)] = res.predict(data=panel_test).predictions

    # Eval
    # The first nine columns of the test frame are the observed targets;
    # nanmean ignores cells where either side is missing
    Yval = panel_test.iloc[:, :9]
    rmse = np.sqrt(np.nanmean(np.power(Ypred - Yval, 2)))
    print(rmse)

    return Ypred
Beispiel #9
0
# Indicator columns: 1 where t equals 4 (resp. 6), 0 everywhere else
gaps.loc[gaps.index, "indcom4"] = 0
gaps.loc[gaps.t == 4, "indcom4"] = 1

gaps.loc[gaps.index, "indcom6"] = 0
gaps.loc[gaps.t == 6, "indcom6"] = 1

# Exclude the states listed below from the estimation sample
gaps = gaps.loc[~gaps.State.isin([
    "Alaska", "Delaware", "Montana", "North Dakota", "South Dakota", "Vermont",
    "Wyoming"
])]
gaps.set_index(["State", "Year"], inplace=True)

# The model uses the magnitude of the gap, not its sign
gaps["gap"] = gaps["gap"].abs()

# The other indcom* regressors are expected to exist in ``gaps`` already
model = PanelOLS.from_formula(
    'gap ~ 1 + indcom_4 + indcom_2 + indcom + indcom2 + indcom4 + indcom6',
    data=gaps)

print(model.fit(cov_type="robust"))

###########
###STATE###
###########

# ``skipfooter`` is the supported keyword; the old ``skip_footer`` spelling
# was removed from pandas and raises TypeError on current versions
starts = pd.read_excel(
    "/home/matt/GitRepos/ElectionData/data/Independent_Commission_Start.xlsx",
    "Sheet1",
    skipfooter=2)
starts["time"] = 1

gaps = get_efficiency_gap("federal")[['State', 'Year', 'gap']]
Beispiel #10
0
# Interaction terms: each mobility measure multiplied by the shifted deaths-per-million series
dfDeathdata["intDeathsandretail_and_recreation_percent_change_from_baseline3dma"] = dfDeathdata[
    "retail_and_recreation_percent_change_from_baseline3dma"
].mul(dfDeathdata["deathsper1m_shifted"])
dfDeathdata["intDeathsandtransit_stations_percent_change_from_baseline3dma"] = dfDeathdata[
    "transit_stations_percent_change_from_baseline3dma"
].mul(dfDeathdata["deathsper1m_shifted"])
dfDeathdata["intDeathsandworkplaces_percent_change_from_baseline3dma"] = dfDeathdata[
    "workplaces_percent_change_from_baseline3dma"
].mul(dfDeathdata["deathsper1m_shifted"])

### Fixed Effects regression ###
# Estimated on rows whose dependent variable lies between 1 and 100 (inclusive).
# NOTE(review): the formula carries no EntityEffects term, so no entity effects
# are requested through the formula itself -- confirm this is intended.
mod = PanelOLS.from_formula(
    "chgdeathsper1m_shifted3dma ~ CriticalCareBeds + GDPpercapita + personsperhousehold + PopulationDensity + retail_and_recreation_percent_change_from_baseline3dma + transit_stations_percent_change_from_baseline3dma + workplaces_percent_change_from_baseline3dma",
    data=dfDeathdata.loc[dfDeathdata["chgdeathsper1m_shifted3dma"].between(1, 100)],
)
res = mod.fit(cov_type="clustered", cluster_entity=True)
res

### OLS Regression ###
# Uses a narrower sample (1 to 60) and leaves out the workplaces regressor
model = ols(
    "chgdeathsper1m_shifted3dma ~ CriticalCareBeds + GDPpercapita + personsperhousehold + PopulationDensity + retail_and_recreation_percent_change_from_baseline3dma +transit_stations_percent_change_from_baseline3dma",
    data=dfDeathdata.loc[dfDeathdata["chgdeathsper1m_shifted3dma"].between(1, 60)],
    missing="drop",
)
results = model.fit()
results.summary()

# For Dash App: