def vif_cal(input_data, dependent_col, endo, instrument, reg):
    """Compute a variance inflation factor (VIF) for every regressor.

    Each regressor (every column except ``dependent_col`` and
    ``instrument``) is regressed on the remaining regressors and its VIF is
    ``1 / (1 - R^2)``, rounded to two decimals.

    Args:
        input_data: DataFrame holding the dependent variable, the
            regressors and the instrument.
        dependent_col: Column label of the dependent variable.
        endo: Column label of the endogenous regressor; it is only used to
            build the instrument matrix when ``reg == '2SLS'``.
        instrument: Column label of the instrument.
        reg: ``'2SLS'`` fits the auxiliary regressions with ``IV2SLS``;
            any other value fits them with ``smf.ols``.

    Returns:
        A DataFrame with a ``'Var'`` column and a ``'VIF/OLS'`` or
        ``'VIF/2SLS'`` column when ``reg`` is ``'OLS'`` or ``'2SLS'``.
        For any other ``reg`` the function returns ``None``.
    """
    use_iv = reg == '2SLS'

    x_vars = input_data.drop([dependent_col, instrument], axis=1)
    # The instrument matrix is only consumed by the 2SLS auxiliary fits.
    ins_vars = (input_data.drop([dependent_col, endo], axis=1)
                if use_iv else None)
    xvar_names = x_vars.columns

    vif = []
    for col in xvar_names:
        # The names ``y`` and ``x`` must stay: the formula "y~x" below is
        # resolved against this function's local variables.
        y = x_vars[col]
        x = x_vars[xvar_names.drop(col)]
        if use_iv:
            fitted = IV2SLS(y, x, ins_vars).fit()
        else:
            fitted = smf.ols(formula="y~x", data=x_vars).fit()
        vif.append(round(1 / (1 - fitted.rsquared), 2))

    if reg == 'OLS':
        vif_label = 'VIF/OLS'
    elif use_iv:
        vif_label = 'VIF/2SLS'
    else:
        return None

    return pd.DataFrame(
        [{'Var': col, vif_label: value}
         for col, value in zip(xvar_names, vif)])
Esempio n. 2
0
 def _estimate_effect(self):
     """Estimate the causal effect with an instrumental-variable estimator.

     With exactly one instrument and one treatment the estimate is a ratio:
     a Wald estimator (difference in mean outcome over difference in mean
     treatment between instrument == 1 and instrument == 0) when the
     instrument has at most two distinct values, otherwise
     Cov(outcome, instrument) / Cov(treatment, instrument).
     In every other case a 2SLS model is fit and the sum of its
     coefficients is used.

     Returns:
         A ``CausalEstimate`` wrapping the computed value.
     """
     has_single_instrument = len(self.estimating_instrument_names) == 1
     if has_single_instrument and len(self._treatment_name) == 1:
         instrument = self._estimating_instruments.iloc[:, 0]
         self.logger.debug("Instrument Variable values: {0}".format(instrument))
         distinct_values = np.unique(instrument)
         treatment_name = self._treatment_name[0]
         if len(distinct_values) <= 2:
             # Wald estimator: ratio of the outcome gap to the treatment gap
             # between the two instrument groups.
             outcome_on = np.mean(self._outcome[instrument == 1])
             outcome_off = np.mean(self._outcome[instrument == 0])
             treated_on = np.mean(self._treatment[treatment_name][instrument == 1])
             treated_off = np.mean(self._treatment[treatment_name][instrument == 0])
             iv_est = (outcome_on - outcome_off) / (treated_on - treated_off)
         else:
             # Ratio of covariances: Cov(y,z) / Cov(x,z)
             cov_outcome = np.cov(self._outcome, instrument)[0, 1]
             cov_treatment = np.cov(self._treatment[treatment_name], instrument)[0, 1]
             iv_est = cov_outcome / cov_treatment
     else:
         # Several instruments (or treatments): fall back to 2SLS.
         fitted = IV2SLS(self._outcome.astype(np.float32),
                         self._treatment.astype(np.float32),
                         self._estimating_instruments).fit()
         print(fitted.summary())
         # the effect is the same for any treatment value (assume treatment goes from 0 to 1)
         iv_est = sum(fitted.params)

     return CausalEstimate(estimate=iv_est,
                           control_value=self._control_value,
                           treatment_value=self._treatment_value,
                           target_estimand=self._target_estimand,
                           realized_estimand_expr=self.symbolic_estimator)
Esempio n. 3
0
def IVRegression2(data):
    """
    Estimate the model parameters using an instrumental variable.

    Regresses ``Y`` on ``X1``, ``X2`` and a constant by 2SLS, using
    ``IV1``, ``X2`` and the constant as instruments, then prints the fit
    summary and the result of ``spec_hausman()``.
    """
    frame = sm.add_constant(data[["Y", "X1", "X2", "IV1"]])
    outcome = frame[["Y"]]
    regressors = frame[["X1", "X2", "const"]]
    instruments = frame[["IV1", "X2", "const"]]
    fitted = IV2SLS(outcome, regressors, instruments).fit()
    print("使用工具变量")
    print(fitted.summary())
    print("Durbin–Wu–Hausman检验")
    print(fitted.spec_hausman())
Esempio n. 4
0
def TSLS_FIX(df,
             y_var,
             first_y,
             X_vars,
             IV,
             fix1,
             fix2=None,
             add_intercept=True):
    """
    Fit a two-stage least squares model on group-transformed data.

    Rows with any missing value are dropped, the data are grouped by the
    fixed-effect column(s) and each group is passed through ``demean``
    (a helper defined outside this function; presumably it removes group
    means -- confirm). At least one fixed-effect variable is required and
    at most two are supported.

    Inputs.
    ---------
    df:pd.DataFrame, the data for the regression.
    y_var:str, the column name of the dependent variable
    first_y:str, the column name of the first-stage y (the instrumented
        regressor)
    X_vars:list of str, the list of explanatory variable names
    IV:list str, the list of instrument variable names
    fix1:str, the column name of the first fix effect variable
    fix2:str, the column name of the second fix effect variable
    add_intercept:bool, whether a constant column named 'intercept' is
        added to both the regressors and the instruments

    Outputs.
    ---------
    res:obj, the fitted IV2SLS results

    """
    group_keys = fix1 if fix2 is None else [fix1, fix2]
    new_df = df.copy().dropna().groupby(group_keys).apply(demean)

    y = new_df[y_var]

    exog_cols = [first_y] + X_vars
    # Instruments: the IVs plus every regressor that is not instrumented.
    instrument_cols = X_vars + IV
    if add_intercept:
        new_df['intercept'] = 1.0
        exog_cols = ['intercept'] + exog_cols
        instrument_cols = ['intercept'] + instrument_cols

    TSLS_mod = IV2SLS(endog=y,
                      exog=new_df[exog_cols],
                      instrument=new_df[instrument_cols])
    return TSLS_mod.fit()
Esempio n. 5
0
def two_stage_least_squares(z: np.ndarray, x: np.ndarray,
                            y: np.ndarray) -> np.ndarray:
    """Fit a 2SLS model and return its coefficients.

    A constant column is added to both the treatment and the instrument
    before fitting; the fit summary is written to the log at INFO level.

    Args:
      z: Instrument
      x: Treatment
      y: Outcome

    Returns:
      coeff: The coefficients of the estimated linear cause-effect relation
        (one per column of the constant-augmented treatment matrix).
    """
    exog = add_constant(onp.array(x))
    instruments = add_constant(onp.array(z))
    endog = onp.array(y)
    fitted = IV2SLS(endog, exog, instruments).fit()
    logging.info(fitted.summary())
    return np.array(fitted.params)
Esempio n. 6
0
    def fit(self, X, treatment, y, w):
        """Fit the 2SLS model and store it on the instance.

        The regressors are the features plus the treatment; the instruments
        are the features plus ``w``. A constant is added to both. The model
        is stored in ``self.iv_model`` and its fit in ``self.iv_fit``.

        Args:
            X (np.matrix or np.array or pd.Dataframe): a feature matrix
            treatment (np.array or pd.Series): a treatment vector
            y (np.array or pd.Series): an outcome vector
            w (np.array or pd.Series): an instrument vector
        """
        X, treatment, y, w = convert_pd_to_np(X, treatment, y, w)

        regressors = sm.add_constant(np.c_[X, treatment])
        instruments = sm.add_constant(np.c_[X, w])

        model = IV2SLS(endog=y, exog=regressors, instrument=instruments)
        self.iv_model = model
        self.iv_fit = model.fit()
Esempio n. 7
0
    def weak_instruments(self, n_sims=20):
        """Simulate a weak-instrument setting and log OLS and 2SLS estimates.

        For each of ``n_sims`` simulated data sets the method fits OLS,
        asks the feed-forward model for a new instrument, swaps it into the
        last column of the instrument matrix, fits 2SLS with it and records
        the coefficient of the treatment. Averages are logged at the end.

        Args:
            n_sims: Number of simulated data sets.
        """
        np.random.seed(1692)

        model = feedforward.FeedForwardModel(19,
                                             1,
                                             dense_size=60,
                                             n_dense_layers=2)

        treatment_effects = []
        ols_betas, ols_ses = [], []
        old_corrs, new_corrs = [], []
        for _ in range(n_sims):
            df = self.treatment_gen.simulate_data(False)

            # Treatment / instrument go in the LAST column of X / Z.
            X = np.hstack((self.x, df['new_treat'].values[:, None]))
            Z = np.hstack((self.x, df['instrument'].values[:, None]))

            ols_beta, ols_se = self.fit_ols(df['treatment_effect'], X)
            ols_betas.append(ols_beta)
            ols_ses.append(ols_se)

            old_corr = df[['instrument', 'new_treat']].corr().values[0, 1]
            new_instrument, new_corr = model.fit_instruments(
                X, Z, df['treatment_effect'].values, batchsize=128)
            new_corrs.append(new_corr)
            old_corrs.append(old_corr)

            # Replace the original instrument with the learned one.
            Z2 = Z.copy()
            Z2[:, -1] = new_instrument[:, 0]

            iv = IV2SLS(df['treatment_effect'].values.flatten(),
                        add_constant(X), add_constant(Z2))
            # Previously the model was built but never fit, so
            # ``treatment_effects`` stayed empty and the final log line
            # reported NaN.
            # NOTE(review): params[-1] assumes add_constant prepends the
            # constant so the treatment remains the last column -- confirm.
            iv_results = iv.fit()
            treatment_effects.append(iv_results.params[-1])

            model.reset_params()

        # Use the last simulation's correlation as the original did, without
        # raising NameError when no simulation was run.
        if new_corrs and new_corrs[-1]:
            logger.info("Old corr: %.2f, New corr: %.2f", np.mean(old_corrs),
                        np.mean(new_corrs))
        logger.info("Treatment effect (OLS): %.3f (%.4f)", np.mean(ols_betas),
                    np.mean(ols_ses))
        logger.info("Treatment effect: %.3f (%.4f)",
                    np.mean(treatment_effects), np.std(treatment_effects))
Esempio n. 8
0
def TSLS(df, y_var, firsts_y, X_vars, IV, add_intercept=True):
    """
    Fit a two-stage least squares model on the complete rows of ``df``.

    Rows with any missing value are dropped. The regressors are
    ``firsts_y`` plus ``X_vars``; the instruments are ``X_vars`` plus
    ``IV``. No fixed effects are handled here.

    Inputs.
    ---------
    df:pd.DataFrame, the data for the regression.
    y_var:str, the column name of the dependent variable
    firsts_y:str, the column name of the first-stage y (the instrumented
        regressor)
    X_vars:list of str, the list of explanatory variable names
    IV:list str, the list of instrument variable names
    add_intercept:bool, whether a constant column named 'intercept' is
        added to both the regressors and the instruments

    Outputs.
    ---------
    res:obj, the fitted IV2SLS results

    """
    new_df = df.copy().dropna()
    y = new_df[y_var]

    exog_cols = [firsts_y] + X_vars
    # Instruments: the IVs plus every regressor that is not instrumented.
    instrument_cols = X_vars + IV
    if add_intercept:
        new_df['intercept'] = 1.0
        exog_cols = ['intercept'] + exog_cols
        instrument_cols = ['intercept'] + instrument_cols

    TSLS_mod = IV2SLS(endog=y,
                      exog=new_df[exog_cols],
                      instrument=new_df[instrument_cols])
    return TSLS_mod.fit()
Esempio n. 9
0
# Show the current figure (the plotting calls that build it precede this
# excerpt).
plt.show()

# Or it can be plotted another way: a scatter of education against distance
plt.scatter(CollegeDistance['distance'], CollegeDistance['education'])

# Run the regression of education on distance alone
reg = smf.ols('education ~ distance', data=CollegeDistance).fit()
print(reg.summary())

# Still true with controls? Re-fit with gender, ethnicity, unemp and urban
# added (this rebinds `reg`).
reg = smf.ols('education ~ distance + gender + ethnicity + unemp + urban',
              data=CollegeDistance).fit()
print(reg.summary())

# And robust standard errors (HC1) for the regression with controls
reg_robust = reg.get_robustcov_results(cov_type='HC1')
print(reg_robust.summary())

# Ok, so let's use it as an instrument
# To run an Instrumental Variables Regression, use the command IV2SLS:

# The constant is added by hand as an explicit column of ones.
CollegeDistance['const'] = 1

# wage is the dependent variable, education the endogenous regressor and
# distance its instrument; the remaining columns are exogenous controls.
iv = IV2SLS(
    dependent=CollegeDistance['wage'],
    exog=CollegeDistance[['const', 'gender', 'ethnicity', 'unemp', 'urban']],
    endog=CollegeDistance['education'],
    instruments=CollegeDistance['distance']).fit()

# NOTE(review): `summary` is accessed without being called. That is correct
# if this IV2SLS is the linearmodels one (the keyword names dependent /
# exog / endog / instruments suggest so), where it is a property -- confirm.
print(iv.summary)
def table13_ext9(df, name, trust):
    """Tabulate OLS and 2SLS estimates for six governance outcomes.

    For each outcome, three OLS fits (HC1 covariance) and three 2SLS fits
    are run with growing sets of regressors: segregation and
    fractionalization only ("None"), plus trust ("Trust"), plus
    ``ethnic_party_dum`` and ``dummy_sepx_nm`` ("All"). In the 2SLS fits
    ``{name}_instrument_C2_thresh`` replaces ``{name}_C2`` in the
    instrument matrix.

    Args:
        df: DataFrame with the ``{name}_*`` columns, the six outcomes,
            ``trust`` and the two dummy columns.
        name: Prefix of the segregation / fractionalization columns.
        trust: When equal to ``'trust'`` the table reports the trust
            coefficient from the four specifications that include it;
            otherwise it reports the segregation coefficient from all six.

    Returns:
        A DataFrame indexed by (dependent variable, statistic) holding the
        coefficient, its standard error and its p-value per specification.
    """
    dependent = [
        'voice', 'PolStab', 'GovEffec', 'RegQual', 'RulLaw', 'ConCorr'
    ]
    seg = f'{name}_C2'
    seg_iv = f'{name}_instrument_C2_thresh'
    frac = f'{name}_I'
    extra = ['ethnic_party_dum', 'dummy_sepx_nm']

    # The estimation sample and the design matrices do not depend on the
    # outcome, so they are built once instead of once per outcome.
    df_13 = df[[seg, seg_iv, frac] + dependent + ['trust'] +
               extra].dropna(axis=0)

    x1 = sm.add_constant(df_13[[seg, frac]])
    x2 = sm.add_constant(df_13[[seg, frac, 'trust']])
    x3 = sm.add_constant(df_13[[seg, frac, 'trust'] + extra])

    ins1 = sm.add_constant(df_13[[seg_iv, frac]])
    ins2 = sm.add_constant(df_13[[seg_iv, frac, 'trust']])
    ins3 = sm.add_constant(df_13[[seg_iv, frac, 'trust'] + extra])

    labels = [
        'OLS / None', 'OLS / Trust', 'OLS / All', '2SLS / None',
        '2SLS / Trust', '2SLS / All'
    ]
    if trust == 'trust':
        # Position 3 is the trust coefficient (after const, segregation,
        # fractionalization); the "None" specifications do not contain it.
        position, row_label, shown = 3, 'Trust', [1, 2, 4, 5]
    else:
        # Position 1 is the segregation coefficient, right after the
        # constant.
        position, row_label, shown = 1, 'Segregation', [0, 1, 2, 3, 4, 5]

    def _stats(result):
        """Return [coefficient, standard error, p-value] at ``position``."""
        # All three are taken positionally through ``.values``; integer
        # indexing a label-indexed Series directly is deprecated in pandas.
        return [
            result.params.values[position], result.bse.values[position],
            result.pvalues.values[position]
        ]

    tables = []
    for dep in dependent:
        y1 = df_13[dep]

        est = [
            sm.OLS(y1, x1).fit(cov_type='HC1'),
            sm.OLS(y1, x2).fit(cov_type='HC1'),
            sm.OLS(y1, x3).fit(cov_type='HC1'),
            IV2SLS(y1, x1, ins1).fit(),
            IV2SLS(y1, x2, ins2).fit(),
            IV2SLS(y1, x3, ins3).fit(),
        ]

        block = pd.DataFrame(
            {labels[k]: _stats(est[k]) for k in shown},
            index=[row_label, 'Standard Error', 'p-value'])
        block.index = pd.MultiIndex.from_product([[dep], block.index])
        tables.append(block)

    table = pd.concat(tables)

    table = table.rename(
        index={
            'voice': 'Voice',
            'PolStab': 'Political stability',
            'GovEffec': 'Govern-t effectiv.',
            'RegQual': 'Regul. quality',
            'RulLaw': 'Rule of law',
            'ConCorr': 'Control of corr'
        })
    table.index.names = ['Dependent Var', '']

    return table
def df_table12(df, name):
    """Build the Stargazer table of trust regressed on segregation.

    Six models are fit with ``trust`` as the dependent variable: OLS (HC1
    covariance) and 2SLS, each without controls on the full sample, with
    controls on the full sample, and with controls on the sub-sample where
    ``democ > 1``. In the 2SLS fits ``{name}_instrument_C2_thresh``
    replaces ``{name}_C2`` in the instrument matrix.

    Args:
        df: DataFrame with the ``{name}_*`` columns, ``trust`` and the
            control columns.
        name: Prefix of the segregation / fractionalization columns; the
            value ``'ethnicity'`` selects the "Panel A" title, anything
            else the "Panel B" title.

    Returns:
        The configured ``Stargazer`` object.
    """
    seg = f'{name}_C2'
    seg_iv = f'{name}_instrument_C2_thresh'
    frac = f'{name}_I'
    controls = [
        'lnpopulation', 'lnArea', 'lnGDP_pc', 'protestants', 'muslims',
        'catholics', 'latitude', 'LOEnglish', 'LOGerman', 'LOSocialist',
        'LOScandin', 'democ', 'mtnall'
    ]

    # Named `sample` rather than `df_table12` so the local does not shadow
    # the function itself.
    sample = df[[seg, seg_iv, frac, 'trust'] + controls].dropna(axis=0)
    # NOTE(review): the threshold here is `> 1` while the sibling table
    # functions in this file filter on `democ >= 1` -- confirm which is
    # intended.
    sample_demo = sample[sample.democ > 1]

    dep1 = sample['trust']
    dep2 = sample_demo['trust']

    exo1 = sm.add_constant(sample[seg])
    exo2 = sm.add_constant(sample[[seg, frac] + controls])
    exo3 = sm.add_constant(sample_demo[[seg, frac] + controls])

    ins1 = sm.add_constant(sample[seg_iv])
    ins2 = sm.add_constant(sample[[seg_iv, frac] + controls])
    ins3 = sm.add_constant(sample_demo[[seg_iv, frac] + controls])

    reg1 = sm.OLS(dep1, exo1).fit(cov_type='HC1')
    reg2 = sm.OLS(dep1, exo2).fit(cov_type='HC1')
    reg3 = sm.OLS(dep2, exo3).fit(cov_type='HC1')
    reg4 = IV2SLS(dep1, exo1, ins1).fit()
    reg5 = IV2SLS(dep1, exo2, ins2).fit()
    reg6 = IV2SLS(dep2, exo3, ins3).fit()

    stargazer = Stargazer([reg1, reg2, reg3, reg4, reg5, reg6])
    stargazer.covariate_order([seg, frac])
    # Raw strings keep the LaTeX backslash without relying on an invalid
    # escape sequence; the resulting text is unchanged.
    stargazer.rename_covariates({
        seg: r'Segregation $\hat{S}$ (' f'{name})',
        frac: f'Fractionalization $F$ ({name})'
    })

    stargazer.custom_columns(['OLS', 'OLS', 'OLS', '2SLS', '2SLS', '2SLS'],
                             [1, 1, 1, 1, 1, 1])
    stargazer.add_line('Controls', ['No', 'Yes', 'Yes', 'No', 'Yes', 'Yes'])
    stargazer.add_line('Sample',
                       ['Full', 'Full', 'Democ', 'Full', 'Full', 'Democ'])

    if name == 'ethnicity':
        stargazer.title('Panel A. Ethnicity')
    else:
        stargazer.title('Panel B. Language')
    return stargazer
def table10_11(df, name, democ):
    """Build the Stargazer table of five quality-of-government outcomes.

    Each outcome (``icrg_qog``, ``ef_corruption``, ``ef_property_rights``,
    ``ef_regul``, ``taxevas``) is fit once by OLS (HC1 covariance) and once
    by 2SLS, giving ten models in outcome order. In the 2SLS fits
    ``{name}_instrument_C2_thresh`` replaces ``{name}_C2`` in the
    instrument matrix.

    Args:
        df: DataFrame with the ``{name}_*`` columns, the controls and the
            outcome columns.
        name: Prefix of the segregation / fractionalization columns.
        democ: ``'democracy'`` restricts every sample to ``democ >= 1`` and
            includes all controls; any other value uses the full sample
            with segregation and fractionalization only.

    Returns:
        The configured ``Stargazer`` object.
    """
    seg = f'{name}_C2'
    frac = f'{name}_I'
    seg_iv = f'{name}_instrument_C2_thresh'
    controls = [
        'lnpopulation', 'lnGDP_pc', 'protestants', 'muslims', 'catholics',
        'latitude', 'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin',
        'democ', 'mtnall'
    ]
    democracies_only = democ == 'democracy'

    def _sample(outcomes):
        """Complete rows for the given outcomes, optionally democracies."""
        # Missing controls drop a row in both panels, as before.
        sample = df[[seg, frac, seg_iv] + controls + outcomes].dropna(axis=0)
        if democracies_only:
            sample = sample[sample.democ >= 1]
        return sample

    sample_qog = _sample(['icrg_qog'])
    sample_ef = _sample(['ef_regul', 'ef_corruption', 'ef_property_rights'])
    sample_tax = _sample(['taxevas'])

    included_controls = controls if democracies_only else []
    exog_cols = [frac, seg] + included_controls
    instrument_cols = [frac, seg_iv] + included_controls

    specifications = [
        (sample_qog, 'icrg_qog'),
        (sample_ef, 'ef_corruption'),
        (sample_ef, 'ef_property_rights'),
        (sample_ef, 'ef_regul'),
        (sample_tax, 'taxevas'),
    ]

    # OLS then 2SLS for each outcome, matching the column layout below.
    models = []
    for sample, outcome in specifications:
        y = sample[outcome]
        x = sm.add_constant(sample[exog_cols])
        z = sm.add_constant(sample[instrument_cols])
        models.append(sm.OLS(y, x).fit(cov_type='HC1'))
        models.append(IV2SLS(y, x, z).fit())

    stargazer = Stargazer(models)
    stargazer.custom_columns([
        'ICRG quality of gov', 'EF Corruption', 'EF Property rights',
        'EF Regulation', 'Tax eva'
    ], [2, 2, 2, 2, 2])
    stargazer.show_model_numbers(False)
    stargazer.covariate_order([seg, frac])
    # Raw strings keep the LaTeX backslash without relying on an invalid
    # escape sequence; the resulting text is unchanged.
    stargazer.rename_covariates({
        seg: r'Segregation $\hat{S}$ (' f'{name})',
        frac: f'Fractionalization $F$ ({name})'
    })
    stargazer.add_line('Method', [
        'OLS', '2SLS', 'OLS', '2SLS', 'OLS', '2SLS', 'OLS', '2SLS', 'OLS',
        '2SLS'
    ])

    if democracies_only:
        stargazer.title('Panel B. Democracies sample, all controls')
    else:
        stargazer.title('Panel A. Full sample, no additional controls')
    return stargazer
# -*- coding: utf-8 -*-
#%% Import NumPy
import numpy as np
#   Import SciPy's stats module
import scipy.stats as st
#   Import statsmodels
import statsmodels.api as sm
#   Import IV2SLS (two-stage least squares) and IVGMM (GMM estimation)
from statsmodels.sandbox.regression.gmm import IV2SLS, IVGMM
#%% Load the Mroz dataset from Rdatasets
mroz = sm.datasets.get_rdataset('Mroz', 'Ecdat')
# Keep only the rows with a positive 'hearnw', since its log is taken below
mroz.data = mroz.data[mroz.data['hearnw'] > 0]
# Correlation between 'educw' and each of the two candidate instruments
print(st.pearsonr(mroz.data['educw'], mroz.data['educwf']))
print(st.pearsonr(mroz.data['educw'], mroz.data['educwm']))
#%% Simple regression explaining earnings by years of education
y = np.log(mroz.data['hearnw'])
x = mroz.data[['educw']]
X = sm.add_constant(x)
results_ols = sm.OLS(y, X).fit(use_t=False)
print(results_ols.summary())
#%% 2SLS using the parents' years of education as instruments
z = mroz.data[['educwf', 'educwm']]
Z = sm.add_constant(z)
results_iv = IV2SLS(y, X, instrument=Z).fit()
print(results_iv.summary())
#%% Obtain the IV estimator by GMM instead of 2SLS
results_gmm = IVGMM(y, X, instrument=Z).fit()
print(results_gmm.summary())
def table3_7(df, regression_type):
    """Render the rule-of-law regressions as an HTML Stargazer table.

    ``RulLaw`` is regressed on segregation and fractionalization for
    ethnicity, language and religion, each once without and once with
    controls, giving six models shown in the order: ethnicity (no controls,
    controls), language (no controls, controls), religion (no controls,
    controls).

    Args:
        df: DataFrame with the ``ethnicity_*``, ``language_*`` and
            ``religion_*`` columns, the controls and ``RulLaw``.
        regression_type: ``'OLS'`` fits OLS with HC1 covariance;
            ``'IV2SLS'`` fits 2SLS with ``*_instrument_C2_thresh``
            replacing ``*_C2`` in the instrument matrix.

    Returns:
        An ``HTML`` object wrapping the rendered table.

    Raises:
        ValueError: If ``regression_type`` is neither ``'OLS'`` nor
            ``'IV2SLS'`` (this used to surface as a NameError after all
            the design matrices had been built).
    """
    if regression_type not in ('IV2SLS', 'OLS'):
        raise ValueError(
            "regression_type must be 'IV2SLS' or 'OLS', got "
            f"{regression_type!r}")

    def _sample(prefix):
        """Complete rows of the columns used for one dimension."""
        return df[[
            f'{prefix}_C2', f'{prefix}_instrument_C2_thresh', f'{prefix}_I',
            'lnpopulation', 'lnGDP_pc', 'protestants', 'muslims', 'catholics',
            'latitude', 'LOEnglish', 'LOGerman', 'LOSocialist', 'lnArea',
            'LOScandin', 'democ', 'mtnall', 'RulLaw'
        ]].dropna(axis=0)

    df_3_7E = _sample('ethnicity')
    df_3_7L = _sample('language')
    df_3_7R = _sample('religion')

    exo = sm.add_constant(df_3_7E[[
        'ethnicity_C2', 'ethnicity_I', 'lnpopulation', 'lnGDP_pc',
        'protestants', 'muslims', 'catholics', 'latitude', 'LOEnglish',
        'LOGerman', 'LOSocialist', 'LOScandin', 'lnArea', 'democ', 'mtnall'
    ]])
    exo2 = sm.add_constant(df_3_7E[['ethnicity_C2', 'ethnicity_I']])
    exo3 = sm.add_constant(df_3_7L[[
        'language_C2', 'language_I', 'lnpopulation', 'lnGDP_pc', 'protestants',
        'lnArea', 'muslims', 'catholics', 'latitude', 'LOEnglish', 'LOGerman',
        'LOSocialist', 'LOScandin', 'democ', 'mtnall'
    ]])
    exo4 = sm.add_constant(df_3_7L[['language_C2', 'language_I']])
    # NOTE(review): unlike the ethnicity and language specifications, the
    # religion one (regressors and instruments) omits 'LOScandin' --
    # confirm this is intentional.
    exo5 = sm.add_constant(df_3_7R[[
        'religion_C2', 'religion_I', 'lnpopulation', 'lnGDP_pc', 'protestants',
        'muslims', 'catholics', 'latitude', 'LOEnglish', 'LOGerman',
        'LOSocialist', 'lnArea', 'democ', 'mtnall'
    ]])
    exo6 = sm.add_constant(df_3_7R[['religion_C2', 'religion_I']])

    if regression_type == 'IV2SLS':
        reg = IV2SLS(
            df_3_7E['RulLaw'], exo,
            sm.add_constant(df_3_7E[[
                'ethnicity_instrument_C2_thresh', 'ethnicity_I',
                'lnpopulation', 'lnGDP_pc', 'protestants', 'muslims',
                'catholics', 'latitude', 'LOEnglish', 'LOGerman',
                'LOSocialist', 'LOScandin', 'democ', 'mtnall', 'lnArea'
            ]])).fit()
        reg2 = IV2SLS(
            df_3_7E['RulLaw'], exo2,
            sm.add_constant(
                df_3_7E[['ethnicity_instrument_C2_thresh',
                         'ethnicity_I']])).fit()
        reg3 = IV2SLS(
            df_3_7L['RulLaw'], exo3,
            sm.add_constant(df_3_7L[[
                'language_instrument_C2_thresh', 'language_I', 'lnpopulation',
                'lnGDP_pc', 'protestants', 'muslims', 'catholics', 'latitude',
                'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin', 'democ',
                'mtnall', 'lnArea'
            ]])).fit()
        reg4 = IV2SLS(
            df_3_7L['RulLaw'], exo4,
            sm.add_constant(
                df_3_7L[['language_instrument_C2_thresh',
                         'language_I']])).fit()
        reg5 = IV2SLS(
            df_3_7R['RulLaw'], exo5,
            sm.add_constant(df_3_7R[[
                'religion_instrument_C2_thresh', 'religion_I', 'lnpopulation',
                'lnGDP_pc', 'protestants', 'muslims', 'catholics', 'latitude',
                'LOEnglish', 'LOGerman', 'LOSocialist', 'democ', 'mtnall',
                'lnArea'
            ]])).fit()
        reg6 = IV2SLS(
            df_3_7R['RulLaw'], exo6,
            sm.add_constant(
                df_3_7R[['religion_instrument_C2_thresh',
                         'religion_I']])).fit()
    else:
        # regression_type == 'OLS' (validated above)
        reg2 = sm.OLS(df_3_7E['RulLaw'], exo2).fit(cov_type='HC1')
        reg = sm.OLS(df_3_7E['RulLaw'], exo).fit(cov_type='HC1')
        reg4 = sm.OLS(df_3_7L['RulLaw'], exo4).fit(cov_type='HC1')
        reg3 = sm.OLS(df_3_7L['RulLaw'], exo3).fit(cov_type='HC1')
        reg6 = sm.OLS(df_3_7R['RulLaw'], exo6).fit(cov_type='HC1')
        reg5 = sm.OLS(df_3_7R['RulLaw'], exo5).fit(cov_type='HC1')

    # For each dimension: the specification without controls first, then
    # the one with controls.
    stargazer = Stargazer([reg2, reg, reg4, reg3, reg6, reg5])
    stargazer.covariate_order([
        'ethnicity_C2', 'ethnicity_I', 'language_C2', 'language_I',
        'religion_C2', 'religion_I', 'lnpopulation', 'lnGDP_pc', 'lnArea',
        'protestants', 'muslims', 'catholics', 'latitude', 'LOEnglish',
        'LOGerman', 'LOSocialist', 'LOScandin', 'democ', 'mtnall', 'const'
    ])
    # Raw strings keep the LaTeX backslash without relying on an invalid
    # escape sequence; the resulting text is unchanged.
    # NOTE(review): 'Pretestants share' and 'Muslmis Share' look like typos
    # in the displayed labels; left as-is in case output is compared
    # elsewhere.
    stargazer.rename_covariates({
        'ethnicity_C2': r'Segregation $\hat{S}$ (ethnicity)',
        'ethnicity_I': 'Fractionalization $F$ (ethnicity)',
        'language_C2': r'Segregation $\hat{S}$ (language)',
        'language_I': 'Fractionalization $F$ (language)',
        'religion_C2': r'Segregation $\hat{S}$ (religion)',
        'religion_I': 'Fractionalization $F$ (religion)',
        'lnpopulation': 'ln (population)',
        'lnGDP_pc': 'ln (GDP per capita)',
        'lnArea': 'ln (average size of region)',
        'protestants': 'Pretestants share',
        'muslims': 'Muslmis Share',
        'catholics': 'Catholics share',
        'latitude': 'Latitude',
        'LOEnglish': 'English legal origin',
        'LOGerman': 'German legal origin',
        'LOSocialist': 'Socialist legal origin',
        'LOScandin': 'Scandinavian legal origin',
        'democ': 'Democratic tradition',
        'mtnall': 'Mountains',
        'const': 'Constant'
    })
    return HTML(stargazer.render_html())
Esempio n. 15
0
### STORE DATA FOR 1995 ONLY TO AVOID WORKING WITH PANEL DATA
df_1995 = df[df['year'] == 1995]

### CALCULATE LINEAR REGRESSION - 1 STAGE OLS
# Log-log regression of packs per capita on the real price.
lm = smf.ols(
    'np.log(packpc) ~ np.log(real_price)', data=df_1995
)  # ==> Coefficients are interpreted as the demand elasticity for cigarettes.
# ==> A 1% increase in price should cause a 1.21% decrease in demand.
fit_lm = lm.fit()
print(fit_lm.summary())

### CALCULATE LINEAR REGRESSION WITH INSTRUMENTAL VARIABLES, AS ENDOGENOUS VARIABLES ARE INVOLVED IN THIS CASE - 2 STAGE OLS
### 1 STAGE: regress the (log) price on the instrument, sales_tax
lm2_1 = smf.ols('np.log(real_price) ~ sales_tax', data=df_1995)
fit_lm2_1 = lm2_1.fit()
print(
    fit_lm2_1.summary()
)  # ==> As expected from the calculated correlations, R2 is relatively high.
# ==> In univariate regression models, R2 tends to be similar to the correlation.
### 2 STAGE: regress (log) demand on the first-stage fitted values
# `orig` (the observed log price) is computed but not used in this excerpt.
orig = np.log(df_1995['real_price']).values
fitted = fit_lm2_1.fittedvalues.values
# 'fitted' in the formula is not a column of df_1995; it is resolved from
# the variable defined on the line above.
lm2_2 = smf.ols('np.log(packpc) ~ fitted', data=df_1995)
fit_lm2_2 = lm2_2.fit()
print(
    fit_lm2_2.summary()
)  # ==> Obtain the corrected regression with the revised elasticity for cigarettes. However, the standard errors come from the manual second stage.

# Same estimation through IV2SLS.
# NOTE(review): the regressor passed here is `fitted` (the first-stage
# prediction) rather than the observed log price held in `orig`, and no
# constant column is supplied -- confirm this is the intended call.
fit_lm2_2_iv = IV2SLS(np.log(df_1995['packpc']),
                      fitted,
                      instrument=df_1995['sales_tax']).fit()
print(fit_lm2_2_iv.summary())
Esempio n. 16
0
# Run the hypothesis test that the coefficient on electric is 0:
# (`relevancy_results` is fit earlier, outside this excerpt)
hypothesis = '(electric = 0)'
print(relevancy_results.f_test(hypothesis))
# Part 4: Instrumenting using two-stage least squares
#
# Keep only the rows where every variable used below is present.
no_null_iv = fertility[(fertility['agefbrth'].notnull()) & (fertility['electric'].notnull()) & 
                    (fertility['monthfm'].notnull()) & (fertility['ceb'].notnull()) & (fertility['educ'].notnull())
                      & (fertility['idlnchld'].notnull())]
# Dependent variable: agefbrth.
endog = no_null_iv['agefbrth']
# Regressors include educ; the instrument matrix swaps educ for electric and
# keeps the other three columns.
exog = no_null_iv[['monthfm', 'ceb', 'idlnchld', 'educ']]
instr = no_null_iv[['monthfm', 'ceb', 'idlnchld', 'electric']]
# Same column as `endog`; not used again in this excerpt.
dep_var_iv = no_null_iv['agefbrth']
#
# Add a constant to both the regressors and the instruments.
exog_constant = sm.add_constant(exog)
instr_constant = sm.add_constant(instr)
no_endog_results = IV2SLS(endog, exog_constant, instrument = instr_constant).fit()
#
# The summary is not printed here; its value is only displayed when this
# runs as the last expression of an interactive/notebook cell.
no_endog_results.summary()
#
# `print_resids` is a helper defined outside this excerpt.
print_resids(no_endog_results.predict(), no_endog_results.resid)
#
print("the descriptive statistics for the errors and a histogram of them:\n\n", no_endog_results.resid.describe())
sns.distplot(no_endog_results.resid);
# Part 5: replicate using matrix algebra
# (`x_const` and `y` are defined earlier, outside this excerpt)
x_mat_ols = np.matrix(x_const)
y_mat_ols = np.matrix(y)
y_mat_ols = np.reshape(y_mat_ols, (-1, 1)) #reshape so that its a single column vector, not row vector
# OLS coefficients from the normal equations: (X'X)^-1 X'y
b_ols = np.linalg.inv(x_mat_ols.T*x_mat_ols)*x_mat_ols.T*y_mat_ols
print(b_ols)
#
y_iv_mat = np.matrix(endog)
Esempio n. 17
0
# model that *part* of the y_{t} - y_{t-1} is an independent endogenous variable
# To correct for this we would have to do the following
# NOTE: this excerpt uses Python 2 print statements.
# Column 1 of the first entry of macro_mod.wexog is taken as the
# instrumented y (`macro_mod` is built outside this excerpt).
y_instrumented = macro_mod.wexog[0][:, 1]
# Difference between the instrumented y and y with its last element dropped
whitened_ydiff = y_instrumented - y[:-1]
# Regressors for the second equation: tbilrate (from the second observation
# on) and the difference above, with a constant as the first column.
wexog = np.column_stack((macrodata['tbilrate'][1:], whitened_ydiff))
wexog = sm.add_constant(wexog, prepend=True)
correct_params = sm.GLS(macrodata['realinv'][1:], wexog).fit().params

print "If we correctly instrument everything, then these are the parameters"
print "for the second equation"
print correct_params
print "Compare to output of R script statsmodels/sandbox/tests/macrodata.s"

print '\nUsing IV2SLS'
from statsmodels.sandbox.regression.gmm import IV2SLS
# Each equation is estimated separately with the same `instruments`;
# `macro_sys` holds (endog, exog) pairs: entries 0/1 for equation 1 and
# 2/3 for equation 2.
miv = IV2SLS(macro_sys[0], macro_sys[1], instruments)
resiv = miv.fit()
print "equation 1"
print resiv.params
miv2 = IV2SLS(macro_sys[2], macro_sys[3], instruments)
resiv2 = miv2.fit()
print "equation 2"
print resiv2.params

### Below is the same example using Greene's data ###

run_greene = 0
if run_greene:
    try:
        data3 = np.genfromtxt('/home/skipper/school/MetricsII/Greene \
TableF5-1.txt',
def table8_9_ext5(df, name, GDP):
    """Build Stargazer tables of 2SLS regressions of six governance outcomes.

    Each regression explains one of 'voice', 'PolStab', 'GovEffec',
    'RegQual', 'RulLaw' and 'ConCorr' by ``{name}_C2`` and ``{name}_I``,
    using ``{name}_instrument_C2_thresh`` in place of ``{name}_C2`` in the
    instrument set. Rows with a missing value in any column used are
    dropped first.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``{name}_*`` columns, the control columns and
        the six outcome columns.
    name : str
        Prefix of the index columns. Only 'ethnicity' and 'language' get a
        title in ``GDP == 'GDP'`` mode.
    GDP : {'democ', 'GDP'}
        Selects the restricted sample and the return shape:
        'democ' keeps rows with ``democ >= 1``; 'GDP' keeps rows with
        ``lnGDP_pc >= 7``.

    Returns
    -------
    list of Stargazer or Stargazer
        For 'democ', a list of three tables (all controls/full sample,
        no controls/full sample, all controls/restricted sample).
        For 'GDP', only the table for the restricted sample.

    Raises
    ------
    ValueError
        If ``GDP`` is neither 'democ' nor 'GDP'.
    """
    # Fail early and clearly instead of hitting an unbound local later on.
    if GDP not in ('democ', 'GDP'):
        raise ValueError(f"GDP must be 'democ' or 'GDP', got {GDP!r}")

    segregation = f'{name}_C2'
    fractionalization = f'{name}_I'
    instrument = f'{name}_instrument_C2_thresh'
    controls = [
        'lnpopulation', 'lnGDP_pc', 'protestants', 'muslims', 'catholics',
        'latitude', 'LOEnglish', 'LOGerman', 'LOSocialist', 'LOScandin',
        'democ', 'mtnall'
    ]
    # Order matters: it must line up with `column_titles` below.
    outcomes = ['voice', 'PolStab', 'GovEffec', 'RegQual', 'RulLaw', 'ConCorr']
    column_titles = [
        'Voice', 'Political stability', 'Govern-t effectiv.',
        'Regul. quality', 'Rule of law', 'Control of corr'
    ]
    n_models = len(outcomes)

    # One common estimation sample: complete cases over every column used.
    full_sample = df[[segregation, fractionalization, instrument] + controls
                     + outcomes].dropna(axis=0)
    if GDP == 'democ':
        restricted_sample = full_sample[full_sample.democ >= 1]
    else:
        restricted_sample = full_sample[full_sample.lnGDP_pc >= 7]

    def build_panel(sample, with_controls):
        # Fit the six 2SLS models on `sample` and wrap them in one table.
        extra = controls if with_controls else []
        exog = sm.add_constant(
            sample[[segregation, fractionalization] + extra])
        instruments = sm.add_constant(
            sample[[instrument, fractionalization] + extra])
        fits = [
            IV2SLS(sample[outcome], exog, instruments).fit()
            for outcome in outcomes
        ]
        panel = Stargazer(fits)
        panel.covariate_order([segregation, fractionalization])
        panel.rename_covariates({
            # Raw string: keeps the LaTeX backslash without an invalid escape.
            segregation: r'Segregation $\hat{S}$ (' f'{name})',
            fractionalization: f'Fractionalization $F$ ({name})',
        })
        panel.show_model_numbers(False)
        panel.custom_columns(column_titles, [1] * n_models)
        return panel

    if GDP == 'GDP':
        # Only the restricted-sample panel is returned, so only it is built.
        panel = build_panel(restricted_sample, with_controls=True)
        if name == 'ethnicity':
            panel.title(
                'Panel A. Ethnicity: All controls; sample excludes poorest countries'
            )
        elif name == 'language':
            panel.title(
                'Panel B. Language: All controls; sample excludes poorest countries'
            )
        return panel

    panels = [
        build_panel(full_sample, with_controls=True),
        build_panel(full_sample, with_controls=False),
        build_panel(restricted_sample, with_controls=True),
    ]
    # (Controls label, Sample label, title) for panels A, B and C.
    annotations = [
        ('Yes', 'Full', 'Panel A. Baseline : All controls and full sample'),
        ('No', 'Full', 'Panel B. No controls and full sample'),
        ('Yes', 'Democ',
         'Panel C. All controls; sample excludes dictatorship'),
    ]
    for panel, (controls_label, sample_label, title) in zip(panels,
                                                            annotations):
        panel.add_line('Controls', [controls_label] * n_models)
        panel.add_line('Sample', [sample_label] * n_models)
        panel.title(title)

    return panels