def compproc(data, v):
    xdata, ydata = xyvars(data, v)
    abdata = absdiff(data, v, xdata, ydata)
    X = pd.get_dummies(data['round_age_x'], drop_first=True)
    X['Min_x'] = data['Min_x']
    X['Min_y'] = data['Min_y']
    X['traded'] = data['traded']
    X = sm.add_constant(X)
    absstr = v + '_absdiff'
    mod = PanelOLS(data[absstr], X, entity_effects=True)
    res = mod.fit()
    print(res)
    params = res.params
    tradecoeff = params.loc['traded']
    conf_int = res.conf_int()
    conf_int = conf_int.loc['traded']
    lowconf = conf_int.iloc[0]
    upconf = conf_int.iloc[1]
    absstrm = v + '_absmean'
    absstrsd = v + '_abssd'
    absmean = data[absstrm].mean()
    abssd = data[absstrsd].mean()
    return ([tradecoeff, lowconf, upconf, absmean, abssd])
def kfoldfun(y, X, k):
    rng = np.random.RandomState(seed=12345)
    s = 100
    seeds = np.arange(s)
    rng.shuffle(seeds)
    tot_error = 0
    rsqtot = 0
    for seed in seeds:
        cv = KFold(n_splits=k, shuffle=True, random_state=seed)
        for train_index, valid_index in cv.split(X, y):
            mod = PanelOLS(y.iloc[train_index],
                           X.iloc[train_index],
                           entity_effects=True)
            res = mod.fit(cov_type='clustered')
            pred = mod.predict(res.params, exog=X.iloc[valid_index])
            rsq = 1 - (((y.iloc[valid_index].to_numpy() -
                         pred.to_numpy().transpose())**2).sum()) / ((
                             (y.iloc[valid_index].to_numpy() -
                              y.iloc[valid_index].to_numpy().mean())**2).sum())
            MSPE = np.abs((y.iloc[valid_index].to_numpy() -
                           pred.to_numpy().transpose())).mean()
            tot_error = tot_error + MSPE
            rsqtot = rsqtot + rsq
    print("Mean Absolute Error:")
    print(tot_error / (s * k))
    print("OOS R^2")
    print(rsqtot / (s * k))
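# A hedged usage sketch for kfoldfun (not in the original source): build a
# small synthetic balanced panel with an entity/time MultiIndex, which is the
# structure PanelOLS expects, then cross-validate a one-regressor model.
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import KFold
from linearmodels import PanelOLS

idx = pd.MultiIndex.from_product([range(20), range(10)],
                                 names=['entity', 'time'])
rng_demo = np.random.default_rng(0)
X_demo = pd.DataFrame({'x1': rng_demo.normal(size=200)}, index=idx)
y_demo = 2.0 * X_demo['x1'] + pd.Series(rng_demo.normal(size=200), index=idx)
kfoldfun(y_demo, sm.add_constant(X_demo), k=5)  # prints MAE and OOS R^2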
    def fixedEffects(
        self,
        y,
        x,
        id,
        year,
        entity_Effects=False,
        time_Effects=False,
        cov_Type="clustered",
        cluster_Entity=True,
        clean_data="greedy",
    ):

        if not isinstance(x, str):
            utterance = (
                "ERROR: Multiple independent regressor approach not yet implemented."
            )
            return utterance

        s = self.map_column_to_sheet(y)

        # prepare data
        v = np.copy(x)
        v = np.append(v, y)
        df = s.cleanData(v, clean_data)

        # set up panel and return fit
        df = df.set_index([id, year])

        mod = PanelOLS(
            df[y], df[x], entity_effects=entity_Effects, time_effects=time_Effects
        )
        utterance = (
            "Here are the results of a fixed effects regression of "
            + str(y)
            + " on "
            + str(x)
        )
        utterance = (
            utterance
            + ", using "
            + str(year)
            + " as the time dimension and "
            + str(id)
            + " as the id dimension.\n\n"
        )
        res = mod.fit(cov_type=cov_Type, cluster_entity=cluster_Entity)
        utterance = utterance + str(res)

        return QueryResult(res, utterance)
def hausman_fe_re(panel_data, inef_formula, weights=None, cov="unadjusted", level=0.05):
    """
    Executes a Hausman test, whose null hypothesis (H0) is that there is no correlation between the unobserved effects and the regressors.
    It is not necessary to assign the function's result to an object! But remember to include an intercept in the formulas.

    :param panel_data : dataframe (which must be in a panel structure)
    :param inef_formula : patsy formula for the inefficient model under H0 (fixed effects)
    :param weights : N x 1 Series or vector containing weights to be used in estimation; defaults to None
        Use is recommended when analyzing survey data, passing on the weight available in the survey
    :param cov : str
        unadjusted: common standard errors
        robust: robust standard errors
        kernel: robust to heteroskedasticity AND serial autocorrelation
    :param level : significance level for the test. Defaults to 5%.
    """

    ## Random Effects
    if weights is None:
        random = RandomEffects.from_formula(formula=inef_formula, data=panel_data).fit(cov_type=cov)
    else:
        random = RandomEffects.from_formula(formula=inef_formula, data=panel_data, weights=weights).fit(cov_type=cov)

    ## Fixed Effects
    formula_fe = inef_formula + ' + EntityEffects'
    if weights is None:
        fixed = PanelOLS.from_formula(formula=formula_fe, data=panel_data, drop_absorbed=True).fit(cov_type=cov)
    else:
        fixed = PanelOLS.from_formula(formula=formula_fe, data=panel_data,
                                      drop_absorbed=True, weights=weights).fit(cov_type=cov)

    ## Computing the Hausman statistic
    # Difference between asymptotic variances
    var_assin = fixed.cov - random.cov
    # Difference between parameters
    d = fixed.params - random.params
    # Calculating H (statistic)
    H = d.dot(np.linalg.inv(var_assin)).dot(d)
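    # i.e. H = (b_FE - b_RE)' [Var(b_FE) - Var(b_RE)]^{-1} (b_FE - b_RE),
    # which is asymptotically chi-squared distributed under H0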
    # Degrees of freedom
    freedom = random.params.size - 1

    # Calculating p-value using chi2 survival function (sf, 1 - cumulative distribution function)
    p = stats.chi2(freedom).sf(H)

    print(f"The value of H is {round(H, 6)} with {freedom} degrees of freedom in the chi-squared distribution.")
    if p < level:
        print(f"The p-value of the test is {round(p, 6)}; therefore, H0 is REJECTED and fixed effects is preferred.")
    else:
        print(f"The p-value of the test is {round(p, 6)}; H0 is NOT REJECTED and random effects is preferred.")
Example #5
def POLS(data, y, xs, includeFixed=False, includeTime=False):
    # xs_str = ' + '.join(xs)
    # formula = f'{y} ~ {xs_str} + 1'
    # print(formula)
    # print(data['c10'].head())
    # if includeFixed:
    #     formula += '+ EntityEffects'
    # if includeTime:
    #     formula += '+ TimeEffects'
    # mod = PanelOLS.from_formula(formula, data=data)
    # if includeFixed:
    #     ori = mod.fit(cov_type='clustered', cluster_entity=True)
    # else:
    #     ori = mod.fit()
    # print("Formula:"+formula)
    # print(ori.params)
    # print(ori.pvalues)
    # print(ori.rsquared_overall)
    # print('\n')

    # data = data.dropna()
    print(xs)

    exog = sm.add_constant(data[xs])
    res = PanelOLS(data[y],
                   exog,
                   entity_effects=includeFixed,
                   time_effects=includeTime).fit()
    return res
Example #6
    def panel_regression(self, X, y, entity_col, time_col, entity_effects=False, time_effects=False, other_effects=None, add_const=True, drop_absorbed=True):
        """
        other_effects (array-like) – Category codes to use for any effects that are not entity or time effects. Each variable is treated as an effect
        Returns the fitted results.
        """

        X = X.set_index([entity_col, time_col])
        y.index = X.index
        if add_const:
            X = sm.add_constant(X)
        if other_effects is None:
            mod = PanelOLS(y, X, entity_effects=entity_effects, time_effects=time_effects)
        else:
            mod = PanelOLS(y, X, entity_effects=entity_effects, time_effects=time_effects, other_effects=X[other_effects])
        res = mod.fit()
        print(res.summary)
        return res
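# A hedged usage sketch (not in the original source): panel_regression never
# touches self, so it is called here with self=None on the Grunfeld data,
# treating the method as a plain function purely for illustration.
import statsmodels.api as sm
from linearmodels import PanelOLS
from statsmodels.datasets import grunfeld

raw = grunfeld.load_pandas().data
raw['year'] = raw['year'].astype(int)
demo_res = panel_regression(None, raw[['firm', 'year', 'value', 'capital']],
                            raw['invest'].copy(), entity_col='firm',
                            time_col='year', entity_effects=True)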
def run_regression(
    salesWithFlags,
    use_features=None,
    entity_effects=True,
    time_effects=True,
    cov_type="clustered",
    cluster_entity=True,
):
    """
    Run a panel regression on the input sales data.

    Parameters
    ----------
    salesWithFlags : pandas.DataFrame
        the sales data with any interaction flags already added
    use_features : list of str, optional
        if specified, only include these property characteristics in the regression
    entity_effects : bool, optional
        include neighborhood fixed effects
    time_effects : bool, optional
        include year fixed effects
    cov_type : str, optional
        the covariance type to use
    cluster_entity : bool, optional
        if using clustered errors, cluster at the neighborhood level
    """
    from linearmodels import PanelOLS

    # get the modeling inputs
    X, Y = get_modeling_inputs(salesWithFlags,
                               dropna=False,
                               as_panel=True,
                               use_features=use_features)

    # initialize the panel regression
    mod = PanelOLS(Y,
                   X,
                   entity_effects=entity_effects,
                   time_effects=time_effects)

    # return the regression result
    return mod.fit(cov_type=cov_type, cluster_entity=cluster_entity)
def Reg_Painel_Efeitos_Fixos(x, y, constante="S", cov='normal'):
    '''
    Computes a fixed-effects regression which is, by default, estimated with an intercept and with non-robust standard errors.
    **IMPORTANT: for the panel to be tidy, the data must be multi-indexed by individual and by time, in that order.
    Otherwise, transform the dataframe using the 'Arrumar Painel' function.
    x: list or array with the values of the independent variables;
    y: list or array with the values of the dependent variable;
    constante: "S" for a regression with an intercept, any other value for one without. If left blank, the regression is computed with an intercept;
    cov: "normal" for conventional standard errors (default);
        "robust" for robust standard errors;
        "kernel" for standard errors robust to heteroskedasticity and serial autocorrelation;
        "cluster" or "clustered" for clustered standard errors.
    '''
    global df, Resultado

    # building the vector of independent variables
    if constante == "S":
        X = sm.add_constant(x)
    else:
        X = x

    # Creating the model, taking the standard-error option into account
    Modelo = PanelOLS(y, X, entity_effects=True, drop_absorbed=True)

    if cov == "robust":
        Resultado = Modelo.fit(cov_type='robust')
    elif cov == 'kernel':  ## robust to heteroskedasticity and serial autocorrelation
        Resultado = Modelo.fit(cov_type='kernel')
    elif cov == 'clustered' or cov == 'cluster':
        Resultado = Modelo.fit(cov_type='clustered', cluster_entity=True)
    else:
        Resultado = Modelo.fit()

    print(Resultado)
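# An illustrative call (not in the original source), using the Grunfeld panel,
# which is multi-indexed by individual and time as the docstring requires.
from statsmodels.datasets import grunfeld

df = grunfeld.load_pandas().data
df['year'] = df['year'].astype(int)
df = df.set_index(['firm', 'year'])
Reg_Painel_Efeitos_Fixos(df[['value', 'capital']], df['invest'],
                         constante="S", cov='clustered')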
def fixed_effects(panel_data, formula, weights=None, time_effects=False, cov="unadjusted"):
    """
    Fits a standard Fixed Effects model with the corresponding covariance matrix.
    It can be estimated WITH and WITHOUT a constant.
    It is preferred when the unobserved effects are correlated with the regressors;
    note that the within transformation absorbs time-invariant terms, so their
    coefficients cannot be estimated.
    Remember to include an intercept in the formula ('y ~ 1 + x1 + ...') and to assign the result to an object!

    :param panel_data : dataframe (which must be in a panel structure)
    :param formula : patsy/R formula (without EntityEffects, which is added inside the function)
    :param weights : N x 1 Series or vector containing weights to be used in estimation; defaults to None
        Use is recommended when analyzing survey data, passing on the weight available in the survey
    :param time_effects : bool, defaults to False
        Whether to include time effects alongside entity effects (and estimate a two-way fixed effects)
    :param cov : str
        unadjusted: conventional standard errors
        robust: robust standard errors
        kernel: robust to heteroskedasticity AND serial autocorrelation
        clustered: clustered standard errors by the entity column
    :return : linearmodels fitted results instance
    """

    # Defining which effects to control for
    formula += ' + EntityEffects + TimeEffects' if time_effects else ' + EntityEffects'

    ## Creating model instance
    if weights is None:
        mod = PanelOLS.from_formula(formula=formula, data=panel_data, drop_absorbed=True)
    else:
        mod = PanelOLS.from_formula(formula=formula, data=panel_data, drop_absorbed=True, weights=weights)

    ## Fitting with desired covariance matrix
    mod = mod.fit(cov_type='clustered', cluster_entity=True) if cov == 'clustered' else mod.fit(cov_type=cov)

    print(mod.summary)
    return mod
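# A minimal sketch of calling fixed_effects (illustrative data, not from the
# original source): two-way fixed effects with clustered standard errors.
from statsmodels.datasets import grunfeld

panel = grunfeld.load_pandas().data
panel['year'] = panel['year'].astype(int)
panel = panel.set_index(['firm', 'year'])
fe_res = fixed_effects(panel, 'invest ~ 1 + value + capital',
                       time_effects=True, cov='clustered')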
Example #10
def one_step_panel_fit(data):
    """
    Without entity or time effects, a panel regression is exactly the same
    as a pooled regression: all coefficient estimates are identical.
    """

    fit = PanelOLS(
        data['ret'], data[[
            'const', 'market_cap', 'pe', 'pe_lyr', 'pb', 'ps', 'pcf',
            'turnover'
        ]]).fit()

    logger.info("Panel Regression")
    logger.info(fit)
    resid = fit.resids
    logger.info("Residual auto correlation")
    logger.info(
        format_for_print(
            pd.DataFrame(
                [resid.autocorr(1),
                 resid.autocorr(5),
                 resid.autocorr(20)])))

    return resid
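# The docstring's claim above -- PanelOLS with no effects reproduces pooled
# OLS -- can be checked with a sketch like this (synthetic data, not from the
# original source).
import numpy as np
import pandas as pd
import statsmodels.api as sm
from linearmodels import PanelOLS, PooledOLS

rng_chk = np.random.default_rng(1)
midx = pd.MultiIndex.from_product([range(5), range(8)], names=['id', 't'])
chk = pd.DataFrame({'x': rng_chk.normal(size=40)}, index=midx)
chk['ret'] = 0.5 * chk['x'] + rng_chk.normal(size=40)
exog_chk = sm.add_constant(chk[['x']])

a = PanelOLS(chk['ret'], exog_chk).fit()   # no effects
b = PooledOLS(chk['ret'], exog_chk).fit()  # pooled OLS
print(np.allclose(a.params, b.params))     # True: identical coefficients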
Example #11
def run_regression(df):
    df = df.set_index(['county_id', 'year'])
    model = PanelOLS.from_formula('chips_sold ~ 1 + post_tv + EntityEffects + TimeEffects', data = df)
    fit = model.fit()
    
    return fit
def estimate_profiles(graphs=False):
    '''
    Function to estimate deterministic lifecycle profiles of hourly
    earnings.  Follows methodology of Fullerton and Rogers (1993).

    Args:
        graphs (bool): whether to create graphs of profiles

    Returns:
        reg_results (Pandas DataFrame): regression model coefficients
            for lifetime earnings profiles

    '''
    # Read in dataframe of PSID data
    df = ogusa.utils.safe_read_pickle(
        os.path.join(cur_path, 'data', 'PSID', 'psid_lifetime_income.pkl'))

    model_results = {
        'Names': [
            'Constant', '', 'Head Age', '', 'Head Age^2', '', 'Head Age^3', '',
            'R-Squared', 'Observations'
        ]
    }
    cats_pct = ['0-25', '26-50', '51-70', '71-80', '81-90', '91-99', '100']
    long_model_results = {
        'Lifetime Income Group': [],
        'Constant': [],
        'Age': [],
        'Age^2': [],
        'Age^3': [],
        'Observations': []
    }
    for i, group in enumerate(cats_pct):
        data = df[df[group] == 1].copy()
        data['ones'] = np.ones(len(data.index))
        mod = PanelOLS(data.ln_earn_rate, data[['ones', 'age', 'age2',
                                                'age3']])
        res = mod.fit(cov_type='clustered', cluster_entity=True)
        # print('Summary for lifetime income group ', group)
        # print(res.summary)
        # Save model results to dictionary
        model_results[group] = [
            res.params['ones'], res.std_errors['ones'], res.params['age'],
            res.std_errors['age'], res.params['age2'], res.std_errors['age2'],
            res.params['age3'], res.std_errors['age3'], res.rsquared, res.nobs
        ]
        long_model_results['Lifetime Income Group'].extend([cats_pct[i], ''])
        long_model_results['Constant'].extend(
            [res.params['ones'], res.std_errors['ones']])
        long_model_results['Age'].extend(
            [res.params['age'], res.std_errors['age']])
        long_model_results['Age^2'].extend(
            [res.params['age2'], res.std_errors['age2']])
        long_model_results['Age^3'].extend(
            [res.params['age3'], res.std_errors['age3']])
        long_model_results['Observations'].extend([res.nobs, ''])

    reg_results = pd.DataFrame.from_dict(model_results)
    reg_results.to_csv(
        os.path.join(output_dir, 'DeterministicProfileRegResults.csv'))
    long_reg_results = pd.DataFrame.from_dict(long_model_results)
    long_reg_results.to_csv(
        os.path.join(output_dir, 'DeterministicProfileRegResults_long.csv'))

    if graphs:
        # Plot lifecycles of hourly earnings from processes estimated above
        age_vec = np.arange(20, 81, step=1)
        for i, group in enumerate(cats_pct):
            earn_profile = (model_results[group][0] +
                            model_results[group][2] * age_vec +
                            model_results[group][4] * age_vec**2 +
                            model_results[group][6] * age_vec**3)
            plt.plot(age_vec, earn_profile, label=group)
        plt.title(
            'Estimated Lifecycle Earnings Profiles by Lifetime Income Group')
        plt.legend()

        plt.savefig(os.path.join(output_dir,
                                 'lifecycle_earnings_profiles.png'))

        # Plot of lifecycles of hourly earnings from processes from data
        pd.pivot_table(df,
                       values='ln_earn_rate',
                       index='age',
                       columns='li_group',
                       aggfunc='mean').plot(legend=True)
        plt.title(
            'Empirical Lifecycle Earnings Profiles by Lifetime Income Group')

        plt.savefig(
            os.path.join(output_dir, 'lifecycle_earnings_profiles_data.png'))

        # Plot of lifecycle profiles of hours by lifetime income group
        # create variable from fraction of time endowment work
        df['labor_supply'] = (df['earnhours_hh'] / (24 * 5 *
                                                    (df['married'] + 1) * 50))
        pd.pivot_table(df,
                       values='labor_supply',
                       index='age',
                       columns='li_group',
                       aggfunc='mean').plot(legend=True)
        plt.title('Lifecycle Profiles of Hours by Lifetime Income Group')

        plt.savefig(os.path.join(output_dir, 'lifecycle_laborsupply.png'))

    return reg_results
Example #13
gaps.loc[gaps.index, "indcom4"] = 0
gaps.loc[gaps.t == 4, "indcom4"] = 1

gaps.loc[gaps.index, "indcom6"] = 0
gaps.loc[gaps.t == 6, "indcom6"] = 1

gaps = gaps.loc[~gaps.State.isin([
    "Alaska", "Delaware", "Montana", "North Dakota", "South Dakota", "Vermont",
    "Wyoming"
])]
gaps.set_index(["State", "Year"], inplace=True)

gaps["gap"] = gaps["gap"].abs()

model = PanelOLS.from_formula(
    'gap ~ 1 + indcom_4 + indcom_2 + indcom + indcom2 + indcom4 + indcom6',
    data=gaps)

print(model.fit(cov_type="robust"))

###########
###STATE###
###########

starts = pd.read_excel(
    "/home/matt/GitRepos/ElectionData/data/Independent_Commission_Start.xlsx",
    "Sheet1",
    skipfooter=2)
starts["time"] = 1

gaps = get_efficiency_gap("federal")[['State', 'Year', 'gap']]
Example #14
#%%
import numpy as np
from statsmodels.datasets import grunfeld

data = grunfeld.load_pandas().data
data.year = data.year.astype(np.int64)
# MultiIndex, entity - time
data = data.set_index(['firm', 'year'])
from linearmodels import PanelOLS
mod = PanelOLS(data.invest, data[['value', 'capital']], entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_entity=True)

#%%
from linearmodels import PanelOLS
mod = PanelOLS.from_formula('invest ~ value + capital + EntityEffects', data)
res = mod.fit(cov_type='clustered', cluster_entity=True)

#%%
# Create indicator variables for Difference in Difference
conflict["PostConflict"] = conflict['Year'].apply(lambda x: 1
                                                  if x >= 2014 else 0)
conflict['Treated'] = conflict['intensity'].apply(lambda x: 1 if x > 1 else 0)

# Conduct base Difference in Difference
BaseModel = smf.ols("Pop_percent_change ~ Treated * PostConflict ",
                    data=conflict).fit()
print(BaseModel.summary())

# Difference in Difference with Confounding Factors
CFModel = smf.ols(
    "Pop_percent_change ~ Treated * PostConflict + Hospitals + Population_Percent_Child + Population_Percent_Female + Poverty_Rate + Airport",
    data=conflict).fit()
print(CFModel.summary())

# Difference in Difference by County
CountyModel = smf.ols(
    "Pop_percent_change ~ C(County) + Treated * PostConflict",
    data=conflict).fit()
print(CountyModel.summary())

# Panel OLS
conflict = conflict.set_index(['County', 'Year'])
PanelModel = PanelOLS.from_formula(
    'Pop_percent_change ~ Treated * PostConflict + EntityEffects',
    data=conflict,
    drop_absorbed=True)
did_res = PanelModel.fit(cov_type='clustered', cluster_entity=True)
print(did_res)
Example #16
    ],
    'Single Males': [],
    'Single Females': [],
    'Married, Male Head': [],
    'Married, Female Head': []
}
for i, data in enumerate(list_of_dfs):
    # Note that including entity and time effects leads to a collinearity
    # I think this is because there are some years at begin and end of
    # sample with just one person
    # mod = PanelOLS(data.ln_wage_rate,
    #                data[['age', 'age2', 'age3']],
    #                weights=data.fam_smpl_wgt_core,
    #                entity_effects=True, time_effects=True)
    mod = PanelOLS(data.ln_wage_rate,
                   data[['age', 'age2', 'age3']],
                   entity_effects=True)
    res = mod.fit(cov_type='clustered', cluster_entity=True)
    print('Summary for ', list_of_statuses[i])
    print(res.summary)
    # Save model results to dictionary
    first_stage_model_results[list_of_statuses[i]] = [
        res.params['age'], res.std_errors['age'], res.params['age2'],
        res.std_errors['age2'], res.params['age3'], res.std_errors['age3'],
        res.rsquared, res.nobs, res.entity_info['total']
    ]
    fit_values = res.predict(fitted=True, effects=True, missing=True)
    fit_values['predictions'] = (fit_values['fitted_values'] +
                                 fit_values['estimated_effects'])
    list_of_dfs_with_fitted_vals.append(
        data.join(fit_values, how='left', on=['hh_id', 'year']))
Example #17
            if cluster_type in ('random', 'other-random', 'entity-nested',
                                'random-nested'):
                clusters = y.copy()
                if cluster_type == 'random':
                    clusters.dataframe.iloc[:, :] = random_effects
                elif cluster_type == 'other-random':
                    clusters.dataframe.iloc[:, :] = other_random
                elif cluster_type == 'entity-nested':
                    eid = y.entity_ids
                    clusters.dataframe.iloc[:, :] = eid // 3
                elif cluster_type == 'random-nested':
                    clusters.dataframe.iloc[:, :] = random_effects // 2
                fo['clusters'] = clusters

            mod = PanelOLS(data.y, data.x, **mo)
            res = mod.fit(**fo)
            res2 = mod.fit(auto_df=False, count_effects=False, **fo)
            res3 = mod.fit(auto_df=False, count_effects=True, **fo)
            res4 = mod.fit(cov_type='unadjusted')
            res5 = mod.fit(cov_type='unadjusted',
                           auto_df=False,
                           count_effects=False)
            res6 = mod.fit(cov_type='unadjusted',
                           auto_df=False,
                           count_effects=True)

            vals[b] = np.column_stack([
                res.params, res.std_errors, res2.std_errors, res3.std_errors,
                res4.std_errors, res5.std_errors, res6.std_errors
            ])
dfProvince['Ml2_cat1'] = dfProvince['mosquito_lag2'] < q1
dfProvince['Ml2_cat2'] = (dfProvince['mosquito_lag2'] >
                          q1) & (dfProvince['mosquito_lag2'] < q2)
dfProvince['Ml2_cat3'] = (dfProvince['mosquito_lag2'] >
                          q2) & (dfProvince['mosquito_lag2'] < q3)
dfProvince['Ml2_cat4'] = dfProvince['mosquito_lag2'] > q3

#take log of dengue and add its lag in data frame
dfProvince['log_dengue'] = np.log(dfProvince['Dengue'] + 1)
dfProvince['lag_log_dengue'] = dfProvince['log_dengue'].shift(1)

#-------------------- Model ---------------------------------------------
#first specification
X_spec1 = sm.add_constant(
    dfProvince.loc[:, ['lag_log_dengue', 'M_cat2', 'M_cat3', 'M_cat4']])
mod1 = PanelOLS(dfProvince['log_dengue'], X_spec1, entity_effects=True)
res1 = mod1.fit(cov_type='clustered')

print(res1)

#second specification with kfold
X_spec2 = sm.add_constant(dfProvince.loc[:, [
    'M_cat2', 'M_cat3', 'M_cat4', 'Ml_cat2', 'Ml_cat3', 'Ml_cat4', 'Ml2_cat2',
    'Ml2_cat3', 'Ml2_cat4'
]])
mod2 = PanelOLS(dfProvince['log_dengue'], X_spec2, entity_effects=True)
res2 = mod2.fit(cov_type='clustered')
print(res2)

#third specification with kfold
X_spec3 = sm.add_constant(dfProvince.loc[:, [
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm
from studies.age_structure.commons import *
from linearmodels import PanelOLS

# dcons = average daily consumption per household
# rc    = percent change in daily consumption per household relative to 2019m6
df = pd.read_stata("data/datareg.dta")

index = ["districtnum", "month_code"]
rc = df[index + ["rc"]].set_index(index)

exog_cols = ["I_cat", "D_cat", "I_cat_national", "D_cat_national"]
for col in exog_cols:
    df[col] = pd.Categorical(df[col])

exog = df[index + exog_cols].set_index(index)

res = PanelOLS(rc, exog, entity_effects=True).fit()
print(res)
def Panel_output(endo, exog):
    X = sm.add_constant(df.loc[:, exog])
    mod = PanelOLS(df['log_dengue'], X, entity_effects=True)
    res = mod.fit(cov_type='clustered')
    print(res)
    return (res.loglik, exog, res)
                           col='country',
                           hue='country',
                           col_wrap=4,
                           palette="deep")
SRPC_other = SRPC_other.map(plt.plot, 'unemployment',
                            'inflation').set_titles("{col_name}")

######## e. Panel data regression analysis #######

#Panel data regression for full sample
merge_eu = merge_eu.reset_index()
year_full = pd.Categorical(merge_eu.year)
merge_eu = merge_eu.set_index(['country', 'year'])
merge_eu['year'] = year_full
regression1 = PanelOLS(merge_eu.inflation,
                       merge_eu.unemployment,
                       entity_effects=True)
res1 = regression1.fit(cov_type='clustered', cluster_entity=True)
print(res1)

# Panel data regression for data after QE
after_QE = after_QE.reset_index()
year_QE = pd.Categorical(after_QE.year)
after_QE = after_QE.set_index(['country', 'year'])
after_QE['year'] = year_QE
regression2 = PanelOLS(after_QE.inflation,
                       after_QE.unemployment,
                       entity_effects=True)
res2 = regression2.fit(cov_type='clustered', cluster_entity=True)
print(res2)
BetweenModel = BetweenOLS.from_formula('fcs ~ rev_percap + month_Decembre',
                                       data=data,
                                       weights=w)
BetweenModel.fit(cov_type='robust', reweight=True)

# RANDOM EFFECTS
RandomEffectsModel = RandomEffects.from_formula(
    'fcs ~ rev_percap + year + month_Decembre', data=data, weights=w)
REModFit = RandomEffectsModel.fit(cov_type='robust')
REModFit
REModFit.variance_decomposition
REModFit.theta

# BASIC PANEL
PanelModel = PanelOLS.from_formula(
    'fcs ~ 1 + rev_percap + month_Decembre + EntityEffects',
    data=data,
    weights=w)
PanelModel.fit(cov_type='robust')

# INTERPRETATION : TO BE FULLY CHECKED
# an increase of 1000 in income relative to its mean over the period
# raises the fcs score by X relative to its mean over the period

#
# ESTIMATION EXCLUDING DECEMBER
#

datajun = data[data['month'].isin(['Juin'])].reset_index(drop=False)
datajun = datajun.drop(columns=['time'])
time_df = datajun[['year', 'month']].drop_duplicates()
time_df = time_df.sort_values('month', ascending=False).sort_values('year')
Example #23
## X
x_list = ['ls_num', 'lti', 'ln_loanamout', 'ln_appincome', 'subprime', 'secured', \
               'cb', 'ln_ta', 'ln_emp', 'num_branch', 'ln_pop', 'density', 'hhi', 'ln_mfi',\
               'mean_distance']
x = df[x_list]
'''
x_msat_list = x_list + ['dum_msat_{}'.format(i) for i in range(dum_msat.shape[1])]
x_msat = sm.add_constant(df[x_msat_list])         
'''
#------------------------------------------------------------
# Run regression
#------------------------------------------------------------

# Run no dum
res_nd = PanelOLS(y, x).fit(cov_type='clustered', cluster_entity=True)

## Save output to txt
with open("Results/Results_baseline_nodummy.txt", "w") as text_file:
    text_file.write(res_nd.summary.as_text())

# Run dum_t
res_t = PanelOLS(y, x, entity_effects=True,
                 time_effects=True).fit(cov_type='clustered',
                                        cluster_entity=True)

## Save output to txt
with open("Results/Results_baseline_t.txt", "w") as text_file:
    text_file.write(res_t.summary.as_text())
Example #24
def panel_data(train, years_ahead=1):
    """
    Uses a random forest trained on the observed values of a data matrix
    (selected series codes except those in submit_rows_index) to impute the
    missing values; after that, a panel data model is used for prediction.

    Returns:
      Ypred: predicted values of the targets
    """
    train_melt = pd.melt(train.iloc[:, 0:38],
                         id_vars=['Country Name', 'Series Code'],
                         value_vars=train.columns[0:36],
                         var_name='year',
                         value_name='value')
    train_melt['year'] = train_melt['year'].str[:4].astype(int)
    panel = train_melt.groupby(['Country Name', 'year',
                                'Series Code'])['value'].mean().unstack()

    # only use code with at least one observed value across 36 years in each country for the imputation data matrix
    left_feature = panel.iloc[:, 9:].isna().groupby('Country Name').sum().max(
        axis=0) <= 18
    pred = panel.iloc[:, 9:].iloc[:, left_feature.values]

    # construct matrix of features across countries
    df = []
    ct_list = list(set(pred.index.get_level_values(0)))
    ct_list = sorted(ct_list)
    for i in ct_list:
        df.append(pred.loc[i])
    predictors = pd.concat(df, axis=1)

    # random forest imputation
    imputer = MissForest()
    predictors_imputed = imputer.fit_transform(predictors)

    panel.reset_index(inplace=True)
    panel.columns = ['Country Name', 'year'] + [
        'y' + str(i) for i in range(1, 10)
    ] + ['x' + str(i) for i in range(1, 1297)]
    nfeature = int(predictors.shape[1] / 214)
    split = list(range(nfeature, predictors_imputed.shape[1], nfeature))
    _ = np.split(predictors_imputed, split, 1)
    predictors_new = pd.DataFrame(np.vstack(_))
    predictors_new['year'] = panel.year
    predictors_new['Country Name'] = panel['Country Name']
    predictors_new.columns = [
        'x' + str(i) for i in range(1, pred.shape[1] + 1)
    ] + ['year', 'Country Name']

    # combine the updated feature matrix and responses
    feature = predictors_new.isna().sum() <= 0  # change to 1
    panel_left = predictors_new.iloc[:, feature.values]
    panel_comb = pd.merge(panel.iloc[:, 0:11], panel_left.shift(years_ahead))

    # Split prediction and target
    panel_train = panel_comb.loc[panel_comb.year < 2007]
    panel_train = panel_train.set_index(['Country Name', 'year'])
    panel_test = panel_comb.loc[panel_comb.year == 2007]
    panel_test = panel_test.set_index(['Country Name', 'year'])

    # panel data model
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        Ypred = pd.DataFrame()
        for i in range(1, 10):
            formula = 'y' + str(i) + '~1+' + '+'.join(
                panel_train.columns[11:].values) + '+EntityEffects'
            mod = PanelOLS.from_formula(formula, panel_train)
            res = mod.fit(cov_type='clustered', cluster_entity=True)
            Ypred['y' + str(i)] = res.predict(data=panel_test).predictions

    # Eval
    Yval = panel_test.iloc[:, :9]
    rmse = np.sqrt(np.nanmean(np.power(Ypred - Yval, 2)))
    print(rmse)

    return Ypred
Example #25
import os
import sys

import numpy as np
import pandas as pd
from statsmodels.iolib.summary2 import summary_col
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from linearmodels import PanelOLS
from linearmodels import RandomEffects

if __name__ == "__main__":
    REG_DATA = sys.argv[1]
    RES3_PATH = sys.argv[2]

    metadata = pd.read_csv(REG_DATA)
    metadata = metadata.sort_values(by=['Code', 'Year'])
    metadata = metadata.set_index(['Code', 'Year'])
    metadata['Income_t0_log'] = np.log10(metadata['Income_t0'])

    base = os.path.basename(RES3_PATH)
    incomegroup = base.split(".")[0].split("_")[-1]
    metadata = metadata[metadata.IncomeGroup == incomegroup]
    metadata = metadata.dropna()

    num_period = len(metadata['period'].unique())
    metadata = metadata[metadata['size'] == num_period]

    exog_vars = ['ECI', 'Income_t0_log', 'diversity']
    exog = sm.add_constant(metadata[exog_vars])
    mod = PanelOLS(metadata.growth, exog, entity_effects=True)
    with open(RES3_PATH, 'w') as f:
        f.write(mod.fit().summary.as_text())
Example #26
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
from linearmodels import PanelOLS

#import data
data = pd.read_csv("fraserDataWithRGDPPC.csv",
                   index_col=[0, 1],
                   parse_dates=True)

# create list of each index set from multi index
years = list(sorted(set(data.index.get_level_values('Year'))))
country = list(sorted(set(data.index.get_level_values('ISO_Code'))))
#choose variables that will be plotted for each year in scatter
plot_vars = [
    "Sound Money", "Government Consumption", "RGDP Per Capita", "Quartile"
]

# Normalize income so that 1 represents the maximum value of RGDP Per Capita
# This will allow dot to be easily adjusted
data["RGDP Per Capita"] = data["RGDP Per Capita"] / max(
    data["RGDP Per Capita"]) * 1000

# Panel OLS
reg_data = data[[
    "RGDP Per Capita", "Sound Money", "Government Consumption", "SUMMARY INDEX"
]].dropna()
x = reg_data[["Sound Money", "Government Consumption", "SUMMARY INDEX"]]
y = reg_data[["RGDP Per Capita"]]
mod = PanelOLS(y, x, entity_effects=True, time_effects=False)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res.summary)
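# column names are Chinese: 省份 = province, 日期 = date, 新增确诊 = new confirmed cases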
merge_tot = merge_tot.set_index(["省份", "日期"])
merge_norm = (merge_tot - merge_tot.mean()) / (merge_tot.max() -
                                               merge_tot.min())
print(merge_norm)
from linearmodels import PanelOLS
y = merge_norm[["新增确诊"]]
x = merge_norm[[
    "_1traffic",
    "_2traffic",
    "_3traffic",
    "traffic",
    "traffic3_",
    "traffic2_",
    "traffic1_",
]]  #change here
reg = PanelOLS(y, x, entity_effects=True, time_effects=True)
res = reg.fit(cov_type='clustered', cluster_entity=True)
print(res)
parameters = [0.0433, 0.0231, 0.0075, 0.0176, 0.0053, 0.0034, 0.0086]
xline = [-3, -2, -1, 0, 1, 2, 3]
lower = [-0.0151, 0.0040, 0.0075, 0.0007, -0.0108, -0.0162, -0.017]
upper = [0.1017, 0.0422, 0.0075, 0.0346, 0.0214, 0.0229, 0.0342]
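# rescale so the coefficient at t = -1 (0.0075, apparently the omitted baseline period) equals 1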
for i in range(len(parameters)):
    parameters[i] /= 0.0075
    lower[i] /= 0.0075
    upper[i] /= 0.0075
import matplotlib.pyplot as plt
plt.plot(xline, parameters, marker="*", color="black")
for i in range(len(xline)):
    plt.vlines(x=xline[i], ymin=lower[i], ymax=upper[i], label="*")
plt.vlines(x=0, ymin=-2, ymax=6.0, linestyles="dashed")
# Set vars
## Y
y = 'log_min_distance'

## X
x_list = ['ls_num', 'lti', 'ln_loanamout', 'ln_appincome', 'subprime', 'secured', \
               'cb', 'ln_ta', 'ln_emp', 'num_branch', 'ln_pop', 'density', 'hhi', 'ln_mfi',\
               'mean_distance']
x = ' + '.join(x_list)

#------------------------------------------------------------
# Run regressions
#------------------------------------------------------------

# Run Bank + msat dummies
res_msat = PanelOLS.from_formula('{} ~ {}'.format(y,x), data = df_msat).fit(cov_type = 'clustered', cluster_entity = True)

## Save output to txt
with open("Results/Results_baseline_msat.txt", "w") as text_file:
    text_file.write(res_msat.summary.as_text())

# Run Bankmsa + t dummies
res_msabank = PanelOLS.from_formula('{} ~ {}'.format(y,x), data = df_msabank).fit(cov_type = 'clustered', cluster_entity = True)

## Save output to txt
with open("Results/Results_baseline_msabank.txt", "w") as text_file:
    text_file.write(res_msabank.summary.as_text())

# Run Bank + t dummies
Example #29
    def fit(self,
            X,
            y,
            entity_effects=True,
            weekday_effects=True,
            cov_type='clustered'):
        """
        Parameters
        ----------
        X : Pandas DataFrame
            Panel DataFrame of entities observed at multiple points in time.
        y : str
            Column to be used as regression target.
        entity_effects : bool, default True
            If True, include entity fixed effects into the model. If False,
            the estimation procedure is equivalent to pooled OLS.
        weekday_effects : bool, default True
            If True, include a dummy for each day of the week. Due to the
            large variance in activity features between weekdays, for certain
            situations this is highly recommended.
        cov_type : str, default 'clustered'
            Covariance matrix structure. Must be one of 'clustered', 'robust'.
            Note if entity_effects is set to True, robust standard errors are
            no longer robust.
        Returns
        -------
        self.regression_results_ : linearmodels.panel.results.PanelEffectsResults
            Summary of estimation results.
        """
        self._depvar_label = ' '.join([w.capitalize() for w in y.split('_')])
        idx_cols = [self.entity_col, self.time_col]
        relative_idx = ((X[self.time_col] - X[self.event_col]) /
                        dt.timedelta(days=1)).astype(int)

        dummies = onehot_integer_series(relative_idx)
        # Add in dummy variables for observation distance to event
        X = pd.concat([X[[self.entity_col, self.time_col, y]], dummies],
                      axis=1)

        # Set our estimation target
        indvars = list(dummies.columns)

        if weekday_effects:
            X['day_of_week'] = X[self.time_col].dt.strftime('%A')
            indvars = indvars + ['day_of_week']

        X.set_index(idx_cols, inplace=True)
        X.sort_index(inplace=True)

        depvar = X[y]

        model = PanelOLS(dependent=depvar,
                         exog=X[indvars],
                         entity_effects=entity_effects)
        self.regression_results_ = model.fit(cov_type=cov_type)

        # Extract point estimates
        coefs = self.regression_results_.params.reset_index()
        coefs = coefs[coefs['index'].str.contains('relative_idx')]
        coefs['index'] = coefs['index'].apply(self.parse_dummies)
        coefs.sort_values('index', inplace=True)
        self._idx_coefs = coefs.rename(columns={
            'index': 'relative_idx'
        }).set_index('relative_idx')

        # Extract integer index, we can just use the coef index since cis are the same indexing
        self._event_relative_idx = coefs['index'].values

        # Extract confidence intervals
        cis = self.regression_results_.conf_int().reset_index()
        cis = cis[cis['index'].str.contains('relative_idx')]
        cis['index'] = cis['index'].apply(self.parse_dummies)
        cis.sort_values('index', inplace=True)
        self._idx_cis = cis.rename(columns={
            'index': 'relative_idx'
        }).set_index('relative_idx')

        return self.regression_results_
Example #30
def estimate_profiles(graphs=False):
    """
    Function to estimate deterministic lifecycle profiles of hourly
    earnings.  Follows methodology of Fullerton and Rogers (1993).

    Args:
        graphs (bool): whether to create graphs of profiles

    Returns:
        reg_results (Pandas DataFrame): regression model coefficients
            for lifetime earnings profiles

    """
    # Read in dataframe of PSID data
    df = ogusa.utils.safe_read_pickle(
        os.path.join(cur_path, "data", "PSID", "psid_lifetime_income.pkl"))

    model_results = {
        "Names": [
            "Constant",
            "",
            "Head Age",
            "",
            "Head Age^2",
            "",
            "Head Age^3",
            "",
            "R-Squared",
            "Observations",
        ]
    }
    cats_pct = ["0-25", "26-50", "51-70", "71-80", "81-90", "91-99", "100"]
    long_model_results = {
        "Lifetime Income Group": [],
        "Constant": [],
        "Age": [],
        "Age^2": [],
        "Age^3": [],
        "Observations": [],
    }
    for i, group in enumerate(cats_pct):
        data = df[df[group] == 1].copy()
        data["ones"] = np.ones(len(data.index))
        mod = PanelOLS(data.ln_earn_rate, data[["ones", "age", "age2",
                                                "age3"]])
        res = mod.fit(cov_type="clustered", cluster_entity=True)
        # print('Summary for lifetime income group ', group)
        # print(res.summary)
        # Save model results to dictionary
        model_results[group] = [
            res.params["ones"],
            res.std_errors["ones"],
            res.params["age"],
            res.std_errors["age"],
            res.params["age2"],
            res.std_errors["age2"],
            res.params["age3"],
            res.std_errors["age3"],
            res.rsquared,
            res.nobs,
        ]
        long_model_results["Lifetime Income Group"].extend([cats_pct[i], ""])
        long_model_results["Constant"].extend(
            [res.params["ones"], res.std_errors["ones"]])
        long_model_results["Age"].extend(
            [res.params["age"], res.std_errors["age"]])
        long_model_results["Age^2"].extend(
            [res.params["age2"], res.std_errors["age2"]])
        long_model_results["Age^3"].extend(
            [res.params["age3"], res.std_errors["age3"]])
        long_model_results["Observations"].extend([res.nobs, ""])

    reg_results = pd.DataFrame.from_dict(model_results)
    reg_results.to_csv(
        os.path.join(output_dir, "DeterministicProfileRegResults.csv"))
    long_reg_results = pd.DataFrame.from_dict(long_model_results)
    long_reg_results.to_csv(
        os.path.join(output_dir, "DeterministicProfileRegResults_long.csv"))

    if graphs:
        # Plot lifecycles of hourly earnings from processes estimated above
        age_vec = np.arange(20, 81, step=1)
        for i, group in enumerate(cats_pct):
            earn_profile = (model_results[group][0] +
                            model_results[group][2] * age_vec +
                            model_results[group][4] * age_vec**2 +
                            model_results[group][6] * age_vec**3)
            plt.plot(age_vec, earn_profile, label=group)
        plt.title(
            "Estimated Lifecycle Earnings Profiles by Lifetime Income Group")
        plt.legend()

        plt.savefig(os.path.join(output_dir,
                                 "lifecycle_earnings_profiles.png"))

        # Plot of lifecycles of hourly earnings from processes from data
        pd.pivot_table(
            df,
            values="ln_earn_rate",
            index="age",
            columns="li_group",
            aggfunc="mean",
        ).plot(legend=True)
        plt.title(
            "Empirical Lifecycle Earnings Profiles by Lifetime Income Group")

        plt.savefig(
            os.path.join(output_dir, "lifecycle_earnings_profiles_data.png"))

        # Plot of lifecycle profiles of hours by lifetime income group
        # create variable from fraction of time endowment work
        df["labor_supply"] = df["earnhours_hh"] / (24 * 5 *
                                                   (df["married"] + 1) * 50)
        pd.pivot_table(
            df,
            values="labor_supply",
            index="age",
            columns="li_group",
            aggfunc="mean",
        ).plot(legend=True)
        plt.title("Lifecycle Profiles of Hours by Lifetime Income Group")

        plt.savefig(os.path.join(output_dir, "lifecycle_laborsupply.png"))

    return reg_results