def POLS(data, y, xs, includeFixed=False, includeTime=False):
    # xs_str = ' + '.join(xs)
    # formula = f'{y} ~ {xs_str} + 1'
    # print(formula)
    # print(data['c10'].head())
    # if includeFixed:
    #     formula += '+ EntityEffects'
    # if includeTime:
    #     formula += '+ TimeEffects'
    # mod = PanelOLS.from_formula(formula, data=data)
    # if includeFixed:
    #     ori = mod.fit(cov_type='clustered', cluster_entity=True)
    # else:
    #     ori = mod.fit()
    # print("Formula:" + formula)
    # print(ori.params)
    # print(ori.pvalues)
    # print(ori.rsquared_overall)
    # print('\n')
    # data = data.dropna()
    print(xs)
    exog = sm.add_constant(data[xs])
    res = PanelOLS(data[y], exog,
                   entity_effects=includeFixed,
                   time_effects=includeTime).fit()
    return res
def compproc(data, v):
    out = xyvars(data, v)
    xdata = out[0]
    ydata = out[1]
    abdata = absdiff(data, v, xdata, ydata)
    X = pd.get_dummies(data['round_age_x'], drop_first=True)
    xvar = v + '_x'
    X['Min_x'] = data['Min_x']
    X['Min_y'] = data['Min_y']
    X['traded'] = data['traded']
    X = sm.add_constant(X)
    absstr = v + '_absdiff'
    mod = PanelOLS(data[absstr], X, entity_effects=True)
    res = mod.fit()
    print(res)
    params = res.params
    tradecoeff = params.loc['traded']
    conf_int = res.conf_int()
    conf_int = conf_int.loc['traded']
    lowconf = conf_int.iloc[0]
    upconf = conf_int.iloc[1]
    absstrm = v + '_absmean'
    absstrsd = v + '_abssd'
    absmean = data[absstrm].mean()
    abssd = data[absstrsd].mean()
    return [tradecoeff, lowconf, upconf, absmean, abssd]
def kfoldfun(y, X, k):
    rng = np.random.RandomState(seed=12345)
    s = 100
    seeds = np.arange(s)
    tot_error = 0
    rng.shuffle(seeds)
    rsqtot = 0
    for seed in seeds:
        cv = KFold(n_splits=k, shuffle=True, random_state=seed)
        for train_index, valid_index in cv.split(X, y):
            mod = PanelOLS(y.iloc[train_index], X.iloc[train_index],
                           entity_effects=True)
            res = mod.fit(cov_type='clustered')
            pred = mod.predict(res.params, exog=X.iloc[valid_index])
            resid = y.iloc[valid_index].to_numpy() - pred.to_numpy().transpose()
            tss = ((y.iloc[valid_index].to_numpy()
                    - y.iloc[valid_index].to_numpy().mean())**2).sum()
            rsq = 1 - (resid**2).sum() / tss
            MSPE = np.abs(resid).mean()  # mean absolute prediction error
            tot_error = tot_error + MSPE
            rsqtot = rsqtot + rsq
    print("Mean Absolute Error:")
    print(tot_error / (s * k))
    print("OOS R^2")
    print(rsqtot / (s * k))
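A minimal usage sketch (hypothetical: `df` is a panel DataFrame with an (entity, time) MultiIndex; the column names echo the dengue specifications later in this file):

# Hypothetical call; df, sm, and the specification columns are assumed.
kfoldfun(df['log_dengue'],
         sm.add_constant(df[['M_cat2', 'M_cat3', 'M_cat4']]),
         k=5)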
def Reg_Painel_Efeitos_Fixos(x, y, constante="S", cov='normal'):
    '''
    Computes a fixed-effects regression; by default it is estimated with an
    intercept and with non-robust standard errors.

    **IMPORTANT: for the panel to be tidy, the data must be multi-indexed by
    individual and by time, in that order. Otherwise, transform the DataFrame
    using the 'Arrumar Painel' function.

    x: list or array with the values of the independent variables;
    y: list or array with the values of the dependent variable;
    constante: "S" for a regression with an intercept, any other value for
    one without an intercept. If left blank, the regression is computed with
    an intercept;
    cov: "normal" for traditional standard errors (default);
    "robust" for robust standard errors;
    "kernel" for standard errors robust to heteroskedasticity and serial
    autocorrelation;
    "cluster" or "clustered" for clustered standard errors.
    '''
    global df, Resultado

    # Build the vector of independent variables
    if constante == "S":
        X = sm.add_constant(x)
    else:
        X = x

    # Create the model, taking the standard-error option into account
    Modelo = PanelOLS(y, X, entity_effects=True, drop_absorbed=True)
    if cov == "robust":
        Resultado = Modelo.fit(cov_type='robust')
    elif cov == 'kernel':
        # robust to heteroskedasticity and serial autocorrelation
        Resultado = Modelo.fit(cov_type='kernel')
    elif cov == 'clustered' or cov == 'cluster':
        Resultado = Modelo.fit(cov_type='clustered', cluster_entity=True)
    else:
        Resultado = Modelo.fit()
    print(Resultado)
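A minimal usage sketch (hypothetical DataFrame `df` multi-indexed by (individual, time), with assumed columns 'y', 'x1', 'x2'); note that the function stores its results in the global `Resultado`:

# Hypothetical example; column names are assumptions.
Reg_Painel_Efeitos_Fixos(df[['x1', 'x2']], df['y'], constante="S", cov='clustered')
print(Resultado.params)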
def panel_regression(self, X, y, entity_col, time_col,
                     entity_effects=False, time_effects=False,
                     other_effects=None, add_const=True, drop_absorbed=True):
    """
    other_effects (array-like) -- Category codes to use for any effects that
    are not entity or time effects. Each variable is treated as an effect.

    Returns the fitted results.
    """
    X = X.set_index([entity_col, time_col])
    y.index = X.index
    if add_const:
        X = sm.add_constant(X)
    if other_effects is None:
        mod = PanelOLS(y, X, entity_effects=entity_effects,
                       time_effects=time_effects,
                       drop_absorbed=drop_absorbed)  # , endog_names=['intercept'] + X.columns
    else:
        mod = PanelOLS(y, X, entity_effects=entity_effects,
                       time_effects=time_effects,
                       other_effects=X[other_effects],
                       drop_absorbed=drop_absorbed)
    res = mod.fit()
    print(res.summary)
    return res
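A hypothetical call (assuming `m` is an instance of the enclosing class, `df` has 'firm', 'year', 'industry', and 'x1' columns, and `y` is a Series aligned with `df`); note that with this implementation the `other_effects` column also remains in the exogenous set:

# Hypothetical usage; all names are assumptions.
res = m.panel_regression(df[['firm', 'year', 'industry', 'x1']], y,
                         entity_col='firm', time_col='year',
                         other_effects=['industry'])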
def fixedEffects(
    self,
    y,
    x,
    id,
    year,
    entity_Effects=False,
    time_Effects=False,
    cov_Type="clustered",
    cluster_Entity=True,
    clean_data="greedy",
):
    if type(x) != str:
        utterance = (
            "ERROR: Multiple independent regressor approach not yet implemented."
        )
        return utterance

    s = self.map_column_to_sheet(y)

    # prepare data
    v = np.copy(x)
    v = np.append(v, y)
    df = s.cleanData(v, clean_data)

    # set up panel and return fit
    df = df.set_index([id, year])
    mod = PanelOLS(
        df[y], df[x], entity_effects=entity_Effects, time_effects=time_Effects
    )
    res = mod.fit(cov_type=cov_Type, cluster_entity=cluster_Entity)

    utterance = (
        "Here are the results of a fixed effects regression of "
        + str(y) + " on " + str(x)
    )
    utterance = (
        utterance + ", using " + str(year) + " as the time dimension and "
        + str(id) + " as the id dimension.\n\n"
    )
    utterance = utterance + str(res)
    return QueryResult(res, utterance)
def run_regression(
    salesWithFlags,
    use_features=None,
    entity_effects=True,
    time_effects=True,
    cov_type="clustered",
    cluster_entity=True,
):
    """
    Run a panel regression on the input sales data.

    Parameters
    ----------
    salesWithFlags : pandas.DataFrame
        the sales data with any interaction flags already added
    use_features : list of str, optional
        if specified, only include these property characteristics in the regression
    entity_effects : bool, optional
        include neighborhood fixed effects
    time_effects : bool, optional
        include year fixed effects
    cov_type : str, optional
        the covariance type to use
    cluster_entity : bool, optional
        if using clustered errors, cluster at the neighborhood level
    """
    from linearmodels import PanelOLS

    # get the modeling inputs
    X, Y = get_modeling_inputs(salesWithFlags, dropna=False,
                               as_panel=True, use_features=use_features)

    # initialize the panel regression
    mod = PanelOLS(Y, X, entity_effects=entity_effects, time_effects=time_effects)

    # return the regression result
    return mod.fit(cov_type=cov_type, cluster_entity=cluster_entity)
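A hypothetical call (assuming the sales data carries a (neighborhood, year) panel structure and that 'num_bedrooms' and 'log_sqft' exist as characteristics; both names are illustrative):

# Hypothetical usage; feature names are assumptions.
result = run_regression(salesWithFlags, use_features=['num_bedrooms', 'log_sqft'])
print(result.summary)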
def one_step_panel_fit(data):
    """
    A panel regression without effects is exactly the same as a pooled
    regression: all coefficient estimates are identical.
    """
    fit = PanelOLS(
        data['ret'],
        data[['const', 'market_cap', 'pe', 'pe_lyr', 'pb', 'ps', 'pcf',
              'turnover']]).fit()
    logger.info("Panel Regression")
    logger.info(fit)
    resid = fit.resids
    logger.info("Residual autocorrelation")
    logger.info(
        format_for_print(
            pd.DataFrame(
                [resid.autocorr(1), resid.autocorr(5), resid.autocorr(20)])))
    return resid
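A quick sketch supporting the docstring's claim, using the public Grunfeld data rather than this project's dataset: without entity or time effects, PanelOLS and PooledOLS produce identical coefficient estimates.

# Sketch on public data; not this project's dataset.
import numpy as np
from statsmodels.datasets import grunfeld
from linearmodels import PanelOLS, PooledOLS

gdata = grunfeld.load_pandas().data.set_index(['firm', 'year'])
exog = gdata[['value', 'capital']]
panel_fit = PanelOLS(gdata.invest, exog).fit()    # no effects requested
pooled_fit = PooledOLS(gdata.invest, exog).fit()
np.testing.assert_allclose(panel_fit.params.values, pooled_fit.params.values)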
    ],
    'Single Males': [],
    'Single Females': [],
    'Married, Male Head': [],
    'Married, Female Head': []
}
for i, data in enumerate(list_of_dfs):
    # Note that including entity and time effects leads to a collinearity.
    # I think this is because there are some years at the beginning and end
    # of the sample with just one person.
    # mod = PanelOLS(data.ln_wage_rate,
    #                data[['age', 'age2', 'age3']],
    #                weights=data.fam_smpl_wgt_core,
    #                entity_effects=True, time_effects=True)
    mod = PanelOLS(data.ln_wage_rate, data[['age', 'age2', 'age3']],
                   entity_effects=True)
    res = mod.fit(cov_type='clustered', cluster_entity=True)
    print('Summary for ', list_of_statuses[i])
    print(res.summary)
    # Save model results to dictionary
    first_stage_model_results[list_of_statuses[i]] = [
        res.params['age'], res.std_errors['age'],
        res.params['age2'], res.std_errors['age2'],
        res.params['age3'], res.std_errors['age3'],
        res.rsquared, res.nobs, res.entity_info['total']
    ]
    fit_values = res.predict(fitted=True, effects=True, missing=True)
    fit_values['predictions'] = (fit_values['fitted_values'] +
                                 fit_values['estimated_effects'])
    list_of_dfs_with_fitted_vals.append(
        data.join(fit_values, how='left', on=['hh_id', 'year']))
if cluster_type in ('random', 'other-random', 'entity-nested', 'random-nested'):
    clusters = y.copy()
    if cluster_type == 'random':
        clusters.dataframe.iloc[:, :] = random_effects
    elif cluster_type == 'other-random':
        clusters.dataframe.iloc[:, :] = other_random
    elif cluster_type == 'entity-nested':
        eid = y.entity_ids
        clusters.dataframe.iloc[:, :] = eid // 3
    elif cluster_type == 'random-nested':
        clusters.dataframe.iloc[:, :] = random_effects // 2
    fo['clusters'] = clusters

mod = PanelOLS(data.y, data.x, **mo)
res = mod.fit(**fo)
res2 = mod.fit(auto_df=False, count_effects=False, **fo)
res3 = mod.fit(auto_df=False, count_effects=True, **fo)
res4 = mod.fit(cov_type='unadjusted')
res5 = mod.fit(cov_type='unadjusted', auto_df=False, count_effects=False)
res6 = mod.fit(cov_type='unadjusted', auto_df=False, count_effects=True)
vals[b] = np.column_stack([
    res.params, res.std_errors, res2.std_errors, res3.std_errors,
    res4.std_errors, res5.std_errors, res6.std_errors
])
# Categorise the second mosquito lag by its own quartiles
dfProvince['Ml2_cat1'] = dfProvince['mosquito_lag2'] < q1
dfProvince['Ml2_cat2'] = (dfProvince['mosquito_lag2'] > q1) & (dfProvince['mosquito_lag2'] < q2)
dfProvince['Ml2_cat3'] = (dfProvince['mosquito_lag2'] > q2) & (dfProvince['mosquito_lag2'] < q3)
dfProvince['Ml2_cat4'] = dfProvince['mosquito_lag2'] > q3

# take log of dengue and add its lag to the data frame
dfProvince['log_dengue'] = np.log(dfProvince['Dengue'] + 1)
dfProvince['lag_log_dengue'] = dfProvince['log_dengue'].shift(1)

#-------------------- Model ---------------------------------------------
# first specification
X_spec1 = sm.add_constant(
    dfProvince.loc[:, ['lag_log_dengue', 'M_cat2', 'M_cat3', 'M_cat4']])
mod1 = PanelOLS(dfProvince['log_dengue'], X_spec1, entity_effects=True)
res1 = mod1.fit(cov_type='clustered')
print(res1)

# second specification with kfold
X_spec2 = sm.add_constant(dfProvince.loc[:, [
    'M_cat2', 'M_cat3', 'M_cat4', 'Ml_cat2', 'Ml_cat3', 'Ml_cat4',
    'Ml2_cat2', 'Ml2_cat3', 'Ml2_cat4'
]])
mod2 = PanelOLS(dfProvince['log_dengue'], X_spec2, entity_effects=True)
res2 = mod2.fit(cov_type='clustered')
print(res2)

# third specification with kfold
X_spec3 = sm.add_constant(dfProvince.loc[:, [
fig.show()

The result of Difference-in-Differences (DID) implemented via regression is:

$$\hat{Y} = 10.8 - 0.35\,d2 + 0.095\,dT - 0.20\,(d2 \cdot dT)$$

from linearmodels import PanelOLS

Y = df['log_total_output_value']
df['const'] = 1
df['louis_1931'] = df['st_louis_fed'] * df['year_1931']

## Difference-in-Differences (DID) specification
dd = ['const', 'st_louis_fed', 'year_1931', 'louis_1931']
dif_in_dif = PanelOLS(Y, df[dd]).fit(cov_type='clustered', cluster_entity=True)
print(dif_in_dif)

The St. Louis Fed policy decreased firm revenue by 18% ($1 - e^{-0.1994}$). However, the p-value is 0.1074, so the result is not statistically significant even at the 10% level.

from math import exp
1 - exp(dif_in_dif.params.louis_1931)

Somebody might argue that differences among firms are a confounding factor: one or another big firm might bias the results. This issue can be addressed with Fixed Effects (FE), also known as the Within Estimator. The technique is similar to First-Differences (FD), but uses a different data transformation: the time-demeaning process eliminates the unobserved factor $\alpha_i$.

$$Y_{it} = \beta X_{it} + \alpha_i + \epsilon_{it}$$

Let's average the variables for each $i$ over time $t$:
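$$\bar{Y}_i = \beta \bar{X}_i + \alpha_i + \bar{\epsilon}_i$$

Subtracting this averaged equation from the original one removes the time-invariant $\alpha_i$; this is the within (time-demeaning) transformation:

$$Y_{it} - \bar{Y}_i = \beta (X_{it} - \bar{X}_i) + (\epsilon_{it} - \bar{\epsilon}_i)$$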
def Panel_output(endo, exog):
    X = sm.add_constant(df.loc[:, exog])
    # use the dependent variable passed in rather than a hardcoded column
    mod = PanelOLS(df[endo], X, entity_effects=True)
    res = mod.fit(cov_type='clustered')
    print(res)
    return (res.loglik, exog, res)
    col='country', hue='country', col_wrap=4, palette="deep")
SRPC_other = SRPC_other.map(plt.plot, 'unemployment',
                            'inflation').set_titles("{col_name}")

######## e. Panel data regression analysis #######

# Panel data regression for the full sample
merge_eu = merge_eu.reset_index()
year_full = pd.Categorical(merge_eu.year)
merge_eu = merge_eu.set_index(['country', 'year'])
merge_eu['year'] = year_full

regression1 = PanelOLS(merge_eu.inflation, merge_eu.unemployment,
                       entity_effects=True)
res1 = regression1.fit(cov_type='clustered', cluster_entity=True)
print(res1)

# Panel data regression for data after QE
after_QE = after_QE.reset_index()
year_QE = pd.Categorical(after_QE.year)
after_QE = after_QE.set_index(['country', 'year'])
after_QE['year'] = year_QE

regression2 = PanelOLS(after_QE.inflation, after_QE.unemployment,
                       entity_effects=True)
res2 = regression2.fit(cov_type='clustered', cluster_entity=True)
print(res2)
def estimate_profiles(graphs=False):
    '''
    Function to estimate deterministic lifecycle profiles of hourly
    earnings. Follows the methodology of Fullerton and Rogers (1993).

    Args:
        graphs (bool): whether to create graphs of profiles

    Returns:
        reg_results (Pandas DataFrame): regression model coefficients
            for lifetime earnings profiles

    '''
    # Read in dataframe of PSID data
    df = ogusa.utils.safe_read_pickle(
        os.path.join(cur_path, 'data', 'PSID', 'psid_lifetime_income.pkl'))

    model_results = {
        'Names': [
            'Constant', '', 'Head Age', '', 'Head Age^2', '', 'Head Age^3',
            '', 'R-Squared', 'Observations'
        ]
    }
    cats_pct = ['0-25', '26-50', '51-70', '71-80', '81-90', '91-99', '100']
    long_model_results = {
        'Lifetime Income Group': [], 'Constant': [], 'Age': [],
        'Age^2': [], 'Age^3': [], 'Observations': []
    }
    for i, group in enumerate(cats_pct):
        data = df[df[group] == 1].copy()
        data['ones'] = np.ones(len(data.index))
        mod = PanelOLS(data.ln_earn_rate,
                       data[['ones', 'age', 'age2', 'age3']])
        res = mod.fit(cov_type='clustered', cluster_entity=True)
        # print('Summary for lifetime income group ', group)
        # print(res.summary)
        # Save model results to dictionary
        model_results[group] = [
            res.params['ones'], res.std_errors['ones'],
            res.params['age'], res.std_errors['age'],
            res.params['age2'], res.std_errors['age2'],
            res.params['age3'], res.std_errors['age3'],
            res.rsquared, res.nobs
        ]
        long_model_results['Lifetime Income Group'].extend([cats_pct[i], ''])
        long_model_results['Constant'].extend(
            [res.params['ones'], res.std_errors['ones']])
        long_model_results['Age'].extend(
            [res.params['age'], res.std_errors['age']])
        long_model_results['Age^2'].extend(
            [res.params['age2'], res.std_errors['age2']])
        long_model_results['Age^3'].extend(
            [res.params['age3'], res.std_errors['age3']])
        long_model_results['Observations'].extend([res.nobs, ''])

    reg_results = pd.DataFrame.from_dict(model_results)
    reg_results.to_csv(
        os.path.join(output_dir, 'DeterministicProfileRegResults.csv'))
    long_reg_results = pd.DataFrame.from_dict(long_model_results)
    long_reg_results.to_csv(
        os.path.join(output_dir, 'DeterministicProfileRegResults_long.csv'))

    if graphs:
        # Plot lifecycles of hourly earnings from processes estimated above
        age_vec = np.arange(20, 81, step=1)
        for i, group in enumerate(cats_pct):
            earn_profile = (model_results[group][0] +
                            model_results[group][2] * age_vec +
                            model_results[group][4] * age_vec ** 2 +
                            model_results[group][6] * age_vec ** 3)
            plt.plot(age_vec, earn_profile, label=group)
        plt.title(
            'Estimated Lifecycle Earnings Profiles by Lifetime Income Group')
        plt.legend()
        plt.savefig(os.path.join(output_dir,
                                 'lifecycle_earnings_profiles.png'))

        # Plot of lifecycles of hourly earnings from the data
        pd.pivot_table(df, values='ln_earn_rate', index='age',
                       columns='li_group', aggfunc='mean').plot(legend=True)
        plt.title(
            'Empirical Lifecycle Earnings Profiles by Lifetime Income Group')
        plt.savefig(
            os.path.join(output_dir, 'lifecycle_earnings_profiles_data.png'))

        # Plot of lifecycle profiles of hours by lifetime income group;
        # create variable for the fraction of the time endowment worked
        df['labor_supply'] = (df['earnhours_hh'] /
                              (24 * 5 * (df['married'] + 1) * 50))
        pd.pivot_table(df, values='labor_supply', index='age',
                       columns='li_group', aggfunc='mean').plot(legend=True)
        plt.title('Lifecycle Profiles of Hours by Lifetime Income Group')
        plt.savefig(os.path.join(output_dir, 'lifecycle_laborsupply.png'))

    return reg_results
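A minimal usage sketch; the function reads the PSID pickle from disk and writes the regression CSVs (and, optionally, plots) to `output_dir`:

# Assumes the PSID pickle and output_dir exist as set up above.
reg_results = estimate_profiles(graphs=True)
print(reg_results)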
], "Single Males": [], "Single Females": [], "Married, Male Head": [], "Married, Female Head": [], } for i, data in enumerate(list_of_dfs): # Note that including entity and time effects leads to a collinearity # I think this is because there are some years at begin and end of # sample with just one person # mod = PanelOLS(data.ln_wage_rate, # data[['age', 'age2', 'age3']], # weights=data.fam_smpl_wgt_core, # entity_effects=True, time_effects=True) mod = PanelOLS( data.ln_wage_rate, data[["age", "age2", "age3"]], entity_effects=True ) res = mod.fit(cov_type="clustered", cluster_entity=True) print("Summary for ", list_of_statuses[i]) print(res.summary) # Save model results to dictionary first_stage_model_results[list_of_statuses[i]] = [ res.params["age"], res.std_errors["age"], res.params["age2"], res.std_errors["age2"], res.params["age3"], res.std_errors["age3"], res.rsquared, res.nobs, res.entity_info["total"],
    ]
    interaction.append(name)

for x in lang:
    for y in ['lang_share', 'hour_06', 'hour_12', 'hour_18']:
        name = x + 'X' + y
        twitch_panel[name] = [
            xx * yy for xx, yy in zip(twitch_panel[x], twitch_panel[y])
        ]
        interaction.append(name)

for x in game:
    for y in ['game_hhi', 'adj_game_others', 'adj_game_others2',
              'hour_06', 'hour_12', 'hour_18'] + lang:
        name = x + 'X' + y
        twitch_panel[name] = [
            xx * yy for xx, yy in zip(twitch_panel[x], twitch_panel[y])
        ]
        interaction.append(name)

# Run regression.
dependent2 = dependent1 + interaction
reg = PanelOLS(twitch_panel['view_variation'], twitch_panel[dependent2])
res = reg.fit()
df = pd.DataFrame(res.params)
df.to_csv('twitch_small_panel_results.txt')
## X
x_list = ['ls_num', 'lti', 'ln_loanamout', 'ln_appincome', 'subprime',
          'secured', 'cb', 'ln_ta', 'ln_emp', 'num_branch', 'ln_pop',
          'density', 'hhi', 'ln_mfi', 'mean_distance']
x = df[x_list]

'''
x_msat_list = x_list + ['dum_msat_{}'.format(i) for i in range(dum_msat.shape[1])]
x_msat = sm.add_constant(df[x_msat_list])
'''

#------------------------------------------------------------
# Run regression
#------------------------------------------------------------

# Run without dummies
res_nd = PanelOLS(y, x).fit(cov_type='clustered', cluster_entity=True)

## Save output to txt
text_file = open("Results/Results_baseline_nodummy.txt", "w")
text_file.write(res_nd.summary.as_text())
text_file.close()

# Run with entity and time effects
res_t = PanelOLS(y, x, entity_effects=True,
                 time_effects=True).fit(cov_type='clustered',
                                        cluster_entity=True)

## Save output to txt
text_file = open("Results/Results_baseline_t.txt", "w")
text_file.write(res_t.summary.as_text())
text_file.close()
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm
from studies.age_structure.commons import *
from linearmodels import PanelOLS

# dcons = average daily consumption per household
# rc = percent change in daily consumption per household relative to 2019m6
df = pd.read_stata("data/datareg.dta")

index = ["districtnum", "month_code"]
rc = df[index + ["rc"]].set_index(index)

exog_cols = ["I_cat", "D_cat", "I_cat_national", "D_cat_national"]
for col in exog_cols:
    df[col] = pd.Categorical(df[col])
exog = df[index + exog_cols].set_index(index)

res = PanelOLS(rc, exog, entity_effects=True).fit()
print(res)
def __init__(self, GIV, exog_l=None):
    self.__dict__.update(GIV.__dict__)  # Import all attributes from GIV

    # Add a new variable for the regressions
    self.df['Intercept'] = 1

    # Name of the instrumented variable
    self.endog_instrument = f'{self.endog}_instrument'

    # Deal with exogenous variables, if any, and fit the model
    if exog_l:  # With exogenous variables
        self.exog_l = ['Intercept'] + exog_l
        self.mod = PanelOLS(self.df[self.endog], self.df[self.exog_l],
                            entity_effects=True)
    else:  # Without exogenous variables
        self.exog_l = ['Intercept']
        self.mod = PanelOLS(self.df[self.endog], self.df[self.exog_l],
                            entity_effects=True, time_effects=True)

    self.panel_res = self.mod.fit(cov_type='clustered', cluster_entity=True)
    print(self.panel_res.summary)

    # Return the residuals, by entity
    self.df[f'{self.endog}_residuals'] = self.panel_res.resids

    # Prepare the data in wide format
    self.dresids = self.df.pivot_table(index=[self.date_col],
                                       columns=self.ind_col,
                                       values=f'{self.endog}_residuals')
    dresidsc = self.dresids.dropna(axis='columns')  # Balanced panel

    # Fit a PCA with a given number of components
    # TODO: choose the number of PCA factors from the variance explained
    resids_pca = PCA(n_components=self.pca_num_factors)
    resids_pca_factors = resids_pca.fit_transform(dresidsc)
    resids_pca_loadings = resids_pca.components_  # Varies by individual

    cum_var_exp = np.cumsum(resids_pca.explained_variance_ratio_)
    self.cum_var = resids_pca.explained_variance_ratio_.cumsum()
    print(f'Cumulative explained variance with {self.pca_num_factors} '
          f'factors for {self.endog}: {round(100*self.cum_var[-1], 2)} %')

    resids_pca_reduc = resids_pca.inverse_transform(resids_pca_factors)

    # Verification of the PCA inverse transform, with mean-zero residuals
    resids_pca_reduc2 = (resids_pca_factors.dot(resids_pca_loadings)
                         + resids_pca.mean_)
    np.testing.assert_array_almost_equal(resids_pca_reduc, resids_pca_reduc2)

    # Compute the "pure" idiosyncratic shocks:
    # resids_pca_reduc is the common shock, each individual with different loadings
    d_common = pd.DataFrame(resids_pca_reduc, columns=dresidsc.columns,
                            index=dresidsc.index)
    self.resids_common = d_common
    self.resids_idiosyncratic = dresidsc - d_common  # Simple difference

    #### Aggregate the idiosyncratic shocks
    # Relative weights (time varying, but take the historical largest)
    dwgt = self.df.groupby(self.ind_col)[self.wgt_col].mean()
    dwgm = dwgt.sort_values(ascending=False)  # Sort from largest
    self.avg_weights = dwgm  # Save it for plotting

    # Only keep the weights above a certain threshold
    large_l = list(dwgm[dwgm >= self.threshold].index)

    # In case the weights of some entities are not available
    avl_large_l = [x for x in large_l
                   if x in self.resids_idiosyncratic.columns]

    # Print an informational message if some entities are missing
    if len(avl_large_l) < len(large_l):
        missing_l = [x for x in large_l if x not in avl_large_l]
        print(f'Entities not available {missing_l}')

    # Extract the largest idiosyncratic shocks, weighted average
    self.large_resids = self.resids_idiosyncratic[avl_large_l]
    instrument = self.large_resids.dot(dwgm[avl_large_l])  # Weighted average
    self.instrument = pd.DataFrame(instrument, index=instrument.index,
                                   columns=[f'{self.endog}_instrument'])

    # Class attributes (attributes based on a class below)
    self.plot = GIVPlot(self)
data2 = pd.read_csv("Problem3.csv") rows=0 for pid in range(1,48): for y in range(startyear,endyear+1): local_data3=data1[data1['pref_id']==pid] data2['mr'].iloc[rows]=local_data3['mr'+str(y)].iloc[0] rows=rows+1 data2.to_csv("Problem3.csv") ''' data = pd.read_csv("Problem3&4.csv") year = pd.Categorical(data.year) data = data.set_index(['pref_id','year']) data['year']=year #Individual&Time FE exog_vars = ['temperature','temperature2','temperature3','prec','year'] exog_vars2 = ['temperature'] exog = sm.add_constant(data[exog_vars]) exog2 =sm.add_constant(data[exog_vars2]) mod2=sm.OLS(data.mr,exog2) res2=mod2.fit() mod = PanelOLS(data.mr,exog,entity_effects=True) res = mod.fit(cov_type='clustered',cluster_entity=True) #The result of Model 1 print(res2.summary()) #The result of Model 5 print(res)
#%%
import numpy as np
from statsmodels.datasets import grunfeld

data = grunfeld.load_pandas().data
data.year = data.year.astype(np.int64)
# MultiIndex: entity - time
data = data.set_index(['firm', 'year'])

from linearmodels import PanelOLS

mod = PanelOLS(data.invest, data[['value', 'capital']], entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_entity=True)

#%%
from linearmodels import PanelOLS

mod = PanelOLS.from_formula('invest ~ value + capital + EntityEffects', data)
res = mod.fit(cov_type='clustered', cluster_entity=True)

#%%
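The array and formula interfaces above specify the same model; a small sketch confirming that their coefficient estimates agree:

#%%
# Sketch: both interfaces estimate the same fixed-effects model.
import numpy.testing as npt

res_arrays = PanelOLS(data.invest, data[['value', 'capital']],
                      entity_effects=True).fit()
res_formula = PanelOLS.from_formula('invest ~ value + capital + EntityEffects',
                                    data).fit()
npt.assert_allclose(res_arrays.params.values, res_formula.params.values)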
mod = RandomEffects(
    data.y_light,
    data[['intercept', 'x1_light', 'x2_light', 'x3_light', 'x4_light',
          'x5_light']])
res = mod.fit()

import numpy as np
from statsmodels.datasets import grunfeld

data = grunfeld.load_pandas().data
data.year = data.year.astype(np.int64)

from linearmodels import PanelOLS

etdata = data.set_index(['firm', 'year'])
mod2 = PanelOLS(etdata.invest, etdata[['value', 'capital']],
                entity_effects=True)
res2 = mod2.fit(debiased=True)

import numpy as np

# p averages within each group of 5; z holds the group dummies.
# Since z.T @ z = 5 * I, x.T @ p @ x equals ((x.T @ z) @ (x.T @ z).T) / 5.
x = np.random.randn(10, 2)
p = np.zeros((10, 10))
p[:5, :5] = 1 / 5
p[5:, 5:] = 1 / 5
z = np.zeros((10, 2))
z[:5, 0] = 1
z[5:, 1] = 1
a = x.T @ p @ x
b = (x.T @ z) @ (x.T @ z).T
def regression_fun(start_it):
    # print(start_it, "\n")
    name_v1 = iteration_list[start_it][1] + "PREq" + iteration_list[start_it][0]
    name_v2 = iteration_list[start_it][2] + "PREq" + iteration_list[start_it][0]
    name_v3 = iteration_list[start_it][3] + "PREq" + iteration_list[start_it][0]
    name_v4 = iteration_list[start_it][4] + "PREq" + iteration_list[start_it][0]
    cname_v1 = "C" + name_v1
    cname_v2 = "C" + name_v2
    cname_v3 = "C" + name_v3
    cname_v4 = "C" + name_v4

    # Keep control rows that match on each pre-period variable, plus all treated rows
    df = main_data[(main_data[name_v1] == main_data[cname_v1]) | (main_data["treated"] == 1)]
    df = df[(df[name_v2] == df[cname_v2]) | (df["treated"] == 1)]
    df = df[(df[name_v3] == df[cname_v3]) | (df["treated"] == 1)]
    df = df[(df[name_v4] == df[cname_v4]) | (df["treated"] == 1)]

    groupnum = pd.DataFrame(df.groupby(['matchid']).size(), columns=["nummatchid"])
    df = pd.merge(df, groupnum, how="inner", on='matchid')
    df = df[df["nummatchid"] != 15]
    df = df.sort_values(by=["matchid", "treated", "Ceucli0300_3", "year"],
                        ascending=[True, False, True, True])
    df["NB"] = df["treated"].groupby(df["matchid"]).rank()
    df = df[df["NB"] <= 90]

    run_id = (iteration_list[start_it][0] + "_" + iteration_list[start_it][1] + "_"
              + iteration_list[start_it][2] + "_" + iteration_list[start_it][3] + "_"
              + iteration_list[start_it][4])
    file_path_noweight = "results3/Q_" + run_id + "_noweight.pkl"
    file_path_emp2000weight = "results3/Q_" + run_id + "_emp2000weight.pkl"
    file_path_pop2000weight = "results3/Q_" + run_id + "_pop2000weight.pkl"

    # matchyr = pd.Categorical(df.matchyr)
    df = df.set_index(["matchyr", "amc"])
    # df['matchyr'] = matchyr
    exog_vars = ['shock']
    exog = sm.add_constant(df[exog_vars])

    # Each pickled frame stores:
    # 1: name, 2: coefficient, 3: standard error, 4: observations,
    # 5: number treated, 6: weighting scheme
    spec_name = (iteration_list[start_it][1] + "-" + iteration_list[start_it][2]
                 + "-" + iteration_list[start_it][3] + "-"
                 + iteration_list[start_it][4])

    try:
        mod_no_w = PanelOLS(df.lemp, exog, entity_effects=True)
        res = mod_no_w.fit()
        mod_no_w_coeff = pd.DataFrame([spec_name, res.params.iloc[1],
                                       res.std_errors.iloc[1], res.nobs,
                                       df['Ntreated'].sum(), "no-weight"])
        mod_no_w_coeff.to_pickle(file_path_noweight)
    except Exception as e:
        # print(e)
        pass

    try:
        mod_emp2000 = PanelOLS(df.lemp, exog, weights=abs(df.emp2000) + 1e-6,
                               entity_effects=True)
        res = mod_emp2000.fit()
        mod_emp2000_coeff = pd.DataFrame([spec_name, res.params.iloc[1],
                                          res.std_errors.iloc[1], res.nobs,
                                          df['Ntreated'].sum(), "emp-2000"])
        mod_emp2000_coeff.to_pickle(file_path_emp2000weight)
    except Exception as e:
        # print(e)
        pass

    try:
        # weight by 2000 population
        mod_pop2000 = PanelOLS(df.lemp, exog, weights=abs(df.pop2000) + 1e-6,
                               entity_effects=True)
        res = mod_pop2000.fit()
        mod_pop2000_coeff = pd.DataFrame([spec_name, res.params.iloc[1],
                                          res.std_errors.iloc[1], res.nobs,
                                          df['Ntreated'].sum(), "pop-2000"])
        mod_pop2000_coeff.to_pickle(file_path_pop2000weight)
    except Exception as e:
        # print(e)
        pass
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from linearmodels import PanelOLS
from linearmodels import RandomEffects

if __name__ == "__main__":
    REG_DATA = sys.argv[1]
    RES3_PATH = sys.argv[2]

    metadata = pd.read_csv(REG_DATA)
    metadata = metadata.sort_values(by=['Code', 'Year'])
    metadata = metadata.set_index(['Code', 'Year'])
    metadata['Income_t0_log'] = np.log10(metadata['Income_t0'])

    base = os.path.basename(RES3_PATH)
    incomegroup = base.split(".")[0].split("_")[-1]
    metadata = metadata[metadata.IncomeGroup == incomegroup]
    metadata = metadata.dropna()

    num_period = len(metadata['period'].unique())
    metadata = metadata[metadata['size'] == num_period]

    exog_vars = ['ECI', 'Income_t0_log', 'diversity']
    exog = sm.add_constant(metadata[exog_vars])
    mod = PanelOLS(metadata.growth, exog, entity_effects=True)

    with open(RES3_PATH, 'w') as f:
        f.write(mod.fit().summary.as_text())
from matplotlib.backends.backend_pdf import PdfPages
from linearmodels import PanelOLS

# import data
data = pd.read_csv("fraserDataWithRGDPPC.csv", index_col=[0, 1],
                   parse_dates=True)

# create a list of each index set from the MultiIndex
years = list(sorted(set(data.index.get_level_values('Year'))))
country = list(sorted(set(data.index.get_level_values('ISO_Code'))))

# choose variables that will be plotted for each year in a scatter plot
plot_vars = [
    "Sound Money", "Government Consumption", "RGDP Per Capita", "Quartile"
]

# Normalize income so that 1000 represents the maximum value of RGDP Per
# Capita; this allows the dot size to be easily adjusted
data["RGDP Per Capita"] = (data["RGDP Per Capita"] /
                           max(data["RGDP Per Capita"]) * 1000)

# Panel OLS
reg_data = data[[
    "RGDP Per Capita", "Sound Money", "Government Consumption",
    "SUMMARY INDEX"
]].dropna()
x = reg_data[["Sound Money", "Government Consumption", "SUMMARY INDEX"]]
y = reg_data[["RGDP Per Capita"]]
mod = PanelOLS(y, x, entity_effects=True, time_effects=False)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res.summary)
merge_tot = merge_tot.set_index(["省份", "日期"])
merge_norm = (merge_tot - merge_tot.mean()) / (merge_tot.max() - merge_tot.min())
print(merge_norm)

from linearmodels import PanelOLS

y = merge_norm[["新增确诊"]]
x = merge_norm[[
    "_1traffic",
    "_2traffic",
    "_3traffic",
    "traffic",
    "traffic3_",
    "traffic2_",
    "traffic1_",
]]  # change here
reg = PanelOLS(y, x, entity_effects=True, time_effects=True)
res = reg.fit(cov_type='clustered', cluster_entity=True)
print(res)

parameters = [0.0433, 0.0231, 0.0075, 0.0176, 0.0053, 0.0034, 0.0086]
xline = [-3, -2, -1, 0, 1, 2, 3]
lower = [-0.0151, 0.0040, 0.0075, 0.0007, -0.0108, -0.0162, -0.017]
upper = [0.1017, 0.0422, 0.0075, 0.0346, 0.0214, 0.0229, 0.0342]
# normalize the estimates by the coefficient at event time -1 (0.0075)
for i in range(len(parameters)):
    parameters[i] /= 0.0075
    lower[i] /= 0.0075
    upper[i] /= 0.0075

import matplotlib.pyplot as plt

plt.plot(xline, parameters, marker="*", color="black")
for i in range(len(xline)):
    plt.vlines(x=xline[i], ymin=lower[i], ymax=upper[i], label="*")
plt.vlines(x=0, ymin=-2, ymax=6.0, linestyles="dashed")
def estimate_profiles(graphs=False): """ Function to estimate deterministic lifecycle profiles of hourly earnings. Follows methodology of Fullerton and Rogers (1993). Args: graphs (bool): whether to create graphs of profiles Returns: reg_results (Pandas DataFrame): regression model coefficients for lifetime earnings profiles """ # Read in dataframe of PSID data df = ogusa.utils.safe_read_pickle( os.path.join(cur_path, "data", "PSID", "psid_lifetime_income.pkl")) model_results = { "Names": [ "Constant", "", "Head Age", "", "Head Age^2", "", "Head Age^3", "", "R-Squared", "Observations", ] } cats_pct = ["0-25", "26-50", "51-70", "71-80", "81-90", "91-99", "100"] long_model_results = { "Lifetime Income Group": [], "Constant": [], "Age": [], "Age^2": [], "Age^3": [], "Observations": [], } for i, group in enumerate(cats_pct): data = df[df[group] == 1].copy() data["ones"] = np.ones(len(data.index)) mod = PanelOLS(data.ln_earn_rate, data[["ones", "age", "age2", "age3"]]) res = mod.fit(cov_type="clustered", cluster_entity=True) # print('Summary for lifetime income group ', group) # print(res.summary) # Save model results to dictionary model_results[group] = [ res.params["ones"], res.std_errors["ones"], res.params["age"], res.std_errors["age"], res.params["age2"], res.std_errors["age2"], res.params["age3"], res.std_errors["age3"], res.rsquared, res.nobs, ] long_model_results["Lifetime Income Group"].extend([cats_pct[i], ""]) long_model_results["Constant"].extend( [res.params["ones"], res.std_errors["ones"]]) long_model_results["Age"].extend( [res.params["age"], res.std_errors["age"]]) long_model_results["Age^2"].extend( [res.params["age2"], res.std_errors["age2"]]) long_model_results["Age^3"].extend( [res.params["age3"], res.std_errors["age3"]]) long_model_results["Observations"].extend([res.nobs, ""]) reg_results = pd.DataFrame.from_dict(model_results) reg_results.to_csv( os.path.join(output_dir, "DeterministicProfileRegResults.csv")) long_reg_results = pd.DataFrame.from_dict(model_results) long_reg_results.to_csv( os.path.join(output_dir, "DeterministicProfileRegResults_long.csv")) if graphs: # Plot lifecycles of hourly earnings from processes estimated above age_vec = np.arange(20, 81, step=1) for i, group in enumerate(cats_pct): earn_profile = (model_results[group][0] + model_results[group][2] * age_vec + model_results[group][4] * age_vec**2 + model_results[group][6] * age_vec**3) plt.plot(age_vec, earn_profile, label=group) plt.title( "Estimated Lifecycle Earnings Profiles by Lifetime Income Group") plt.legend() plt.savefig(os.path.join(output_dir, "lifecycle_earnings_profiles.png")) # Plot of lifecycles of hourly earnings from processes from data pd.pivot_table( df, values="ln_earn_rate", index="age", columns="li_group", aggfunc="mean", ).plot(legend=True) plt.title( "Empirical Lifecycle Earnings Profiles by Lifetime Income Group") plt.savefig( os.path.join(output_dir, "lifecycle_earnings_profiles_data.png")) # Plot of lifecycle profiles of hours by lifetime income group # create variable from fraction of time endowment work df["labor_supply"] = df["earnhours_hh"] / (24 * 5 * (df["married"] + 1) * 50) pd.pivot_table( df, values="labor_supply", index="age", columns="li_group", aggfunc="mean", ).plot(legend=True) plt.title("Lifecycle Profiles of Hours by Lifetime Income Group") plt.savefig(os.path.join(output_dir, "lifecycle_laborsupply.png")) return reg_results
def fit(self, X, y, entity_effects=True, weekday_effects=True,
        cov_type='clustered'):
    """
    Parameters
    ----------
    X : Pandas DataFrame
        Panel DataFrame of entities observed at multiple points in time.
    y : str
        Column to be used as the regression target.
    entity_effects : bool, default True
        If True, include entity fixed effects in the model. If False, the
        estimation procedure is equivalent to pooled OLS.
    weekday_effects : bool, default True
        If True, include a dummy for each day of the week. Due to the large
        variance in activity features between weekdays, for certain
        situations this is highly recommended.
    cov_type : str, default 'clustered'
        Covariance matrix structure. Must be one of 'clustered', 'robust'.
        Note that if entity_effects is set to True, robust standard errors
        are no longer robust.

    Returns
    -------
    self.regression_results_ : linearmodels.panel.results.PanelEffectsResults
        Summary of estimation results.
    """
    self._depvar_label = ' '.join([w.capitalize() for w in y.split('_')])
    idx_cols = [self.entity_col, self.time_col]
    relative_idx = ((X[self.time_col] - X[self.event_col]) /
                    dt.timedelta(days=1)).astype(int)
    dummies = onehot_integer_series(relative_idx)

    # Add in dummy variables for observation distance to the event
    X = pd.concat([X[[self.entity_col, self.time_col, y]], dummies], axis=1)

    # Set our estimation target
    indvars = list(dummies.columns)
    if weekday_effects:
        X['day_of_week'] = X[self.time_col].dt.strftime('%A')
        indvars = indvars + ['day_of_week']

    X.set_index(idx_cols, inplace=True)
    X.sort_index(inplace=True)
    depvar = X[y]

    model = PanelOLS(dependent=depvar, exog=X[indvars],
                     entity_effects=entity_effects)
    self.regression_results_ = model.fit(cov_type=cov_type)

    # Extract point estimates
    coefs = self.regression_results_.params.reset_index()
    coefs = coefs[coefs['index'].str.contains('relative_idx')]
    coefs['index'] = coefs['index'].apply(self.parse_dummies)
    coefs.sort_values('index', inplace=True)
    self._idx_coefs = coefs.rename(
        columns={'index': 'relative_idx'}).set_index('relative_idx')

    # Extract the integer index; we can just use the coef index since the
    # confidence intervals share the same indexing
    self._event_relative_idx = coefs['index'].values

    # Extract confidence intervals
    cis = self.regression_results_.conf_int().reset_index()
    cis = cis[cis['index'].str.contains('relative_idx')]
    cis['index'] = cis['index'].apply(self.parse_dummies)
    cis.sort_values('index', inplace=True)
    self._idx_cis = cis.rename(
        columns={'index': 'relative_idx'}).set_index('relative_idx')

    return self.regression_results_
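A hypothetical usage sketch (the enclosing class and its constructor arguments are assumptions; only the `fit` signature above is from the source):

# Hypothetical: assumes the enclosing event-study class takes these columns.
es = EventStudy(entity_col='user_id', time_col='date', event_col='event_date')
results = es.fit(panel_df, y='daily_steps', entity_effects=True,
                 weekday_effects=True)
print(results.summary)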
#------------------------------------------------------------
# Set Variables
#------------------------------------------------------------

# Set vars
## Y
y = df['log_min_distance']

## X
x_list = ['ls_num', 'lti', 'ln_loanamout', 'ln_appincome', 'subprime',
          'secured', 'cb', 'ln_ta', 'ln_emp', 'num_branch', 'ln_pop',
          'density', 'hhi', 'ln_mfi', 'mean_distance', 'dum_msat']
x = sm.add_constant(df[x_list])

''' OLD
x_msat_list = x_list + ['dum_msat_{}'.format(i) for i in range(dum_msat.shape[1])]
x_msat = sm.add_constant(df[x_msat_list])
'''

#------------------------------------------------------------
# Run regression
#------------------------------------------------------------

# Run with entity effects
res = PanelOLS(y, x, entity_effects=True).fit(cov_type='clustered',
                                              cluster_entity=True)

## Save output to txt
text_file = open("Results_baseline_msat.txt", "w")
text_file.write(res.summary.as_text())
text_file.close()