Example #1
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_white


def white_test(data: pd.DataFrame, window: int = 21):
    # use the window argument instead of a hard-coded 21
    data['std1'] = data['price'].rolling(window).std()
    data.dropna(inplace=True)
    X = sm.add_constant(data['price'])
    results = sm.OLS(data['std1'], X).fit()
    resid = results.resid
    exog = results.model.exog
    # run the test once and reuse the p-value
    p_value = het_white(resid, exog)[1]
    print("White-Test p-Value: {0}".format(p_value))
    if p_value > 0.05:
        print("White test outcome at 5% significance: homoscedastic")
    else:
        print("White test outcome at 5% significance: heteroscedastic")
Example #2
import pandas as pd


def test_heteroscedacity(y, y_pred, pred_value_only=1):
    """
    Heteroscedasticity test: Residual**2 = Linear(X, Pred, Pred**2).
    F p-value < 0.01 : null is rejected ---> not homoscedastic.
    Uses het_breuschpagan and het_white.
    """
    from statsmodels.stats.diagnostic import het_breuschpagan, het_white
    error = y_pred - y

    ypred_df = pd.DataFrame({
        "pcst": [1.0] * len(y),  # constant term
        "pred": y_pred,
        "pred2": y_pred * y_pred
    })
    labels = [
        "LM Statistic", "LM-Test p-value", "F-Statistic", "F-Test p-value"
    ]
    # Both tests square the residuals internally, so pass the raw errors.
    # Note het_white also adds squares/cross-products of the regressors
    # itself, which makes the explicit pred2 column redundant for that test.
    test1 = het_breuschpagan(error, ypred_df.values)
    test2 = het_white(error, ypred_df.values)
    ddict = {
        "het-breuschpagan": dict(zip(labels, test1)),
        "het-white": dict(zip(labels, test2)),
    }

    return ddict
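A minimal smoke test with synthetic predictions (the series here are illustrative, not from the original):

import numpy as np

y_true = np.random.normal(size=200)
y_hat = y_true + np.random.normal(scale=0.5, size=200)
print(test_heteroscedacity(y_true, y_hat))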
Example #3
    def test_het_white(self):
        res = self.res

        # TODO: regressiontest, compare with Greene or Gretl or Stata
        hw = smsdia.het_white(res.resid, res.model.exog)
        hw_values = (33.503722896538441, 2.9887960597830259e-06,
                     7.7945101228430946, 1.0354575277704231e-06)
        assert_almost_equal(hw, hw_values)
Example #4
def f_heter_w(datos):
    """
    Prueba White de heterocedasticidad

    Parameters
    ----------
    datos : pd.DataFrame : con información contenida en archivo leido

    Returns
    -------
    dict : Valores de heterocedasticidad y p-value

    """
    datos = datos.set_index('datetime')
    datos_dif = datos - datos.shift()
    datos_dif.dropna(inplace=True)
    datos_dif = datos_dif.reset_index()
    serie = datos_dif['actual']
    indxx = datos_dif.index
    het_model_w = sm.OLS(serie, sm.add_constant(indxx)).fit()
    resids = het_model_w.resid
    white_het = smd.het_white(resids, het_model_w.model.exog)
    lm_stat_w = white_het[0]
    pvalue_lm_w = white_het[1]
    f_stat_w = white_het[2]
    pvalue_f_w = white_het[3]
    heteroscedastico_w = 'Yes' if pvalue_f_w < et.alpha else 'No'
    return {
        'Lagrange Multiplier Value': lm_stat_w,
        'LM P-value': pvalue_lm_w,
        'F-Statistic Value': f_stat_w,
        'F-Statistic P-Value': pvalue_f_w,
        'Heteroscedastic?': heteroscedastico_w
    }
Example #6
File: main.py Project: NREL/EVOLVE
    def check_heteroskedasticity(self):

        # het_white needs a full design matrix including a constant, not a
        # bare list of Series (assumes add_constant is imported with het_white)
        exog = add_constant(self.dataformodel['Temperature'])
        white_test = het_white(self.result.resid, exog)

        #bp_test = het_breuschpagan(self.result.resid, self.result.model.exog)

        labels = ['LM Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value']
        #self.logger.info(dict(zip(labels, bp_test)))
        self.logger.info(dict(zip(labels, white_test)))
Example #7
    def _test_gp_residuals(self, conf_mat):
        #Test normal
        k2, p_norm = stats.normaltest(self.data['GP_residuals'])
        if p_norm < 0.05:
            warnings.warn("The residuals are not Gaussian!")

        # Test heteroskedasticity: het_white squares the residuals
        # internally, so pass them unsquared
        exog = add_constant(conf_mat)
        _, p_het, _, _ = het_white(self.data['GP_residuals'], exog)
        if p_het < 0.05:
            warnings.warn("The residuals are heteroskedastic!")
Example #8
    def coef_p_rsqr_white_values(self, results):
        '''
        Populate the parameter dictionary.
        '''
        white = het_white(results.resid, exog=results.model.exog)[1]
        self.para_dict["f_p"] = results.f_pvalue
        self.para_dict["const_coef"] = results.params[0]
        self.para_dict["x1_coef"] = results.params[1]
        self.para_dict["const_p"] = results.pvalues[0]
        self.para_dict["x1_p"] = results.pvalues[1]
        self.para_dict["r_squared"] = results.rsquared_adj  # adjusted R-squared, despite the key name
        self.para_dict["white_p"] = white
Example #9
def assess_heteroskedasticity(df, col):
    """ Assess heteroskedasticity between selected column and Transit Score using the White test """

    f = '{}~TRANSIT_SCORE'.format(col)  # R-like formula for statsmodels
    model = ols(formula=f, data=df).fit()
    white_test = het_white(model.resid, model.model.exog)
    labels = [
        'LM Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value'
    ]
    print(dict(zip(labels, white_test)))
    print("\n")
Example #10
from collections import namedtuple

import pandas as pd
from statsmodels.stats import diagnostic
from statsmodels.tools import add_constant


def het_white(y_true: pd.Series, resid: pd.Series):
    """
    White's Lagrange Multiplier Test for Heteroscedasticity.

    Null hypothesis:
    No heteroscedasticity of y_pred with respect to y_true.

    Note: this does not imply no serial correlation.

    Alternative hypothesis:
    Heteroscedasticity exists for y_pred with respect to y_true.

    References:
    * https://www.mathworks.com/help/econ/archtest.html
    * https://www.mathworks.com/help/econ/engles-arch-test.html

    Definition: heteroscedasticity means that the variance of a time series
    is not constant over time, so the variance over the sliding window
    t,...,t+i will differ from that over the window t+i+1,...,t+j,
    where t is the initial time index and i, j are integers.

    Parameters
    ----------
    y_true : pd.Series
        observed values
    resid : pd.Series
        y_pred - y_true values

    Returns
    -------
    HetWhiteResult
        namedtuple holding the Lagrange multiplier test statistic
        and its p-value
    """
    # het_white expects a design matrix that includes a constant
    result = diagnostic.het_white(resid, add_constant(y_true))
    HetWhiteResult = namedtuple('HetWhiteResult', 'statistic pvalue')
    return HetWhiteResult(result[0], result[1])
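A usage sketch with synthetic series (the growing-variance residuals are illustrative only):

import numpy as np

y = pd.Series(np.random.normal(size=100))
r = pd.Series(np.random.normal(size=100) * np.linspace(1.0, 3.0, 100))  # variance grows with t
print(het_white(y, r))  # a small pvalue flags heteroscedasticity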
Example #11
import numpy as np
import statsmodels.stats.diagnostic as smd
import statsmodels.tools.tools


def white_test(regression_dict, results_dict):
    # 'dataset' is a module-level DataFrame; "Model 8" selects the columns used
    data = dataset[regression_dict["Model 8"]]
    data = data.replace([np.inf, -np.inf], np.nan)
    data = data.dropna()
    Xes = [
        'Block_3', 'Block_5', 'Block_10', 'Block_20', 'Block_30', 'GROUP_A',
        'GROUP_F', 'GROUP_S', 'Leverage_T-1', 'SalesGrowth',
        'MarketToBookValue_T-1', 'ReturnOnAssets_T-1', 'LOG(MarketCap_T-1)',
        'LOG(NetSales_T-1)', 'LOG(TurnoverByVolume)'
    ]
    for exog in Xes:
        white = smd.het_white(results_dict["Model 8"].resid,
                              statsmodels.tools.tools.add_constant(data[exog]))
        if white[1] > 0.05:
            print("Factor: {}\t--> Homoscedasticity\t :)".format(exog))
        else:
            print("Factor: {}\t--> HETEROSCEDASTICITY :(".format(exog))
Example #12
from scipy import stats
from statsmodels.stats.diagnostic import het_white


def White_het(r, e):
    '''White test for heteroskedasticity

    Keyword arguments:
    r -- the fitted OLS results object
    e -- the residuals of that model
    '''

    hw = het_white(e, r.model.exog)  #White test
    tblv = stats.chi2.ppf(0.95, (len(e) - 2))  #Critical chi-squared value (unused below)

    # a small p-value rejects the null of homoskedasticity
    if hw[1] < 0.05:
        print('White Test[+]: Model has heteroskedasticity\nPvalue Wtest = ' +
              str(hw[1]) + ' < 0.05\n')
        return '+'
    else:
        print('White Test[-]: Model has no heteroskedasticity\nPvalue Wtest = ' +
              str(hw[1]) + ' >= 0.05\n')
        return '-'
Example #13
def heteroskedasticity_test(X, form):
    """
    Run a White test and a Breusch-Pagan test to check the model for
    heteroscedasticity.

    Parameters:
        X (DataFrame): dataframe with the exogenous variables.
        form (str): formula string to use for the regression.

    Returns:
        Nothing.
    """

    expenditure_model = ols(formula=form, data=X).fit()
    labels = [
        'LM Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value'
    ]
    white_test = het_white(expenditure_model.resid,
                           expenditure_model.model.exog)
    bp_test = het_breuschpagan(expenditure_model.resid,
                               expenditure_model.model.exog)
    print("---Test de White---")
    print(dict(zip(labels, white_test)))

    print("---Test de BP---")
    print(dict(zip(labels, bp_test)))
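A usage sketch against the statecrime dataset bundled with statsmodels (the formula is illustrative):

import statsmodels.api as sm

crime = sm.datasets.statecrime.load_pandas().data
heteroskedasticity_test(crime, 'violent ~ hs_grad + poverty')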
Example #14
def error_analisis(result, plot=False):
    '''
    Inputs:
        result: Results from Stats after model.fit()
        plot: True if we want a plot
    Returns:
        Prints a statistical analysis of the regression errors, covering: autocorrelation, \
        heteroskedasticity, stationarity and normality
    '''
    #Autocorrelation
    print('----------Durbin Watson-------------')
    out = durbin_watson(result.resid)
    print('Durbin Watson is: ' + str(out))

    if plot:
        qqplot(result.resid, line='s')
        plt.show()

    print('--------Breusch Autocorr-----------')

    try:
        bre = acorr_breusch_godfrey(result, nlags=12)

        print('lm: ' + str(bre[0]))
        print('lmpval: ' + str(bre[1]))
        print('fval: ' + str(bre[2]))
        print('fpval: ' + str(bre[3]))

        if bre[1] < 0.05:
            print('Evidence of autocorrelation')
        else:
            print('No evidence of autocorrelation')
    except Exception:
        print("Can't calculate statistic")

    print('-----White Heteroskedasticity------')

    white_test = het_white(result.resid, result.model.exog)

    labels = [
        'LM Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value'
    ]
    print(dict(zip(labels, white_test)))

    if white_test[1] < 0.05:
        print('Evidence of heteroskedasticity')
    else:
        print('No evidence of heteroskedasticity')

    print('----------ADF Test-----------------')
    try:
        DFtest(result.resid)
    except Exception:
        print("Can't calculate ADF test")

    print('----------Shapiro Normality--------')
    stat, p = shapiro(result.resid)
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    # interpret
    alpha = 0.05
    if p > alpha:
        print('Sample looks Gaussian (fail to reject H0)')
    else:
        print('Sample does not look Gaussian (reject H0)')

    if plot:
        residuals = pd.DataFrame(result.resid)
        residuals.plot(kind='kde')
        plt.show()
Example #15
# Check
stat2, p2 = ttest_ind(sep_ret["mktret"], no_sep_ret["mktret"])
print("p-value={}".format(p2 / 2))

#%%
# An OLS regression of monthly S&P500 returns as dependent variable and two dummy independent variables

mkt["djan"] = np.where(mkt["Date"].dt.month == 1, 1, 0)
mkt["dsep"] = np.where(mkt["Date"].dt.month == 9, 1, 0)

# 5. OLS regression of mktret against the January and September dummies
mktreg = smf.ols(formula='mktret ~ djan + dsep', data=mkt).fit()
print(mktreg.summary(), '\n' * 5)

#White test for heteroskedasticity
het_test = sd.het_white(mktreg.resid, mktreg.model.exog)
labels = ['LM-Statistic', 'LM p-value', 'F-Statistic', 'F-Test p-value']
print('White test for heteroskedasticity:\n', pd.DataFrame([labels, het_test]),
      '\n' * 5)

#OLS regression with heteroskedasticity-consistent std errors (HC3)
mktreg_het = smf.ols(formula='mktret ~ djan + dsep',
                     data=mkt).fit(cov_type='HC3')
print(
    'OLS regression of mktret against dummy variables under heteroskedasticity-consistent std errors:\n',
    mktreg_het.summary(), '\n' * 5)


# Define a function to run OLS regression with autocorrelation-consistent std errors
#    with various lags (HAC)
def mktreg_autocor(nlags):
    # body truncated in the original; a minimal completion based on the
    # comment above (HAC standard errors with the given number of lags)
    return smf.ols(formula='mktret ~ djan + dsep',
                   data=mkt).fit(cov_type='HAC', cov_kwds={'maxlags': nlags})
Example #16
version = sys.argv[5]
output_file = sys.argv[6]

np.random.seed(seed)

Y, G, betas, mafs, G_raw = generate_null_eqtl_factorization_data_with_no_sample_repeat_and_first_test_has_heteroskedasticity(
    num_samples, num_tests, af, version)
Y_resid, beta_estimated, corrz = get_residuals(Y, G)
Y_pred = Y - Y_resid

heteroskedastic_pvalz = []

for test_iter in range(Y_pred.shape[1]):
    exog = np.hstack((np.ones((len(Y_pred[:, test_iter]), 1)),
                      Y_pred[:, test_iter].reshape(-1, 1)))
    _, pval, __, f_pval = diagnostic.het_white(Y_resid[:, test_iter], exog)
    heteroskedastic_pvalz.append(f_pval)
heteroskedastic_pvalz = np.asarray(heteroskedastic_pvalz)

# Dummy variable
Z = np.ones(num_samples)
cov = np.ones((num_samples, 1))

eqtl_vi = eqtl_factorization_ard_no_alpha.EQTL_FACTORIZATION_VI(K=3,
                                                                alpha=1e-3,
                                                                beta=1e-3,
                                                                a=1,
                                                                b=1,
                                                                max_iter=20,
                                                                gamma_v=1)
eqtl_vi.fit(G=G, Y=Y, z=Z, cov=cov)
Example #17
print(
    name_jb[1], ' ', test_jb[1], ' ',
    name_jb[2], ' ', test_jb[2], ' ',
    name_jb[3], ' ', test_jb[3],
)
print('     ')
# Lets try some more test_result  --- TEST FOR HETEROSCEDASTICITY "WHITE's test"
print('WHITE TEST')
white_t = ssd.het_white(model.resid, model.model.exog)  # retres kwarg no longer exists in statsmodels
print(white_t[1])
stat_1 = white_t[1]
print("White's test for hetero using LM test: ", stat_1,
      "White's test for hetero using F-stat :", white_t[3])
print(' ')

#Next test :ARCH For heteroscedasticity
print('ARCH TEST')
arch = ssd.het_arch(model.resid,
                    nlags=2,  # maxlag/autolag/regresults were removed in newer statsmodels
                    store=False,
                    ddof=0)
print('LM test stat:', arch[0], 'LM P value:', arch[1], 'F test stat:',
      arch[2], 'F P value:', arch[3])
Example #18

print("e. Regress log-wages on education, age, age2, and the gender and racial indicators, using the “robust” option in STATA to calculate the Eicker-White consistent standard errors. Explain briefly how these estimates of the standard errors are corrected for heteroskedasticity. How do they compare to the “uncorrected” (conventional) least squares estimates of the standard errors. Is there any evidence of heteroskedasticity?\n\n\n")

lm_lw = smf.ols(formula='lwage ~ educ + age + age2 + female + white', data = df).fit()
print(lm_lw.summary())
print("\n\n\n normal standard errors")
print(lm_lw.bse)
print("\n\n\n white standard errors")
print(lm_lw.HC0_se)

import statsmodels.stats.diagnostic as ssd

df['intercept']=1
X = df[['educ','intercept','age','age2','female','white']]
white = ssd.het_white(lm_lw.resid, X)[3]  # retres kwarg no longer exists in statsmodels

print("\n\n\n p-value of the f-statistic of the hypothesis that the error variance does not depend on x: ")
print(white)

print("\n\n\n Answer: White standard errors are slightly larger for intercept, educ, age, and age2. They are slightly smaller for female and white. Estimates of the standard errors are corrected for heteroskedasticity by allowing them to vary with x values. The White test shows evidence of heteroskedasticity. The p-value of the f-statistic of the hypothesis that the error variance does not depend on x is .0004.\n\n\n")

print("f. Using the “predict” STATA command [predict (var. name), residual], save the residuals from both the wage and log-wage regressions. Now regress the squared values of the residuals from the two sets of regressions on education, age, age2, female, and white. From the R-squareds of these regressions, test for heteroskedasticity in the two sets of residuals. Does one set of residuals appear to be more heteroskedastic than the other?\n\n\n")

df['res_lwage2'] = lm_lw.resid**2
df['res_lwage'] = lm_lw.resid
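The R-squared test described in part (f) is the LM statistic n*R^2 from the auxiliary regression; a minimal sketch, reusing df and the residuals saved above:

import scipy.stats as st

aux = smf.ols(formula='res_lwage2 ~ educ + age + age2 + female + white', data=df).fit()
lm_stat = len(df) * aux.rsquared           # n * R^2 ~ chi2(k) under homoskedasticity
print(lm_stat, st.chi2.sf(lm_stat, df=5))  # k = 5 regressors in the auxiliary model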

# Scatterplot 
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(df.educ, df.res_lwage2)
Example #19
 total_loading = total_regression.Get_total_loading(ret, tech_data, rm,
                                                    macro_data,
                                                    finance_loading[2015],
                                                    dummy)
 total_loading = pd.DataFrame(total_loading)
 X = total_loading.iloc[:, 0]
 X = sm.add_constant(X)
 count = 0
 number_of_het = 0
 het_pvalue = np.zeros([132, 1])
 for i in range(132):
     temp_Y = Y.iloc[:, i]
     temp_X = X.iloc[:, (0, 1)]
     model = sm.OLS(temp_Y, temp_X).fit()
     temp_resid = model.resid
     test_temp = ss.het_white(temp_resid, temp_X)
     het_pvalue[count, :] = test_temp[1]
     if test_temp[1] < 0.1:  # a small p-value rejects homoskedasticity
         number_of_het += 1
     count += 1
 print("Number of heteroskedastic regressions: ", number_of_het)
 del het_pvalue, temp_X, temp_Y, model, temp_resid, count
 del finance_temp, finance_loading_temp, interval, macro_data, mkt, mkt_temp
 #Heteroscedasticity is fairly severe, so all regressions below use weighted least squares
 # Track the index of the current month
 count = 0
 WLS_Weight = dict()
 for i in range(time.shape[0]):  # data for year i, from 2004 to 2014
     X = pd.DataFrame(loading[2004 + i])
     Xtemp = sm.add_constant(X.iloc[:, 0])
     print('Regression on Year', 2000 + i)
Example #20
plt.ylabel('Residual counts',fontsize = 13)
    
plt.plot(RESID[k])
plt.ylabel('Residual amplitude',fontsize = 13)
plt.xlabel('Week',fontsize = 13)

import statsmodels.api as sm
sm.graphics.tsa.plot_acf(RESID[k],lags=25)
np.array(abs(RESID[k])).sum()/(len(RESID[k]))

from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.diagnostic import het_white

df_resid = pd.DataFrame(RESID[k], columns=['resid'])
X = np.concatenate((np.ones(30).reshape(-1, 1), x[-30:, :]), axis=1)
white = het_white(list(df_resid.resid), X)
breuschpagan = het_breuschpagan(list(df_resid.resid), X)

from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.diagnostic import het_white
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

statecrime_df = sm.datasets.statecrime.load_pandas().data
f ='violent~hs_grad+poverty+single+urban'
statecrime_model = ols(formula=f, data=statecrime_df).fit()
# note: het_white expects an exog matrix that includes a constant column
white_test = het_white(np.array(statecrime_model.resid)[-30:], x[-30:, :])
np.array(statecrime_model.model.exog)[-30:, :]
Example #21

    in the presence of heteroscedasticity; otherwise we have homoscedasticity.
    The hypotheses are therefore
    H0: B1=B2=0
    H1: B1!=B2!=0'''

data["e_cuadrado"] = modelo.resid**2
data["x_cuadrado"] = data["x"]**2
formula2 = "e_cuadrado~x+x_cuadrado"
modelo2 = ols(formula=formula2, data=data).fit()
modelo2.summary()
'''Looking at the significance test, the p-value of the statistic
is 0.308, so we would not reject the homoscedasticity hypothesis '''
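The same auxiliary regression can be read as an LM test; a minimal sketch, reusing data and modelo2 from above:

from scipy import stats

lm = len(data) * modelo2.rsquared   # n * R^2 ~ chi2(2) under homoscedasticity
print(lm, stats.chi2.sf(lm, df=2))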

#But we can do the same thing with statsmodels' het_white
from statsmodels.stats.diagnostic import het_white
het_white(modelo.resid, modelo.model.exog)[
    3]  #This line directly gives the p-value of the F statistic
#It is the same value obtained from the auxiliary model, so it leads to the same conclusion
''' The other OLS assumption is that the errors are not
autocorrelated. For that we will use the Durbin-Watson test:
we compute the statistic manually and then with statsmodels
'''
list2 = []
for elemento1, elemento2 in zip(
        list(modelo.resid)[1:],
        list(modelo.resid)[0:-1]):
    list2.append((elemento1 - elemento2)**2)

EstadisticoDW = sum(list2) / sum(data["e_cuadrado"])
print(str(EstadisticoDW))  # the Durbin-Watson statistic

from statsmodels.stats.stattools import durbin_watson
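The snippet stops at the import; one call completes the comparison with the manual statistic above:

print(durbin_watson(modelo.resid))  # should match EstadisticoDW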
Example #22
from statsmodels.tools import add_constant


def heteroskedacity_test(data, rejection_alpha=0.05):
    res = get_residuals(data)  # get_residuals is defined elsewhere in the project
    # het_white needs a constant column in exog
    exog = add_constant(data.drop('y', axis=1))
    het_test_results = het_white(resid=res, exog=exog)
    # comparing the F-test p-value to alpha; True means heteroskedastic
    return het_test_results[-1] < rejection_alpha
Example #23
formula = "Y~X"
model = ols(formula = formula, data = dataset).fit()

plp.scatter(dataset["X"], dataset["Y"], color = "red", marker="o")
plp.plot(dataset["X"], list(modelo.params)[0] + list(modelo.params)[1]*dataset["X"], color="violet", label="lineregression")
plp.title("model")

#We will to draw the residuals, for to know if the model presents heterocedasticity
plp.scatter(dataset["X"], model.resid)

#The residuals follows a patron, and we can to suspicious about heterocedasticity

usquares = model.resid**2
formula = "y ~ x1"
dataset1 = pd.DataFrame(data = {"y": np.log(usquares), "x1": np.log(dataset["X"])})
model1 = ols(formula = formula, data = dataset1).fit()
model1.summary()

#Seems be that the model not presents heterocedasticity, but, we can to do another proofs, for example, the breush-pagan-godfrey proof

umean = np.mean(modelo.resid**2)
dataset["pi"] = modelo.resid**2/umean
formula = "pi~X"
model2 = ols(formula = formula, data = dataset).fit()
model2.summary() #The x coef is not significative, therefore, the model no presents heterocedasticity

#We have the same conclution if we do the proof white
from statsmodels.stats.diagnostic import het_white
het_white(model1.resid, model1.model.exog)[3]
input()
Example #24
    def test_all(self):

        d = macrodata.load().data
        #import datasetswsm.greene as g
        #d = g.load('5-1')

        #growth rates
        gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
        gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

        #simple diff, not growthrate, I want heteroscedasticity later for testing
        endogd = np.diff(d['realinv'])
        exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]],
                            prepend=True)

        endogg = gs_l_realinv
        exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]],prepend=True)

        res_ols = OLS(endogg, exogg).fit()
        #print res_ols.params

        mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
        res_g1 = mod_g1.fit()
        #print res_g1.params

        mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)   #-0.1335859) from R
        res_g2 = mod_g2.iterative_fit(maxiter=5)
        #print res_g2.params


        rho = -0.108136

        #                 coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        partable = np.array([
                        [-9.50990,  0.990456, -9.602, 3.65e-018, -11.4631, -7.55670], # ***
                        [ 4.37040,  0.208146, 21.00,  2.93e-052,  3.95993, 4.78086], # ***
                        [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346]]) #    **

        #Statistics based on the rho-differenced data:

        result_gretl_g1 = dict(
        endog_mean = ("Mean dependent var",   3.113973),
        endog_std = ("S.D. dependent var",   18.67447),
        ssr = ("Sum squared resid",    22530.90),
        mse_resid_sqrt = ("S.E. of regression",   10.66735),
        rsquared = ("R-squared",            0.676973),
        rsquared_adj = ("Adjusted R-squared",   0.673710),
        fvalue = ("F(2, 198)",            221.0475),
        f_pvalue = ("P-value(F)",           3.56e-51),
        resid_acf1 = ("rho",                 -0.003481),
        dw = ("Durbin-Watson",        1.993858))


        #fstatistic, p-value, df1, df2
        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]
        #LM-statistic, p-value, df
        arch_4 = [7.30776, 0.120491, 4, "chi2"]

        #multicollinearity
        vif = [1.002, 1.002]
        cond_1norm = 6862.0664
        determinant = 1.0296049e+009
        reciprocal_condition_number = 0.013819244

        #Chi-square(2): test-statistic, pvalue, df
        normality = [20.2792, 3.94837e-005, 2]

        #tests
        res = res_g1  #with rho from Gretl

        #basic

        assert_almost_equal(res.params, partable[:,0], 4)
        assert_almost_equal(res.bse, partable[:,1], 6)
        assert_almost_equal(res.tvalues, partable[:,2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
        assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=2)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        #tests
        res = res_g2 #with estimated rho

        #estimated lag coefficient
        assert_almost_equal(res.model.rho, rho, decimal=3)

        #basic
        assert_almost_equal(res.params, partable[:,0], 4)
        assert_almost_equal(res.bse, partable[:,1], 3)
        assert_almost_equal(res.tvalues, partable[:,2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
        assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO



        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(2,4))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(2,4))

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)



        '''
        Performing iterative calculation of rho...

                         ITER       RHO        ESS
                           1     -0.10734   22530.9
                           2     -0.10814   22530.9

        Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv
        rho = -0.108136

                         coefficient   std. error   t-ratio    p-value
          -------------------------------------------------------------
          const           -9.50990      0.990456    -9.602    3.65e-018 ***
          ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
          realint_1       -0.579253     0.268009    -2.161    0.0319    **

        Statistics based on the rho-differenced data:

        Mean dependent var   3.113973   S.D. dependent var   18.67447
        Sum squared resid    22530.90   S.E. of regression   10.66735
        R-squared            0.676973   Adjusted R-squared   0.673710
        F(2, 198)            221.0475   P-value(F)           3.56e-51
        rho                 -0.003481   Durbin-Watson        1.993858
        '''

        '''
        RESET test for specification (squares and cubes)
        Test statistic: F = 5.219019,
        with p-value = P(F(2,197) > 5.21902) = 0.00619

        RESET test for specification (squares only)
        Test statistic: F = 7.268492,
        with p-value = P(F(1,198) > 7.26849) = 0.00762

        RESET test for specification (cubes only)
        Test statistic: F = 5.248951,
        with p-value = P(F(1,198) > 5.24895) = 0.023:
        '''

        '''
        Test for ARCH of order 4

                     coefficient   std. error   t-ratio   p-value
          --------------------------------------------------------
          alpha(0)   97.0386       20.3234       4.775    3.56e-06 ***
          alpha(1)    0.176114      0.0714698    2.464    0.0146   **
          alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
          alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
          alpha(4)    0.0384531     0.0725763    0.5298   0.5968

          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491:
        '''

        '''
        Variance Inflation Factors

        Minimum possible value = 1.0
        Values > 10.0 may indicate a collinearity problem

           ds_l_realgdp    1.002
              realint_1    1.002

        VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient
        between variable j and the other independent variables

        Properties of matrix X'X:

         1-norm = 6862.0664
         Determinant = 1.0296049e+009
         Reciprocal condition number = 0.013819244
        '''
        '''
        Test for ARCH of order 4 -
          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491

        Test of common factor restriction -
          Null hypothesis: restriction is acceptable
          Test statistic: F(2, 195) = 0.426391
          with p-value = P(F(2, 195) > 0.426391) = 0.653468

        Test for normality of residual -
          Null hypothesis: error is normally distributed
          Test statistic: Chi-square(2) = 20.2792
          with p-value = 3.94837e-005:
        '''

        #no idea what this is
        '''
        Augmented regression for common factor test
        OLS, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv

                           coefficient   std. error   t-ratio    p-value
          ---------------------------------------------------------------
          const            -10.9481      1.35807      -8.062    7.44e-014 ***
          ds_l_realgdp       4.28893     0.229459     18.69     2.40e-045 ***
          realint_1         -0.662644    0.334872     -1.979    0.0492    **
          ds_l_realinv_1    -0.108892    0.0715042    -1.523    0.1294
          ds_l_realgdp_1     0.660443    0.390372      1.692    0.0923    *
          realint_2          0.0769695   0.341527      0.2254   0.8219

          Sum of squared residuals = 22432.8

        Test of common factor restriction

          Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
        '''


        ################ with OLS, HAC errors

        #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
        #Dependent variable: ds_l_realinv
        #HAC standard errors, bandwidth 4 (Bartlett kernel)

        #coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        #for confidence interval t(199, 0.025) = 1.972

        partable = np.array([
        [-9.48167,      1.17709,     -8.055,    7.17e-014, -11.8029, -7.16049], # ***
        [4.37422,      0.328787,    13.30,     2.62e-029, 3.72587, 5.02258], #***
        [-0.613997,     0.293619,    -2.091,    0.0378, -1.19300, -0.0349939]]) # **

        result_gretl_g1 = dict(
                    endog_mean = ("Mean dependent var",   3.257395),
                    endog_std = ("S.D. dependent var",   18.73915),
                    ssr = ("Sum squared resid",    22799.68),
                    mse_resid_sqrt = ("S.E. of regression",   10.70380),
                    rsquared = ("R-squared",            0.676978),
                    rsquared_adj = ("Adjusted R-squared",   0.673731),
                    fvalue = ("F(2, 199)",            90.79971),
                    f_pvalue = ("P-value(F)",           9.53e-29),
                    llf = ("Log-likelihood",      -763.9752),
                    aic = ("Akaike criterion",     1533.950),
                    bic = ("Schwarz criterion",    1543.875),
                    hqic = ("Hannan-Quinn",         1537.966),
                    resid_acf1 = ("rho",                 -0.107341),
                    dw = ("Durbin-Watson",        2.213805))

        linear_logs = [1.68351, 0.430953, 2, "chi2"]
        #for logs: dropping 70 nan or incomplete observations, T=133
        #(res_ols.model.exog <=0).any(1).sum() = 69  ?not 70
        linear_squares = [7.52477, 0.0232283, 2, "chi2"]

        #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
        lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
        lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
        acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

        #break
        cusum_Harvey_Collier  = [0.494432, 0.621549, 198, "t"] #stats.t.sf(0.494432, 198)*2
        #see cusum results in files
        break_qlr = [3.01985, 0.1, 3, 196, "maxF"]  #TODO check this, max at 2001:4
        break_chow = [13.1897, 0.00424384, 3, "chi2"] # break at 1984:1

        arch_4 = [3.43473, 0.487871, 4, "chi2"]

        normality = [23.962, 0.00001, 2, "chi2"]

        het_white = [33.503723, 0.000003, 5, "chi2"]
        het_breusch_pagan = [1.302014, 0.521520, 2, "chi2"]  #TODO: not available
        het_breusch_pagan_konker = [0.709924, 0.701200, 2, "chi2"]


        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]  #not available

        cond_1norm = 5984.0525
        determinant = 7.1087467e+008
        reciprocal_condition_number = 0.013826504
        vif = [1.001, 1.001]

        names = 'date   residual        leverage       influence        DFFITS'.split()
        cur_dir = os.path.abspath(os.path.dirname(__file__))
        fpath = os.path.join(cur_dir, 'results/leverage_influence_ols_nostars.txt')
        lev = np.genfromtxt(fpath, skip_header=3, skip_footer=1,
                            converters={0:lambda s: s})
        #either numpy 1.6 or python 3.2 changed behavior
        if np.isnan(lev[-1]['f1']):
            lev = np.genfromtxt(fpath, skip_header=3, skip_footer=2,
                                converters={0:lambda s: s})

        lev.dtype.names = names

        res = res_ols #for easier copying

        cov_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)
        bse_hac =  sw.se_cov(cov_hac)

        assert_almost_equal(res.params, partable[:,0], 5)
        assert_almost_equal(bse_hac, partable[:,1], 5)
        #TODO

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=6) #FAIL
        assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=6) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        #f-value is based on cov_hac I guess
        #assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
        #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO


        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(6,5))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(6,5))

        linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
        assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
        assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

        hbpk = smsdia.het_breuschpagan(res.resid, res.model.exog)
        assert_almost_equal(hbpk[0], het_breusch_pagan_konker[0], decimal=6)
        assert_almost_equal(hbpk[1], het_breusch_pagan_konker[1], decimal=6)

        hw = smsdia.het_white(res.resid, res.model.exog)
        assert_almost_equal(hw[:2], het_white[:2], 6)

        #arch
        #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.resid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        vif2 = [oi.variance_inflation_factor(res.model.exog, k) for k in [1,2]]

        infl = oi.OLSInfluence(res_ols)
        #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
        #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
        #print np.max(np.abs(lev['influence'] - infl.influence))  #just added this based on Gretl

        #just rough test, low decimal in Gretl output,
        assert_almost_equal(lev['residual'], res.resid, decimal=3)
        assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
        assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
        assert_almost_equal(lev['influence'], infl.influence, decimal=4)
Example #25
diag.het_goldfeldquandt(modelMR2.resid, modelMR2.model.exog)
diag.het_breuschpagan(modelMR4.resid, modelMR4.model.exog)
diag.het_white(modelMR2.resid, modelMR2.model.exog)


diag.acorr_ljungbox(modelMR2.resid)
Example #26



    def leastSquared(_Y,
                     _X,
                     preprocessing=True,
                     figid='OLS',
                     path='./TAQImpactOutput/'):
        """
        Perform the least squared error fitting using scipy optimization tools
        :param _Y: Y variable
        :param _X: X variable
        :param preprocessing: whether to preprocess or not
        :param figid: figure - id
        :param path: the output path
        :return: a OptRes object to store the results
        """
        from statsmodels.stats.diagnostic import het_white

        # preprocessing
        if preprocessing:
            _Y, _X = TAQImpactAnalysis.negateObservations(
                *TAQImpactAnalysis.eliminateOutliers(_Y, _X))
        func = lambda params: sum((_Y - params[0] * (_X**params[1]))**2)

        # Output the data in scatterplots
        plt.figure()
        plt.scatter(_X, _Y, alpha=0.5)
        plt.title(r'Scatter plots of $\frac{h}{\sigma}$-$\frac{X}{VT}$')
        plt.savefig(path + figid + '_scatter_xy' + '.png')
        plt.close()

        #do optimization

        res = minimize(func, np.array([0.1, 0.6]))
        eta, beta = res.x
        resids = (_Y - eta * (_X**beta))

        # Output the residuals distribution
        plt.figure()
        plt.hist(resids)
        plt.title('Distribution of residuals')
        plt.savefig(path + figid + '_resid_dist' + '.png')
        plt.close()

        # Check heteroscedasticity

        # White test: a small p-value rejects homoscedasticity
        het_p = het_white(resids, sm.add_constant(_X))[3]
        print('Het with p value {0}'.format(het_p) if het_p < .01
              else 'No het with p value {0}'.format(het_p))

        _x_high = max(_X)

        # Plot fitted curve

        plt.figure()
        plt.scatter(_X, _Y, alpha=.3)
        plt.plot(np.arange(0, _x_high, 0.0001),
                 eta * (np.arange(0, _x_high, 0.0001))**beta,
                 c='orange')
        plt.xlabel(r'$\frac{X}{VT}$')
        plt.ylabel(r'$\frac{h}{\sigma}$')
        plt.legend(['Fitted curve'])
        plt.title('Fitted Curve of non-linear regression')
        plt.savefig(path + figid + '_curve_fit' + '.png')
        plt.close()

        # calculate T-values

        tvalues = TAQImpactAnalysis.calTStats(_Y, _X, resids)
        return TAQImpactAnalysis.OptRes(
            (eta, beta), [i / j for i, j in zip((eta, beta), tvalues)])
Example #27
#Money Supply

meanMS = np.mean(DataUse.MS)
sdMS = np.std(DataUse.MS)
varMS = np.var(DataUse.MS)


#Create Model

Model = sma.wls('inflation ~ foreign + MS', work).fit()

print(Model.summary())

#Weighted Least Squares used to fix heteroscedasticity

#Test the model

# test against the model's own design matrix, not the raw dataframe
Heteroscedasticity = ds.het_white(Model.resid, exog=Model.model.exog)

print('F-statistic %r' % Heteroscedasticity[2])
print('Prob,F %f' % Heteroscedasticity[3])
print('Chi-Square %s' % Heteroscedasticity[0])
print('Prob,Chi-Square %g' % Heteroscedasticity[1])
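The comment above mentions WLS as the fix but passes no weights; a sketch of one common heuristic (inverse squared residuals; an assumption, not from the original):

w = 1.0 / (Model.resid ** 2)            # illustrative weights only
Model_w = sma.wls('inflation ~ foreign + MS', work, weights=w).fit()
print(Model_w.summary())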

Autocorrelation = ds.acorr_breusch_godfrey(Model, nlags=(2))

print('F-statistic %r' % Autocorrelation[2])
print('Prob,F %f' % Autocorrelation[3])
print('Chi-Square %s' % Autocorrelation[0])
print('Prob,Chi-Square %g' % Autocorrelation[1])
Example #28
         row = 60+i*12+j 
         Y.iloc[:,count] = ret.iloc[row,:-3]
         count+=1
 total_loading = total_regression.Get_total_loading(ret,tech_data,rm,macro_data,finance_loading[2015],dummy)
 total_loading = pd.DataFrame(total_loading)
 X = total_loading.iloc[:,0]
 X = sm.add_constant(X)
 count = 0
 number_of_het = 0
 het_pvalue = np.zeros([132,1])
 for i in range(132):
     temp_Y = Y.iloc[:,i]
     temp_X = X.iloc[:,(0,1)]
     model = sm.OLS(temp_Y,temp_X).fit()
     temp_resid = model.resid
     test_temp = ss.het_white(temp_resid,temp_X)
     het_pvalue[count,:] = test_temp[1]
     if test_temp[1] < 0.1:  # a small p-value rejects homoskedasticity
         number_of_het += 1
     count += 1
 print("Number of heteroskedastic regressions: ", number_of_het)
 del het_pvalue, temp_X, temp_Y, model, temp_resid, count
 del finance_temp, finance_loading_temp, interval, macro_data, mkt, mkt_temp
 #Heteroscedasticity is fairly severe, so all regressions below use weighted least squares
 # Track the index of the current month
 count = 0
 WLS_Weight = dict()
 for i in range(time.shape[0]):  # data for year i, from 2004 to 2014
     X = pd.DataFrame(loading[2004+i])
     Xtemp = sm.add_constant(X.iloc[:,0])
     print ('Regression on Year',2000+i)
Example #29
ax = sns.distplot(mdf.resid,  # assumed target: the original snippet begins mid-call
                  hist=False,
                  kde_kws={
                      "shade": True,
                      "lw": 1
                  },
                  fit=st.norm)
ax.set_xlabel("Residuals")
plt.show()

labels = ["Statistic", "p-value"]
norm_res = st.shapiro(mdf.resid)
for key, val in dict(zip(labels, norm_res)).items():
    print(key, val)

fig = plt.figure(figsize=(16, 9))
ax = sns.scatterplot(y=mdf.resid, x=mdf.fittedvalues)
ax.set_xlabel("Fitted Values")
ax.set_ylabel("Residuals")
plt.show()

het_white_res = het_white(mdf.resid, mdf.model.exog)
labels = ["LM Statistic", "LM-Test p-value", "F-Statistic", "F-Test p-value"]
for key, val in dict(zip(labels, het_white_res)).items():
    print(key, val)

fig = plt.figure(figsize=(16, 9))
ax = sns.boxplot(x=mdf.model.groups, y=mdf.resid)
ax.set_ylabel("Residuals")
ax.set_xlabel("Subjects")
plt.show()
Example #30
    print(" The coeff for {} is {}".format(c[0], c[1]))

# Get predictions
y_predict = reg.predict(X_test)

#define input
X2 = sm.add_constant(X)

# create OLS model
model = sm.OLS(Y, X2)

# fit data
est = model.fit()

# test for heteroscedasticity (want p values over 0.05)
_, pval, _, f_pval = diag.het_white(est.resid, est.model.exog)
print(pval, f_pval)
print('_' * 100)

_, pval, _, f_pval = diag.het_breuschpagan(est.resid, est.model.exog)
print(pval, f_pval)
print('_' * 100)
# p-values greater than 0.05 mean there is no heteroskedasticity

# test for autocorrelation (want p values over 0.05)
lag = min(10, (len(X) // 5))
print("number of lags is {}".format(lag))
# newer statsmodels returns a DataFrame with lb_stat / lb_pvalue columns
lb_result = diag.acorr_ljungbox(est.resid, lags=lag)
print(lb_result["lb_pvalue"].min())
print('_' * 100)
Example #31
    def test_all(self):

        d = macrodata.load().data
        #import datasetswsm.greene as g
        #d = g.load('5-1')

        #growth rates
        gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
        gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

        #simple diff, not growthrate, I want heteroscedasticity later for testing
        endogd = np.diff(d['realinv'])
        exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]])

        endogg = gs_l_realinv
        exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]])

        res_ols = OLS(endogg, exogg).fit()
        #print res_ols.params

        mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
        res_g1 = mod_g1.fit()
        #print res_g1.params

        mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)   #-0.1335859) from R
        res_g2 = mod_g2.iterative_fit(maxiter=5)
        #print res_g2.params


        rho = -0.108136

        #                 coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        partable = np.array([
                        [-9.50990,  0.990456, -9.602, 3.65e-018, -11.4631, -7.55670], # ***
                        [ 4.37040,  0.208146, 21.00,  2.93e-052,  3.95993, 4.78086], # ***
                        [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346]]) #    **

        #Statistics based on the rho-differenced data:

        result_gretl_g1 = dict(
        endog_mean = ("Mean dependent var",   3.113973),
        endog_std = ("S.D. dependent var",   18.67447),
        ssr = ("Sum squared resid",    22530.90),
        mse_resid_sqrt = ("S.E. of regression",   10.66735),
        rsquared = ("R-squared",            0.676973),
        rsquared_adj = ("Adjusted R-squared",   0.673710),
        fvalue = ("F(2, 198)",            221.0475),
        f_pvalue = ("P-value(F)",           3.56e-51),
        resid_acf1 = ("rho",                 -0.003481),
        dw = ("Durbin-Watson",        1.993858))


        #fstatistic, p-value, df1, df2
        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]
        #LM-statistic, p-value, df
        arch_4 = [7.30776, 0.120491, 4, "chi2"]

        #multicollinearity
        vif = [1.002, 1.002]
        cond_1norm = 6862.0664
        determinant = 1.0296049e+009
        reciprocal_condition_number = 0.013819244

        #Chi-square(2): test-statistic, pvalue, df
        normality = [20.2792, 3.94837e-005, 2]

        #tests
        res = res_g1  #with rho from Gretl

        #basic

        assert_almost_equal(res.params, partable[:,0], 4)
        assert_almost_equal(res.bse, partable[:,1], 6)
        assert_almost_equal(res.tvalues, partable[:,2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
        assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=2)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        #tests
        res = res_g2 #with estimated rho

        #estimated lag coefficient
        assert_almost_equal(res.model.rho, rho, decimal=3)

        #basic
        assert_almost_equal(res.params, partable[:,0], 4)
        assert_almost_equal(res.bse, partable[:,1], 3)
        assert_almost_equal(res.tvalues, partable[:,2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
        assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO



        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(2,4))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(2,4))

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)



        '''
        Performing iterative calculation of rho...

                         ITER       RHO        ESS
                           1     -0.10734   22530.9
                           2     -0.10814   22530.9

        Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv
        rho = -0.108136

                         coefficient   std. error   t-ratio    p-value
          -------------------------------------------------------------
          const           -9.50990      0.990456    -9.602    3.65e-018 ***
          ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
          realint_1       -0.579253     0.268009    -2.161    0.0319    **

        Statistics based on the rho-differenced data:

        Mean dependent var   3.113973   S.D. dependent var   18.67447
        Sum squared resid    22530.90   S.E. of regression   10.66735
        R-squared            0.676973   Adjusted R-squared   0.673710
        F(2, 198)            221.0475   P-value(F)           3.56e-51
        rho                 -0.003481   Durbin-Watson        1.993858
        '''

        '''
        RESET test for specification (squares and cubes)
        Test statistic: F = 5.219019,
        with p-value = P(F(2,197) > 5.21902) = 0.00619

        RESET test for specification (squares only)
        Test statistic: F = 7.268492,
        with p-value = P(F(1,198) > 7.26849) = 0.00762

        RESET test for specification (cubes only)
        Test statistic: F = 5.248951,
        with p-value = P(F(1,198) > 5.24895) = 0.023:
        '''

        '''
        Test for ARCH of order 4

                     coefficient   std. error   t-ratio   p-value
          --------------------------------------------------------
          alpha(0)   97.0386       20.3234       4.775    3.56e-06 ***
          alpha(1)    0.176114      0.0714698    2.464    0.0146   **
          alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
          alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
          alpha(4)    0.0384531     0.0725763    0.5298   0.5968

          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491:
        '''

        '''
        Variance Inflation Factors

        Minimum possible value = 1.0
        Values > 10.0 may indicate a collinearity problem

           ds_l_realgdp    1.002
              realint_1    1.002

        VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient
        between variable j and the other independent variables

        Properties of matrix X'X:

         1-norm = 6862.0664
         Determinant = 1.0296049e+009
         Reciprocal condition number = 0.013819244
        '''
        '''
        Test for ARCH of order 4 -
          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491

        Test of common factor restriction -
          Null hypothesis: restriction is acceptable
          Test statistic: F(2, 195) = 0.426391
          with p-value = P(F(2, 195) > 0.426391) = 0.653468

        Test for normality of residual -
          Null hypothesis: error is normally distributed
          Test statistic: Chi-square(2) = 20.2792
          with p-value = 3.94837e-005:
        '''

        #no idea what this is
        '''
        Augmented regression for common factor test
        OLS, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv

                           coefficient   std. error   t-ratio    p-value
          ---------------------------------------------------------------
          const            -10.9481      1.35807      -8.062    7.44e-014 ***
          ds_l_realgdp       4.28893     0.229459     18.69     2.40e-045 ***
          realint_1         -0.662644    0.334872     -1.979    0.0492    **
          ds_l_realinv_1    -0.108892    0.0715042    -1.523    0.1294
          ds_l_realgdp_1     0.660443    0.390372      1.692    0.0923    *
          realint_2          0.0769695   0.341527      0.2254   0.8219

          Sum of squared residuals = 22432.8

        Test of common factor restriction

          Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
        '''


        ################ with OLS, HAC errors

        #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
        #Dependent variable: ds_l_realinv
        #HAC standard errors, bandwidth 4 (Bartlett kernel)

        #coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        #for confidence interval t(199, 0.025) = 1.972

        partable = np.array([
        [-9.48167,      1.17709,     -8.055,    7.17e-014, -11.8029, -7.16049], # ***
        [4.37422,      0.328787,    13.30,     2.62e-029, 3.72587, 5.02258], #***
        [-0.613997,     0.293619,    -2.091,    0.0378, -1.19300, -0.0349939]]) # **

        result_gretl_g1 = dict(
                    endog_mean = ("Mean dependent var",   3.257395),
                    endog_std = ("S.D. dependent var",   18.73915),
                    ssr = ("Sum squared resid",    22799.68),
                    mse_resid_sqrt = ("S.E. of regression",   10.70380),
                    rsquared = ("R-squared",            0.676978),
                    rsquared_adj = ("Adjusted R-squared",   0.673731),
                    fvalue = ("F(2, 199)",            90.79971),
                    f_pvalue = ("P-value(F)",           9.53e-29),
                    llf = ("Log-likelihood",      -763.9752),
                    aic = ("Akaike criterion",     1533.950),
                    bic = ("Schwarz criterion",    1543.875),
                    hqic = ("Hannan-Quinn",         1537.966),
                    resid_acf1 = ("rho",                 -0.107341),
                    dw = ("Durbin-Watson",        2.213805))

        linear_logs = [1.68351, 0.430953, 2, "chi2"]
        #for logs: dropping 70 nan or incomplete observations, T=133
        #(res_ols.model.exog <=0).any(1).sum() = 69  ?not 70
        linear_squares = [7.52477, 0.0232283, 2, "chi2"]

        #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
        lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
        lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
        acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

        #break
        cusum_Harvey_Collier  = [0.494432, 0.621549, 198, "t"] #stats.t.sf(0.494432, 198)*2
        #see cusum results in files
        break_qlr = [3.01985, 0.1, 3, 196, "maxF"]  #TODO check this, max at 2001:4
        break_chow = [13.1897, 0.00424384, 3, "chi2"] # break at 1984:1

        arch_4 = [3.43473, 0.487871, 4, "chi2"]

        normality = [23.962, 0.00001, 2, "chi2"]

        het_white = [33.503723, 0.000003, 5, "chi2"]
        het_breusch_pagan = [1.302014, 0.521520, 2, "chi2"]  #TODO: not available
        het_breusch_pagan_konker = [0.709924, 0.701200, 2, "chi2"]


        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]  #not available

        cond_1norm = 5984.0525
        determinant = 7.1087467e+008
        reciprocal_condition_number = 0.013826504
        vif = [1.001, 1.001]

        names = 'date   residual        leverage       influence        DFFITS'.split()
        cur_dir = os.path.abspath(os.path.dirname(__file__))
        fpath = os.path.join(cur_dir, 'results/leverage_influence_ols_nostars.txt')
        lev = np.genfromtxt(fpath, skip_header=3, skip_footer=1,
                            converters={0:lambda s: s})
        #either numpy 1.6 or python 3.2 changed behavior
        if np.isnan(lev[-1]['f1']):
            lev = np.genfromtxt(fpath, skip_header=3, skip_footer=2,
                                converters={0:lambda s: s})

        lev.dtype.names = names

        res = res_ols #for easier copying

        cov_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)
        bse_hac =  sw.se_cov(cov_hac)

        assert_almost_equal(res.params, partable[:,0], 5)
        assert_almost_equal(bse_hac, partable[:,1], 5)
        #TODO

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=4) #not in gretl
        assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=6) #FAIL
        assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=6) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        #f-value is based on cov_hac I guess
        #res2 = res.get_robustcov_results(cov_type='HC1')
        # TODO: fvalue differs from Gretl, trying any of the HCx
        #assert_almost_equal(res2.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
        #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO


        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(6,5))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(6,5))

        linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
        assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
        assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

        hbpk = smsdia.het_breuschpagan(res.resid, res.model.exog)
        assert_almost_equal(hbpk[0], het_breusch_pagan_konker[0], decimal=6)
        assert_almost_equal(hbpk[1], het_breusch_pagan_konker[1], decimal=6)

        hw = smsdia.het_white(res.resid, res.model.exog)
        assert_almost_equal(hw[:2], het_white[:2], 6)

        #arch
        #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.resid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        vif2 = [oi.variance_inflation_factor(res.model.exog, k) for k in [1,2]]

        infl = oi.OLSInfluence(res_ols)
        #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
        #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
        #print np.max(np.abs(lev['influence'] - infl.influence))  #just added this based on Gretl

        #just rough test, low decimal in Gretl output,
        assert_almost_equal(lev['residual'], res.resid, decimal=3)
        assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
        assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
        assert_almost_equal(lev['influence'], infl.influence, decimal=4)
Example #32
plt.show()

################ autocorrelation and partial autocorrelation
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(tt)
plot_pacf(tt)

######## seasonal decomposing

from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(tt, model='multiplicative')
result.plot()

import numpy as np
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_white

# het_white needs an exog matrix; a constant plus a time trend is one
# simple choice (an assumption here, the original call passed no exog)
resid = result.resid.dropna()  # seasonal_decompose pads the residuals with NaNs
exog = sm.add_constant(np.arange(len(resid)))
white_test = het_white(resid, exog)

######## LSTM model

model = Sequential()
model.add(
    LSTM(48,
         activation="relu",
         recurrent_activation="relu",
         batch_input_shape=(None, timestep, n_features),
         return_sequences=True))
model.add(Dropout(0.2))
model.add(Dense(7))
model.add(LSTM(96, activation="relu", return_sequences=True))
model.add(Dropout(0.3))
model.add(Dense(10))