# <headingcell level=4>

# This block creates the objects for each statistic we'd like printed to Excel.<br>
# <br>
# It will report $R^2$, $R^2 adj,$ residuals, the f p-value, aic, the fitted model parameters, normality of the residuals, the Bruesh-Pagan Test for heteroscedasicity and the Harvey-Collier test for linearity.

# <codecell>

r_squared = model.rsquared
r_square_adj = model.rsquared_adj
residuals = model.resid
p = model.f_pvalue
aic = model.aic
pvalues = pd.DataFrame(model.pvalues)
params = pd.DataFrame(model.params)
normality = sms.jarque_bera(model.resid)
breush_pagan_hska = sms.het_breushpagan(model.resid, model.model.exog)
harvey_collier = sms.linear_harvey_collier(model)

# <headingcell level=4>

# Print the regression results to Excel

# <codecell>

Range("Results", "O6").value = "R^2"
Range("Results", "P6").value = r_squared

Range("Results", "O7").value = "R^2 Adjusted"
Range("Results", "P7").value = r_square_adj
url = 'http://vincentarelbundock.github.io/Rdatasets/csv/HistData/Guerry.csv'
dat = pd.read_csv(url)

# Fit regression model (using the natural log of one of the regressaors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()

# Inspect the results
print(results.summary())


# ## Normality of the residuals

# Jarque-Bera test:

name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(results.resid)
lzip(name, test)


# Omni test:

name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
lzip(name, test)


# ## Influence tests
# 
# Once created, an object of class ``OLSInfluence`` holds attributes and methods that allow users to assess the influence of each observation. For example, we can compute and extract the first few rows of DFbetas by:

from statsmodels.stats.outliers_influence import OLSInfluence
def Fig_OLS_Checks():

    #fs = 10 # font size used across figures
    #color = str()
    #OrC = 'open'

    SampSizes = [5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    Iterations = 100

    fig = plt.figure(figsize=(12, 8))

    # MODEL PARAMETERS
    Rare_MacIntercept_pVals = [] # List to hold coefficient p-values
    Rare_MacIntercept_Coeffs = [] # List to hold coefficients

    Rich_MacIntercept_pVals = []
    Rich_MacIntercept_Coeffs = []

    Dom_MacIntercept_pVals = []
    Dom_MacIntercept_Coeffs = []

    Even_MacIntercept_pVals = []
    Even_MacIntercept_Coeffs = []

    Rare_MicIntercept_pVals = []
    Rare_MicIntercept_Coeffs = []

    Rich_MicIntercept_pVals = []
    Rich_MicIntercept_Coeffs = []

    Dom_MicIntercept_pVals = []
    Dom_MicIntercept_Coeffs = []

    Even_MicIntercept_pVals = []
    Even_MicIntercept_Coeffs = []


    Rare_MacSlope_pVals = []
    Rare_MacSlope_Coeffs = []

    Rich_MacSlope_pVals = []
    Rich_MacSlope_Coeffs = []

    Dom_MacSlope_pVals = []
    Dom_MacSlope_Coeffs = []

    Even_MacSlope_pVals = []
    Even_MacSlope_Coeffs = []

    Rare_MicSlope_pVals = []
    Rare_MicSlope_Coeffs = []

    Rich_MicSlope_pVals = []
    Rich_MicSlope_Coeffs = []

    Dom_MicSlope_pVals = []
    Dom_MicSlope_Coeffs = []

    Even_MicSlope_pVals = []
    Even_MicSlope_Coeffs = []


    RareR2List = [] # List to hold model R2
    RarepFList = [] # List to hold significance of model R2
    RichR2List = [] # List to hold model R2
    RichpFList = [] # List to hold significance of model R2
    DomR2List = [] # List to hold model R2
    DompFList = [] # List to hold significance of model R2
    EvenR2List = [] # List to hold model R2
    EvenpFList = [] # List to hold significance of model R2

    # ASSUMPTIONS OF LINEAR REGRESSION
    # 1. Error in predictor variables is negligible...presumably yes
    # 2. Variables are measured at the continuous level...yes

    # 3. The relationship is linear
    #RarepLinListHC = []
    RarepLinListRainB = []
    RarepLinListLM = []
    #RichpLinListHC = []
    RichpLinListRainB = []
    RichpLinListLM = []
    #DompLinListHC = []
    DompLinListRainB = []
    DompLinListLM = []
    #EvenpLinListHC = []
    EvenpLinListRainB = []
    EvenpLinListLM = []

    # 4. There are no significant outliers...need to find tests or measures

    # 5. Independence of observations (no serial correlation in residuals)
    RarepCorrListBG = []
    RarepCorrListF = []
    RichpCorrListBG = []
    RichpCorrListF = []
    DompCorrListBG = []
    DompCorrListF = []
    EvenpCorrListBG = []
    EvenpCorrListF = []

    # 6. Homoscedacticity
    RarepHomoHW = []
    RarepHomoHB = []
    RichpHomoHW = []
    RichpHomoHB = []
    DompHomoHW = []
    DompHomoHB = []
    EvenpHomoHW = []
    EvenpHomoHB = []

    # 7. Normally distributed residuals (errors)
    RarepNormListOmni = [] # Omnibus test for normality
    RarepNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
    RarepNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    RarepNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

    RichpNormListOmni = [] # Omnibus test for normality
    RichpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
    RichpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    RichpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

    DompNormListOmni = [] # Omnibus test for normality
    DompNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
    DompNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    DompNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

    EvenpNormListOmni = [] # Omnibus test for normality
    EvenpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
    EvenpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    EvenpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

    NLIST = []

    for SampSize in SampSizes:

        sRare_MacIntercept_pVals = [] # List to hold coefficient p-values
        sRare_MacIntercept_Coeffs = [] # List to hold coefficients

        sRich_MacIntercept_pVals = [] # List to hold coefficient p-values
        sRich_MacIntercept_Coeffs = [] # List to hold coefficients

        sDom_MacIntercept_pVals = []
        sDom_MacIntercept_Coeffs = []

        sEven_MacIntercept_pVals = []
        sEven_MacIntercept_Coeffs = []

        sRare_MicIntercept_pVals = []
        sRare_MicIntercept_Coeffs = []

        sRich_MicIntercept_pVals = []
        sRich_MicIntercept_Coeffs = []

        sDom_MicIntercept_pVals = []
        sDom_MicIntercept_Coeffs = []

        sEven_MicIntercept_pVals = []
        sEven_MicIntercept_Coeffs = []


        sRare_MacSlope_pVals = []
        sRare_MacSlope_Coeffs = []

        sRich_MacSlope_pVals = []
        sRich_MacSlope_Coeffs = []

        sDom_MacSlope_pVals = []
        sDom_MacSlope_Coeffs = []

        sEven_MacSlope_pVals = []
        sEven_MacSlope_Coeffs = []

        sRare_MicSlope_pVals = []
        sRare_MicSlope_Coeffs = []

        sRich_MicSlope_pVals = []
        sRich_MicSlope_Coeffs = []

        sDom_MicSlope_pVals = []
        sDom_MicSlope_Coeffs = []

        sEven_MicSlope_pVals = []
        sEven_MicSlope_Coeffs = []


        sRareR2List = [] # List to hold model R2
        sRarepFList = [] # List to hold significance of model R2
        sRichR2List = [] # List to hold model R2
        sRichpFList = [] # List to hold significance of model R2
        sDomR2List = [] # List to hold model R2
        sDompFList = [] # List to hold significance of model R2
        sEvenR2List = [] # List to hold model R2
        sEvenpFList = [] # List to hold significance of model R2

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes

        # 3. The relationship is linear
        #sRarepLinListHC = []
        sRarepLinListRainB = []
        sRarepLinListLM = []
        #sRichpLinListHC = []
        sRichpLinListRainB = []
        sRichpLinListLM = []
        #sDompLinListHC = []
        sDompLinListRainB = []
        sDompLinListLM = []
        #sEvenpLinListHC = []
        sEvenpLinListRainB = []
        sEvenpLinListLM = []

        # 4. There are no significant outliers...need to find tests or measures

        # 5. Independence of observations (no serial correlation in residuals)
        sRarepCorrListBG = []
        sRarepCorrListF = []
        sRichpCorrListBG = []
        sRichpCorrListF = []
        sDompCorrListBG = []
        sDompCorrListF = []
        sEvenpCorrListBG = []
        sEvenpCorrListF = []

        # 6. Homoscedacticity
        sRarepHomoHW = []
        sRarepHomoHB = []
        sRichpHomoHW = []
        sRichpHomoHB = []
        sDompHomoHW = []
        sDompHomoHB = []
        sEvenpHomoHW = []
        sEvenpHomoHB = []

        # 7. Normally distributed residuals (errors)
        sRarepNormListOmni = [] # Omnibus test for normality
        sRarepNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
        sRarepNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sRarepNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

        sRichpNormListOmni = [] # Omnibus test for normality
        sRichpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
        sRichpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sRichpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

        sDompNormListOmni = [] # Omnibus test for normality
        sDompNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
        sDompNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sDompNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

        sEvenpNormListOmni = [] # Omnibus test for normality
        sEvenpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
        sEvenpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sEvenpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance


        for iteration in range(Iterations):

            Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [[], [], [], [], [], [], []]
            klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [[], [], [], [], [], [], []]
            NmaxList, rareSkews, KindList = [[], [], []]
            NSlist = []

            ct = 0
            radDATA = []
            datasets = []
            GoodNames = ['EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA'] # all microbe data is MGRAST


            mlist = ['micro', 'macro']
            for m in mlist:
                for name in os.listdir(mydir +'data/'+m):
                    if name in GoodNames: pass
                    else: continue
                    path = mydir+'data/'+m+'/'+name+'/'+name+'-SADMetricData.txt'
                    num_lines = sum(1 for line in open(path))
                    datasets.append([name, m, num_lines])

            numMac = 0
            numMic = 0

            radDATA = []

            for d in datasets:

                name, kind, numlines = d
                lines = []
                lines = np.random.choice(range(1, numlines+1), SampSize, replace=True)

                path = mydir+'data/'+kind+'/'+name+'/'+name+'-SADMetricData.txt'

                for line in lines:
                    data = linecache.getline(path, line)
                    radDATA.append(data)

                #print name, kind, numlines, len(radDATA)

            for data in radDATA:

                data = data.split()
                if len(data) == 0:
                    print 'no data'
                    continue

                name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data


                N = float(N)
                S = float(S)

                Nlist.append(float(np.log(N)))
                Slist.append(float(np.log(S)))
                NSlist.append(float(np.log(N/S)))

                Evarlist.append(float(np.log(float(Evar))))
                ESimplist.append(float(np.log(float(ESimp))))
                KindList.append(kind)

                BPlist.append(float(BP))
                NmaxList.append(float(np.log(float(BP)*float(N))))
                EHeiplist.append(float(EHeip))

                # lines for the log-modulo transformation of skewnness
                skew = float(skew)
                sign = 1
                if skew < 0: sign = -1

                lms = np.log(np.abs(skew) + 1)
                lms = lms * sign
                #if lms > 3: print name, N, S
                rareSkews.append(float(lms))

                if kind == 'macro': numMac += 1
                elif kind == 'micro': numMic += 1

                ct+=1


            #print 'Sample Size:',SampSize, ' Mic:', numMic,'Mac:', numMac

            # Multiple regression for Rarity
            d = pd.DataFrame({'N': list(Nlist)})
            d['Rarity'] = list(rareSkews)
            d['Kind'] = list(KindList)

            RarityResults = smf.ols('Rarity ~ N * Kind', d).fit() # Fit the dummy variable regression model
            #print RarityResults.summary(), '\n'

            # Multiple regression for Rarity
            d = pd.DataFrame({'N': list(Nlist)})
            d['Richness'] = list(Slist)
            d['Kind'] = list(KindList)

            RichnessResults = smf.ols('Richness ~ N * Kind', d).fit() # Fit the dummy variable regression model
            #print RichnessResults.summary(), '\n'

            # Multiple regression for Dominance
            d = pd.DataFrame({'N': list(Nlist)})
            d['Dominance'] = list(NmaxList)
            d['Kind'] = list(KindList)

            DomResults = smf.ols('Dominance ~ N * Kind', d).fit() # Fit the dummy variable regression model
            #print DomResults.summary(), '\n'

            # Multiple regression for Evenness
            d = pd.DataFrame({'N': list(Nlist)})
            d['Evenness'] = list(ESimplist)
            d['Kind'] = list(KindList)

            EvenResults = smf.ols('Evenness ~ N * Kind', d).fit() # Fit the dummy variable regression model
            #print RarityResults.summary(), '\n'

            RareResids = RarityResults.resid # residuals of the model
            RichResids = RichnessResults.resid # residuals of the model
            DomResids = DomResults.resid # residuals of the model
            EvenResids = EvenResults.resid # residuals of the model

            # MODEL RESULTS/FIT
            RareFpval = RarityResults.f_pvalue
            Rarer2 = RarityResults.rsquared # coefficient of determination
            #Adj_r2 = RareResults.rsquared_adj # adjusted
            RichFpval = RichnessResults.f_pvalue
            Richr2 = RichnessResults.rsquared # coefficient of determination
            #Adj_r2 = RichnessResults.rsquared_adj # adjusted

            DomFpval = DomResults.f_pvalue
            Domr2 = DomResults.rsquared # coefficient of determination
            #Adj_r2 = DomResults.rsquared_adj # adjusted
            EvenFpval = EvenResults.f_pvalue
            Evenr2 = EvenResults.rsquared # coefficient of determination
            #Adj_r2 = EvenResuls.rsquared_adj # adjusted

            # MODEL PARAMETERS and p-values
            Rareparams = RarityResults.params
            Rareparams = Rareparams.tolist()
            Rarepvals = RarityResults.pvalues
            Rarepvals = Rarepvals.tolist()

            Richparams = RichnessResults.params
            Richparams = Richparams.tolist()
            Richpvals = RichnessResults.pvalues
            Richpvals = Richpvals.tolist()

            Domparams = DomResults.params
            Domparams = Domparams.tolist()
            Dompvals = DomResults.pvalues
            Dompvals = Dompvals.tolist()

            Evenparams = EvenResults.params
            Evenparams = Evenparams.tolist()
            Evenpvals = EvenResults.pvalues
            Evenpvals = Evenpvals.tolist()


            sRare_MacIntercept_pVals.append(Rarepvals[0])
            sRare_MacIntercept_Coeffs.append(Rareparams[0])

            sRich_MacIntercept_pVals.append(Rarepvals[0])
            sRich_MacIntercept_Coeffs.append(Rareparams[0])

            sDom_MacIntercept_pVals.append(Dompvals[0])
            sDom_MacIntercept_Coeffs.append(Domparams[0])

            sEven_MacIntercept_pVals.append(Evenpvals[0])
            sEven_MacIntercept_Coeffs.append(Evenparams[0])

            sRare_MicIntercept_pVals.append(Rarepvals[1])
            if Rarepvals[1] > 0.05:
                sRare_MicIntercept_Coeffs.append(Rareparams[1])
            else:
                sRare_MicIntercept_Coeffs.append(Rareparams[1])

            sRich_MicIntercept_pVals.append(Richpvals[1])
            if Richpvals[1] > 0.05:
                sRich_MicIntercept_Coeffs.append(Richparams[1])
            else:
                sRich_MicIntercept_Coeffs.append(Richparams[1])

            sDom_MicIntercept_pVals.append(Dompvals[1])
            if Dompvals[1] > 0.05:
                sDom_MicIntercept_Coeffs.append(Domparams[1])
            else:
                sDom_MicIntercept_Coeffs.append(Domparams[1])

            sEven_MicIntercept_pVals.append(Evenpvals[1])
            if Evenpvals[1] > 0.05:
                sEven_MicIntercept_Coeffs.append(Evenparams[1])
            else:
                sEven_MicIntercept_Coeffs.append(Evenparams[1])


            sRare_MacSlope_pVals.append(Rarepvals[2])
            sRare_MacSlope_Coeffs.append(Rareparams[2])

            sRich_MacSlope_pVals.append(Richpvals[2])
            sRich_MacSlope_Coeffs.append(Richparams[2])

            sDom_MacSlope_pVals.append(Dompvals[2])
            sDom_MacSlope_Coeffs.append(Domparams[2])

            sEven_MacSlope_pVals.append(Evenpvals[2])
            sEven_MacSlope_Coeffs.append(Evenparams[2])


            sRare_MicSlope_pVals.append(Rarepvals[3])
            if Rarepvals[3] > 0.05:
                sRare_MicSlope_Coeffs.append(Rareparams[3])
            else:
                sRare_MicSlope_Coeffs.append(Rareparams[3])

            sRich_MicSlope_pVals.append(Richpvals[3])
            if Richpvals[3] > 0.05:
                sRich_MicSlope_Coeffs.append(Richparams[3])
            else:
                sRich_MicSlope_Coeffs.append(Richparams[3])

            sDom_MicSlope_pVals.append(Dompvals[3])
            if Dompvals[3] > 0.05:
                sDom_MicSlope_Coeffs.append(Domparams[3])
            else:
                sDom_MicSlope_Coeffs.append(Domparams[3])

            sEven_MicSlope_pVals.append(Evenpvals[3])
            if Evenpvals[3] > 0.05:
                sEven_MicSlope_Coeffs.append(Evenparams[3])
            else:
                sEven_MicSlope_Coeffs.append(Evenparams[3])

            sRareR2List.append(Rarer2)
            sRarepFList.append(RareFpval)
            sRichR2List.append(Richr2)
            sRichpFList.append(RichFpval)
            sDomR2List.append(Domr2)
            sDompFList.append(DomFpval)
            sEvenR2List.append(Evenr2)
            sEvenpFList.append(EvenFpval)

            # TESTS OF LINEAR REGRESSION ASSUMPTIONS
            # Error in predictor variables is negligible...Presumably Yes
            # Variables are measured at the continuous level...Definitely Yes

            # TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY MODELED AS LINEAR
            #HC = smd.linear_harvey_collier(RarityResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sRarepLinListHC.append(HC)
            #HC = smd.linear_harvey_collier(DomResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sDompLinListHC.append(HC)
            #HC = smd.linear_harvey_collier(EvenResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sEvenpLinListHC.append(HC)

            RB = smd.linear_rainbow(RarityResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sRarepLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(RichnessResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sRichpLinListRainB.append(RB[1])

            RB = smd.linear_rainbow(DomResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sDompLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(EvenResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sEvenpLinListRainB.append(RB[1])

            LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog) # Lagrangian multiplier test for linearity
            sRarepLinListLM.append(LM[1])
            LM = smd.linear_lm(RichnessResults.resid, RichnessResults.model.exog) # Lagrangian multiplier test for linearity
            sRichpLinListLM.append(LM[1])

            LM = smd.linear_lm(DomResults.resid, DomResults.model.exog) # Lagrangian multiplier test for linearity
            sDompLinListLM.append(LM[1])
            LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog) # Lagrangian multiplier test for linearity
            sEvenpLinListLM.append(LM[1])

            # INDEPENDENCE OF OBSERVATIONS (no serial correlation in residuals)
            BGtest = smd.acorr_breush_godfrey(RarityResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
                                # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(RareResids, lags=None, boxpierce=True)
            sRarepCorrListBG.append(BGtest[1])
            sRarepCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(RichnessResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
                                # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(RichResids, lags=None, boxpierce=True)
            sRichpCorrListBG.append(BGtest[1])
            sRichpCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(DomResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
                                # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(DomResids, lags=None, boxpierce=True)
            sDompCorrListBG.append(BGtest[1])
            sDompCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(EvenResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
                                # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(EvenResids, lags=None, boxpierce=True)
            sEvenpCorrListBG.append(BGtest[1])
            sEvenpCorrListF.append(BGtest[3])

            # There are no significant outliers...Need tests or measures/metrics

            # HOMOSCEDASTICITY

            # These tests return:
            # 1. lagrange multiplier statistic,
            # 2. p-value of lagrange multiplier test,
            # 3. f-statistic of the hypothesis that the error variance does not depend on x,
            # 4. p-value for the f-statistic

            HW = sms.het_white(RareResids, RarityResults.model.exog)
            sRarepHomoHW.append(HW[3])
            HW = sms.het_white(RichResids, RichnessResults.model.exog)
            sRichpHomoHW.append(HW[3])

            HW = sms.het_white(DomResids, DomResults.model.exog)
            sDompHomoHW.append(HW[3])
            HW = sms.het_white(EvenResids, EvenResults.model.exog)
            sEvenpHomoHW.append(HW[3])

            HB = sms.het_breushpagan(RareResids, RarityResults.model.exog)
            sRarepHomoHB.append(HB[3])
            HB = sms.het_breushpagan(RichResids, RichnessResults.model.exog)
            sRichpHomoHB.append(HB[3])

            HB = sms.het_breushpagan(DomResids, DomResults.model.exog)
            sDompHomoHB.append(HB[3])
            HB = sms.het_breushpagan(EvenResids, EvenResults.model.exog)
            sEvenpHomoHB.append(HB[3])

            # 7. NORMALITY OF ERROR TERMS
            O = sms.omni_normtest(RareResids)
            sRarepNormListOmni.append(O[1])
            O = sms.omni_normtest(RichResids)
            sRichpNormListOmni.append(O[1])
            O = sms.omni_normtest(DomResids)
            sDompNormListOmni.append(O[1])
            O = sms.omni_normtest(EvenResids)
            sEvenpNormListOmni.append(O[1])

            JB = sms.jarque_bera(RareResids)
            sRarepNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(RichResids)
            sRichpNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(DomResids)
            sDompNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(EvenResids)
            sEvenpNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality

            KS = smd.kstest_normal(RareResids)
            sRarepNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(RichResids)
            sRichpNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(DomResids)
            sDompNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(EvenResids)
            sEvenpNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance

            AD = smd.normal_ad(RareResids)
            sRarepNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(RichResids)
            sRichpNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(DomResids)
            sDompNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(EvenResids)
            sEvenpNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance

            print 'Sample size:',SampSize, 'iteration:',iteration

        NLIST.append(SampSize)

        Rare_MacIntercept_pVals.append(np.mean(sRare_MacIntercept_pVals)) # List to hold coefficient p-values
        Rare_MacIntercept_Coeffs.append(np.mean(sRare_MacIntercept_Coeffs)) # List to hold coefficients

        Rich_MacIntercept_pVals.append(np.mean(sRich_MacIntercept_pVals)) # List to hold coefficient p-values
        Rich_MacIntercept_Coeffs.append(np.mean(sRich_MacIntercept_Coeffs)) # List to hold coefficients

        Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals))
        Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs))

        Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals))
        Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs))

        Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals))
        Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs))

        Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals))
        Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs))

        Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals))
        Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs))

        Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals))
        Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs))

        Rare_MacSlope_pVals.append(np.mean(sRare_MacSlope_pVals)) # List to hold coefficient p-values
        Rare_MacSlope_Coeffs.append(np.mean(sRare_MacSlope_Coeffs)) # List to hold coefficients

        Rich_MacSlope_pVals.append(np.mean(sRich_MacSlope_pVals)) # List to hold coefficient p-values
        Rich_MacSlope_Coeffs.append(np.mean(sRich_MacSlope_Coeffs)) # List to hold coefficients

        Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals))
        Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs))

        Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals))
        Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs))

        Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals))
        Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs))

        Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals))
        Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs))

        Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals))
        Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs))

        Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals))
        Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs))


        RareR2List.append(np.mean(sRareR2List))
        RarepFList.append(np.mean(sRarepFList))
        RichR2List.append(np.mean(sRichR2List))
        RichpFList.append(np.mean(sRichpFList))
        DomR2List.append(np.mean(sDomR2List))
        DompFList.append(np.mean(sDompFList))
        EvenR2List.append(np.mean(sEvenR2List))
        EvenpFList.append(np.mean(sEvenpFList))

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes

        # 3. The relationship is linear
        #RarepLinListHC.append(np.mean(sRarepLinListHC))
        RarepLinListRainB.append(np.mean(sRarepLinListRainB))
        RarepLinListLM.append(np.mean(sRarepLinListLM))
        #RichpLinListHC.append(np.mean(sRichpLinListHC))
        RichpLinListRainB.append(np.mean(sRichpLinListRainB))
        RichpLinListLM.append(np.mean(sRichpLinListLM))
        #DompLinListHC.append(np.mean(sDompLinListHC))
        DompLinListRainB.append(np.mean(sDompLinListRainB))
        DompLinListLM.append(np.mean(sDompLinListLM))
        #EvenpLinListHC.append(np.mean(sEvenpLinListHC))
        EvenpLinListRainB.append(np.mean(sEvenpLinListRainB))
        EvenpLinListLM.append(np.mean(sEvenpLinListLM))

        # 4. There are no significant outliers...need to find tests or measures

        # 5. Independence of observations (no serial correlation in residuals)
        RarepCorrListBG.append(np.mean(sRarepCorrListBG))
        RarepCorrListF.append(np.mean(sRarepCorrListF))
        RichpCorrListBG.append(np.mean(sRichpCorrListBG))
        RichpCorrListF.append(np.mean(sRichpCorrListF))
        DompCorrListBG.append(np.mean(sDompCorrListBG))
        DompCorrListF.append(np.mean(sDompCorrListF))
        EvenpCorrListBG.append(np.mean(sEvenpCorrListBG))
        EvenpCorrListF.append(np.mean(sEvenpCorrListF))

        # 6. Homoscedacticity
        RarepHomoHW.append(np.mean(sRarepHomoHW))
        RarepHomoHB.append(np.mean(sRarepHomoHB))
        RichpHomoHB.append(np.mean(sRichpHomoHB))
        RichpHomoHW.append(np.mean(sRichpHomoHW))
        DompHomoHW.append(np.mean(sDompHomoHW))
        DompHomoHB.append(np.mean(sDompHomoHB))
        EvenpHomoHW.append(np.mean(sEvenpHomoHW))
        EvenpHomoHB.append(np.mean(sEvenpHomoHB))

        # 7. Normally distributed residuals (errors)
        RarepNormListOmni.append(np.mean(sRarepNormListOmni))
        RarepNormListJB.append(np.mean(sRarepNormListJB))
        RarepNormListKS.append(np.mean(sRarepNormListKS))
        RarepNormListAD.append(np.mean(sRarepNormListAD))

        RichpNormListOmni.append(np.mean(sRichpNormListOmni))
        RichpNormListJB.append(np.mean(sRichpNormListJB))
        RichpNormListKS.append(np.mean(sRichpNormListKS))
        RichpNormListAD.append(np.mean(sRichpNormListAD))

        DompNormListOmni.append(np.mean(sDompNormListOmni))
        DompNormListJB.append(np.mean(sDompNormListJB))
        DompNormListKS.append(np.mean(sDompNormListKS))
        DompNormListAD.append(np.mean(sDompNormListAD))

        EvenpNormListOmni.append(np.mean(sEvenpNormListOmni))
        EvenpNormListJB.append(np.mean(sEvenpNormListJB))
        EvenpNormListKS.append(np.mean(sEvenpNormListKS))
        EvenpNormListAD.append(np.mean(sEvenpNormListAD))


    fig.add_subplot(4, 3, 1)
    plt.xlim(min(SampSizes)-1,max(SampSizes)+10)
    plt.ylim(0,1)
    plt.xscale('log')
    # Rarity    R2 vs. Sample Size
    plt.plot(NLIST,RareR2List,  c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4,prop={'size':14})
    leg.draw_frame(False)


    fig.add_subplot(4, 3, 2)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    plt.xscale('log')
    plt.ylim(0.0, 0.16)
    # Rarity    Coeffs vs. Sample Size
    plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rare_MacSlope_Coeffs,  c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10,prop={'size':8})
    leg.draw_frame(False)


    fig.add_subplot(4, 3, 3)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    plt.ylim(0.0, 0.6)
    plt.xscale('log')
    # Rarity    p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(RarepLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST,RarepLinListRainB,  c='m')
    plt.plot(NLIST,RarepLinListLM,  c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST,RarepCorrListBG,  c='c')
    plt.plot(NLIST,RarepCorrListF,  c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST,RarepHomoHW,  c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST,RarepHomoHB,  c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST,RarepNormListOmni,  c='Lime', ls='-', label='normality')
    #plt.plot(NLIST,RarepNormListJB,  c='Lime', ls='-')
    #plt.plot(NLIST,RarepNormListKS,  c='Lime', ls='--', lw=3)
    #plt.plot(NLIST,RarepNormListAD,  c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')

    leg = plt.legend(loc=1,prop={'size':8})
    leg.draw_frame(False)


    fig.add_subplot(4, 3, 4)
    plt.xscale('log')
    plt.ylim(0,1)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Dominance     R2 vs. Sample Size
    plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16)

    leg = plt.legend(loc=4,prop={'size':14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 5)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Dominance     Coeffs vs. Sample Size
    plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Dom_MacSlope_Coeffs,  c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')

    leg = plt.legend(loc=10,prop={'size':8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 6)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    plt.xscale('log')
    #plt.yscale('log')
    plt.ylim(0, 0.6)
    # Dominance     p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(DompLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST, DompLinListRainB, c='m')
    plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, DompCorrListBG, c='c')
    plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, DompHomoHB, c='r',ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, DompNormListJB, c='Lime', ls='-')
    #plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3)
    #plt.plot(NLIST, DompNormListAD, c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1,prop={'size':8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 7)
    plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16)
    plt.xscale('log')
    plt.ylim(0,1)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Evenness      R2 vs. Sample Size
    plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    leg = plt.legend(loc=4,prop={'size':14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 8)
    plt.ylim(-0.25, 0.0)
    plt.xscale('log')
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Evenness      Coeffs vs. Sample Size
    plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Even_MacSlope_Coeffs,  c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10,prop={'size':8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 9)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    plt.xscale('log')
    plt.ylim(0.0, 0.3)
    # Evenness      p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(EvenpLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST, EvenpLinListRainB, c='m')
    plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, EvenpCorrListBG, c='c')
    plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, EvenpHomoHB, c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-')
    #plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3)
    #plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1,prop={'size':8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 10)
    plt.xscale('log')
    plt.ylim(0,1)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Dominance     R2 vs. Sample Size
    plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.xlabel('Sample size', fontsize=14)
    plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16)

    leg = plt.legend(loc=4,prop={'size':14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 11)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Richness    Coeffs vs. Sample Size
    plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rich_MacSlope_Coeffs,  c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    plt.xlabel('Sample size', fontsize=14)

    leg = plt.legend(loc=10,prop={'size':8})
    leg.draw_frame(False)


    fig.add_subplot(4, 3, 12)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    plt.xscale('log')
    # Richness    p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(RichpLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST,RichpLinListRainB,  c='m')
    plt.plot(NLIST,RichpLinListLM,  c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST,RichpCorrListBG,  c='c')
    plt.plot(NLIST, EvenpCorrListF,  c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST,RichpHomoHW,  c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST,RichpHomoHB,  c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST,RichpNormListOmni,  c='Lime', ls='-', label='normality')
    #plt.plot(NLIST,RichpNormListJB,  c='Lime', ls='-')
    #plt.plot(NLIST,RichpNormListKS,  c='Lime', ls='--', lw=3)
    #plt.plot(NLIST,RichpNormListAD,  c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=1,prop={'size':8})
    leg.draw_frame(False)
    #plt.tick_params(axis='both', which='major', labelsize=fs-3)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.savefig(mydir+'figs/appendix/SampleSize/SampleSizeEffects.png', dpi=600, bbox_inches = "tight")
    #plt.close()
    #plt.show()

    return
Example #4
0
# ADF検定の結果、日中立会の間のは終値-始値の動きは、ドリフト無し、ドリフト付き、ドリフト+確定的トレンド付きモデルでランダムウォークであるという帰無仮説を棄却しています。また、日中立会と夜間立会の間、夜間立会の間、夜間立会から日中立会の間でも一緒です。

# In[115]:


#各種統計量の算出、比較
import statsmodels.api as sm
import numpy as np
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
print('adf nc',sm.tsa.adfuller(session.s1,regression='nc')[1]) #[1]はp値の検定結果
print('adf  c',sm.tsa.adfuller(session.s1,regression='c')[1]) #[1]はp値の検定結果
print('adf ct',sm.tsa.adfuller(session.s1,regression='ct')[1]) #[1]はp値の検定結果
print(session.s1.mean()/session.s1.std()*np.sqrt(session.s1.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s1)
print('s1: ',lzip(estimator, test))


# In[116]:


print(sm.tsa.adfuller(session.s12,regression='nc')[1]) #[1]はp値の検定結果
print(sm.tsa.adfuller(session.s12,regression='c')[1]) #[1]はp値の検定結果
print(sm.tsa.adfuller(session.s12,regression='ct')[1]) #[1]はp値の検定結果
print(session.s1.mean()/session.s12.std()*np.sqrt(session.s1.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s12)
print('s12: ',lzip(estimator, test))

def Fig_OLS_Checks():

    #fs = 10 # font size used across figures
    #color = str()
    #OrC = 'open'

    SampSizes = [
        5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100
    ]
    Iterations = 100

    fig = plt.figure(figsize=(12, 8))

    # MODEL PARAMETERS
    Rare_MacIntercept_pVals = []  # List to hold coefficient p-values
    Rare_MacIntercept_Coeffs = []  # List to hold coefficients

    Rich_MacIntercept_pVals = []
    Rich_MacIntercept_Coeffs = []

    Dom_MacIntercept_pVals = []
    Dom_MacIntercept_Coeffs = []

    Even_MacIntercept_pVals = []
    Even_MacIntercept_Coeffs = []

    Rare_MicIntercept_pVals = []
    Rare_MicIntercept_Coeffs = []

    Rich_MicIntercept_pVals = []
    Rich_MicIntercept_Coeffs = []

    Dom_MicIntercept_pVals = []
    Dom_MicIntercept_Coeffs = []

    Even_MicIntercept_pVals = []
    Even_MicIntercept_Coeffs = []

    Rare_MacSlope_pVals = []
    Rare_MacSlope_Coeffs = []

    Rich_MacSlope_pVals = []
    Rich_MacSlope_Coeffs = []

    Dom_MacSlope_pVals = []
    Dom_MacSlope_Coeffs = []

    Even_MacSlope_pVals = []
    Even_MacSlope_Coeffs = []

    Rare_MicSlope_pVals = []
    Rare_MicSlope_Coeffs = []

    Rich_MicSlope_pVals = []
    Rich_MicSlope_Coeffs = []

    Dom_MicSlope_pVals = []
    Dom_MicSlope_Coeffs = []

    Even_MicSlope_pVals = []
    Even_MicSlope_Coeffs = []

    RareR2List = []  # List to hold model R2
    RarepFList = []  # List to hold significance of model R2
    RichR2List = []  # List to hold model R2
    RichpFList = []  # List to hold significance of model R2
    DomR2List = []  # List to hold model R2
    DompFList = []  # List to hold significance of model R2
    EvenR2List = []  # List to hold model R2
    EvenpFList = []  # List to hold significance of model R2

    # ASSUMPTIONS OF LINEAR REGRESSION
    # 1. Error in predictor variables is negligible...presumably yes
    # 2. Variables are measured at the continuous level...yes

    # 3. The relationship is linear
    #RarepLinListHC = []
    RarepLinListRainB = []
    RarepLinListLM = []
    #RichpLinListHC = []
    RichpLinListRainB = []
    RichpLinListLM = []
    #DompLinListHC = []
    DompLinListRainB = []
    DompLinListLM = []
    #EvenpLinListHC = []
    EvenpLinListRainB = []
    EvenpLinListLM = []

    # 4. There are no significant outliers...need to find tests or measures

    # 5. Independence of observations (no serial correlation in residuals)
    RarepCorrListBG = []
    RarepCorrListF = []
    RichpCorrListBG = []
    RichpCorrListF = []
    DompCorrListBG = []
    DompCorrListF = []
    EvenpCorrListBG = []
    EvenpCorrListF = []

    # 6. Homoscedacticity
    RarepHomoHW = []
    RarepHomoHB = []
    RichpHomoHW = []
    RichpHomoHB = []
    DompHomoHW = []
    DompHomoHB = []
    EvenpHomoHW = []
    EvenpHomoHB = []

    # 7. Normally distributed residuals (errors)
    RarepNormListOmni = []  # Omnibus test for normality
    RarepNormListJB = [
    ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
    RarepNormListKS = [
    ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    RarepNormListAD = [
    ]  # Anderson-Darling test for normal distribution unknown mean and variance

    RichpNormListOmni = []  # Omnibus test for normality
    RichpNormListJB = [
    ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
    RichpNormListKS = [
    ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    RichpNormListAD = [
    ]  # Anderson-Darling test for normal distribution unknown mean and variance

    DompNormListOmni = []  # Omnibus test for normality
    DompNormListJB = [
    ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
    DompNormListKS = [
    ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    DompNormListAD = [
    ]  # Anderson-Darling test for normal distribution unknown mean and variance

    EvenpNormListOmni = []  # Omnibus test for normality
    EvenpNormListJB = [
    ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
    EvenpNormListKS = [
    ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    EvenpNormListAD = [
    ]  # Anderson-Darling test for normal distribution unknown mean and variance

    NLIST = []

    for SampSize in SampSizes:

        sRare_MacIntercept_pVals = []  # List to hold coefficient p-values
        sRare_MacIntercept_Coeffs = []  # List to hold coefficients

        sRich_MacIntercept_pVals = []  # List to hold coefficient p-values
        sRich_MacIntercept_Coeffs = []  # List to hold coefficients

        sDom_MacIntercept_pVals = []
        sDom_MacIntercept_Coeffs = []

        sEven_MacIntercept_pVals = []
        sEven_MacIntercept_Coeffs = []

        sRare_MicIntercept_pVals = []
        sRare_MicIntercept_Coeffs = []

        sRich_MicIntercept_pVals = []
        sRich_MicIntercept_Coeffs = []

        sDom_MicIntercept_pVals = []
        sDom_MicIntercept_Coeffs = []

        sEven_MicIntercept_pVals = []
        sEven_MicIntercept_Coeffs = []

        sRare_MacSlope_pVals = []
        sRare_MacSlope_Coeffs = []

        sRich_MacSlope_pVals = []
        sRich_MacSlope_Coeffs = []

        sDom_MacSlope_pVals = []
        sDom_MacSlope_Coeffs = []

        sEven_MacSlope_pVals = []
        sEven_MacSlope_Coeffs = []

        sRare_MicSlope_pVals = []
        sRare_MicSlope_Coeffs = []

        sRich_MicSlope_pVals = []
        sRich_MicSlope_Coeffs = []

        sDom_MicSlope_pVals = []
        sDom_MicSlope_Coeffs = []

        sEven_MicSlope_pVals = []
        sEven_MicSlope_Coeffs = []

        sRareR2List = []  # List to hold model R2
        sRarepFList = []  # List to hold significance of model R2
        sRichR2List = []  # List to hold model R2
        sRichpFList = []  # List to hold significance of model R2
        sDomR2List = []  # List to hold model R2
        sDompFList = []  # List to hold significance of model R2
        sEvenR2List = []  # List to hold model R2
        sEvenpFList = []  # List to hold significance of model R2

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes

        # 3. The relationship is linear
        #sRarepLinListHC = []
        sRarepLinListRainB = []
        sRarepLinListLM = []
        #sRichpLinListHC = []
        sRichpLinListRainB = []
        sRichpLinListLM = []
        #sDompLinListHC = []
        sDompLinListRainB = []
        sDompLinListLM = []
        #sEvenpLinListHC = []
        sEvenpLinListRainB = []
        sEvenpLinListLM = []

        # 4. There are no significant outliers...need to find tests or measures

        # 5. Independence of observations (no serial correlation in residuals)
        sRarepCorrListBG = []
        sRarepCorrListF = []
        sRichpCorrListBG = []
        sRichpCorrListF = []
        sDompCorrListBG = []
        sDompCorrListF = []
        sEvenpCorrListBG = []
        sEvenpCorrListF = []

        # 6. Homoscedacticity
        sRarepHomoHW = []
        sRarepHomoHB = []
        sRichpHomoHW = []
        sRichpHomoHB = []
        sDompHomoHW = []
        sDompHomoHB = []
        sEvenpHomoHW = []
        sEvenpHomoHB = []

        # 7. Normally distributed residuals (errors)
        sRarepNormListOmni = []  # Omnibus test for normality
        sRarepNormListJB = [
        ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
        sRarepNormListKS = [
        ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sRarepNormListAD = [
        ]  # Anderson-Darling test for normal distribution unknown mean and variance

        sRichpNormListOmni = []  # Omnibus test for normality
        sRichpNormListJB = [
        ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
        sRichpNormListKS = [
        ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sRichpNormListAD = [
        ]  # Anderson-Darling test for normal distribution unknown mean and variance

        sDompNormListOmni = []  # Omnibus test for normality
        sDompNormListJB = [
        ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
        sDompNormListKS = [
        ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sDompNormListAD = [
        ]  # Anderson-Darling test for normal distribution unknown mean and variance

        sEvenpNormListOmni = []  # Omnibus test for normality
        sEvenpNormListJB = [
        ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
        sEvenpNormListKS = [
        ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sEvenpNormListAD = [
        ]  # Anderson-Darling test for normal distribution unknown mean and variance

        for iteration in range(Iterations):

            Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [
                [], [], [], [], [], [], []
            ]
            klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [
                [], [], [], [], [], [], []
            ]
            NmaxList, rareSkews, KindList = [[], [], []]
            NSlist = []

            ct = 0
            radDATA = []
            datasets = []
            GoodNames = [
                'EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB',
                'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS',
                'CBC', 'MCDB', 'GENTRY', 'FIA'
            ]  # all microbe data is MGRAST

            mlist = ['micro', 'macro']
            for m in mlist:
                for name in os.listdir(mydir + 'data/' + m):
                    if name in GoodNames: pass
                    else: continue
                    path = mydir + 'data/' + m + '/' + name + '/' + name + '-SADMetricData.txt'
                    num_lines = sum(1 for line in open(path))
                    datasets.append([name, m, num_lines])

            numMac = 0
            numMic = 0

            radDATA = []

            for d in datasets:

                name, kind, numlines = d
                lines = []
                lines = np.random.choice(range(1, numlines + 1),
                                         SampSize,
                                         replace=True)

                path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt'

                for line in lines:
                    data = linecache.getline(path, line)
                    radDATA.append(data)

                #print name, kind, numlines, len(radDATA)

            for data in radDATA:

                data = data.split()
                if len(data) == 0:
                    print 'no data'
                    continue

                name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data

                N = float(N)
                S = float(S)

                Nlist.append(float(np.log(N)))
                Slist.append(float(np.log(S)))
                NSlist.append(float(np.log(N / S)))

                Evarlist.append(float(np.log(float(Evar))))
                ESimplist.append(float(np.log(float(ESimp))))
                KindList.append(kind)

                BPlist.append(float(BP))
                NmaxList.append(float(np.log(float(BP) * float(N))))
                EHeiplist.append(float(EHeip))

                # lines for the log-modulo transformation of skewnness
                skew = float(skew)
                sign = 1
                if skew < 0: sign = -1

                lms = np.log(np.abs(skew) + 1)
                lms = lms * sign
                #if lms > 3: print name, N, S
                rareSkews.append(float(lms))

                if kind == 'macro': numMac += 1
                elif kind == 'micro': numMic += 1

                ct += 1

            #print 'Sample Size:',SampSize, ' Mic:', numMic,'Mac:', numMac

            # Multiple regression for Rarity
            d = pd.DataFrame({'N': list(Nlist)})
            d['Rarity'] = list(rareSkews)
            d['Kind'] = list(KindList)

            RarityResults = smf.ols(
                'Rarity ~ N * Kind',
                d).fit()  # Fit the dummy variable regression model
            #print RarityResults.summary(), '\n'

            # Multiple regression for Rarity
            d = pd.DataFrame({'N': list(Nlist)})
            d['Richness'] = list(Slist)
            d['Kind'] = list(KindList)

            RichnessResults = smf.ols(
                'Richness ~ N * Kind',
                d).fit()  # Fit the dummy variable regression model
            #print RichnessResults.summary(), '\n'

            # Multiple regression for Dominance
            d = pd.DataFrame({'N': list(Nlist)})
            d['Dominance'] = list(NmaxList)
            d['Kind'] = list(KindList)

            DomResults = smf.ols(
                'Dominance ~ N * Kind',
                d).fit()  # Fit the dummy variable regression model
            #print DomResults.summary(), '\n'

            # Multiple regression for Evenness
            d = pd.DataFrame({'N': list(Nlist)})
            d['Evenness'] = list(ESimplist)
            d['Kind'] = list(KindList)

            EvenResults = smf.ols(
                'Evenness ~ N * Kind',
                d).fit()  # Fit the dummy variable regression model
            #print RarityResults.summary(), '\n'

            RareResids = RarityResults.resid  # residuals of the model
            RichResids = RichnessResults.resid  # residuals of the model
            DomResids = DomResults.resid  # residuals of the model
            EvenResids = EvenResults.resid  # residuals of the model

            # MODEL RESULTS/FIT
            RareFpval = RarityResults.f_pvalue
            Rarer2 = RarityResults.rsquared  # coefficient of determination
            #Adj_r2 = RareResults.rsquared_adj # adjusted
            RichFpval = RichnessResults.f_pvalue
            Richr2 = RichnessResults.rsquared  # coefficient of determination
            #Adj_r2 = RichnessResults.rsquared_adj # adjusted

            DomFpval = DomResults.f_pvalue
            Domr2 = DomResults.rsquared  # coefficient of determination
            #Adj_r2 = DomResults.rsquared_adj # adjusted
            EvenFpval = EvenResults.f_pvalue
            Evenr2 = EvenResults.rsquared  # coefficient of determination
            #Adj_r2 = EvenResuls.rsquared_adj # adjusted

            # MODEL PARAMETERS and p-values
            Rareparams = RarityResults.params
            Rareparams = Rareparams.tolist()
            Rarepvals = RarityResults.pvalues
            Rarepvals = Rarepvals.tolist()

            Richparams = RichnessResults.params
            Richparams = Richparams.tolist()
            Richpvals = RichnessResults.pvalues
            Richpvals = Richpvals.tolist()

            Domparams = DomResults.params
            Domparams = Domparams.tolist()
            Dompvals = DomResults.pvalues
            Dompvals = Dompvals.tolist()

            Evenparams = EvenResults.params
            Evenparams = Evenparams.tolist()
            Evenpvals = EvenResults.pvalues
            Evenpvals = Evenpvals.tolist()

            sRare_MacIntercept_pVals.append(Rarepvals[0])
            sRare_MacIntercept_Coeffs.append(Rareparams[0])

            sRich_MacIntercept_pVals.append(Rarepvals[0])
            sRich_MacIntercept_Coeffs.append(Rareparams[0])

            sDom_MacIntercept_pVals.append(Dompvals[0])
            sDom_MacIntercept_Coeffs.append(Domparams[0])

            sEven_MacIntercept_pVals.append(Evenpvals[0])
            sEven_MacIntercept_Coeffs.append(Evenparams[0])

            sRare_MicIntercept_pVals.append(Rarepvals[1])
            if Rarepvals[1] > 0.05:
                sRare_MicIntercept_Coeffs.append(Rareparams[1])
            else:
                sRare_MicIntercept_Coeffs.append(Rareparams[1])

            sRich_MicIntercept_pVals.append(Richpvals[1])
            if Richpvals[1] > 0.05:
                sRich_MicIntercept_Coeffs.append(Richparams[1])
            else:
                sRich_MicIntercept_Coeffs.append(Richparams[1])

            sDom_MicIntercept_pVals.append(Dompvals[1])
            if Dompvals[1] > 0.05:
                sDom_MicIntercept_Coeffs.append(Domparams[1])
            else:
                sDom_MicIntercept_Coeffs.append(Domparams[1])

            sEven_MicIntercept_pVals.append(Evenpvals[1])
            if Evenpvals[1] > 0.05:
                sEven_MicIntercept_Coeffs.append(Evenparams[1])
            else:
                sEven_MicIntercept_Coeffs.append(Evenparams[1])

            sRare_MacSlope_pVals.append(Rarepvals[2])
            sRare_MacSlope_Coeffs.append(Rareparams[2])

            sRich_MacSlope_pVals.append(Richpvals[2])
            sRich_MacSlope_Coeffs.append(Richparams[2])

            sDom_MacSlope_pVals.append(Dompvals[2])
            sDom_MacSlope_Coeffs.append(Domparams[2])

            sEven_MacSlope_pVals.append(Evenpvals[2])
            sEven_MacSlope_Coeffs.append(Evenparams[2])

            sRare_MicSlope_pVals.append(Rarepvals[3])
            if Rarepvals[3] > 0.05:
                sRare_MicSlope_Coeffs.append(Rareparams[3])
            else:
                sRare_MicSlope_Coeffs.append(Rareparams[3])

            sRich_MicSlope_pVals.append(Richpvals[3])
            if Richpvals[3] > 0.05:
                sRich_MicSlope_Coeffs.append(Richparams[3])
            else:
                sRich_MicSlope_Coeffs.append(Richparams[3])

            sDom_MicSlope_pVals.append(Dompvals[3])
            if Dompvals[3] > 0.05:
                sDom_MicSlope_Coeffs.append(Domparams[3])
            else:
                sDom_MicSlope_Coeffs.append(Domparams[3])

            sEven_MicSlope_pVals.append(Evenpvals[3])
            if Evenpvals[3] > 0.05:
                sEven_MicSlope_Coeffs.append(Evenparams[3])
            else:
                sEven_MicSlope_Coeffs.append(Evenparams[3])

            sRareR2List.append(Rarer2)
            sRarepFList.append(RareFpval)
            sRichR2List.append(Richr2)
            sRichpFList.append(RichFpval)
            sDomR2List.append(Domr2)
            sDompFList.append(DomFpval)
            sEvenR2List.append(Evenr2)
            sEvenpFList.append(EvenFpval)

            # TESTS OF LINEAR REGRESSION ASSUMPTIONS
            # Error in predictor variables is negligible...Presumably Yes
            # Variables are measured at the continuous level...Definitely Yes

            # TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY MODELED AS LINEAR
            #HC = smd.linear_harvey_collier(RarityResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sRarepLinListHC.append(HC)
            #HC = smd.linear_harvey_collier(DomResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sDompLinListHC.append(HC)
            #HC = smd.linear_harvey_collier(EvenResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sEvenpLinListHC.append(HC)

            RB = smd.linear_rainbow(
                RarityResults
            )  # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sRarepLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(
                RichnessResults
            )  # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sRichpLinListRainB.append(RB[1])

            RB = smd.linear_rainbow(
                DomResults
            )  # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sDompLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(
                EvenResults
            )  # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sEvenpLinListRainB.append(RB[1])

            LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog
                               )  # Lagrangian multiplier test for linearity
            sRarepLinListLM.append(LM[1])
            LM = smd.linear_lm(RichnessResults.resid,
                               RichnessResults.model.exog
                               )  # Lagrangian multiplier test for linearity
            sRichpLinListLM.append(LM[1])

            LM = smd.linear_lm(DomResults.resid, DomResults.model.exog
                               )  # Lagrangian multiplier test for linearity
            sDompLinListLM.append(LM[1])
            LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog
                               )  # Lagrangian multiplier test for linearity
            sEvenpLinListLM.append(LM[1])

            # INDEPENDENCE OF OBSERVATIONS (no serial correlation in residuals)
            BGtest = smd.acorr_breush_godfrey(
                RarityResults, nlags=None, store=False
            )  # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
            # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(RareResids, lags=None, boxpierce=True)
            sRarepCorrListBG.append(BGtest[1])
            sRarepCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(
                RichnessResults, nlags=None, store=False
            )  # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
            # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(RichResids, lags=None, boxpierce=True)
            sRichpCorrListBG.append(BGtest[1])
            sRichpCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(
                DomResults, nlags=None, store=False
            )  # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
            # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(DomResids, lags=None, boxpierce=True)
            sDompCorrListBG.append(BGtest[1])
            sDompCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(
                EvenResults, nlags=None, store=False
            )  # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
            # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(EvenResids, lags=None, boxpierce=True)
            sEvenpCorrListBG.append(BGtest[1])
            sEvenpCorrListF.append(BGtest[3])

            # There are no significant outliers...Need tests or measures/metrics

            # HOMOSCEDASTICITY

            # These tests return:
            # 1. lagrange multiplier statistic,
            # 2. p-value of lagrange multiplier test,
            # 3. f-statistic of the hypothesis that the error variance does not depend on x,
            # 4. p-value for the f-statistic

            HW = sms.het_white(RareResids, RarityResults.model.exog)
            sRarepHomoHW.append(HW[3])
            HW = sms.het_white(RichResids, RichnessResults.model.exog)
            sRichpHomoHW.append(HW[3])

            HW = sms.het_white(DomResids, DomResults.model.exog)
            sDompHomoHW.append(HW[3])
            HW = sms.het_white(EvenResids, EvenResults.model.exog)
            sEvenpHomoHW.append(HW[3])

            HB = sms.het_breushpagan(RareResids, RarityResults.model.exog)
            sRarepHomoHB.append(HB[3])
            HB = sms.het_breushpagan(RichResids, RichnessResults.model.exog)
            sRichpHomoHB.append(HB[3])

            HB = sms.het_breushpagan(DomResids, DomResults.model.exog)
            sDompHomoHB.append(HB[3])
            HB = sms.het_breushpagan(EvenResids, EvenResults.model.exog)
            sEvenpHomoHB.append(HB[3])

            # 7. NORMALITY OF ERROR TERMS
            O = sms.omni_normtest(RareResids)
            sRarepNormListOmni.append(O[1])
            O = sms.omni_normtest(RichResids)
            sRichpNormListOmni.append(O[1])
            O = sms.omni_normtest(DomResids)
            sDompNormListOmni.append(O[1])
            O = sms.omni_normtest(EvenResids)
            sEvenpNormListOmni.append(O[1])

            JB = sms.jarque_bera(RareResids)
            sRarepNormListJB.append(
                JB[1]
            )  # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(RichResids)
            sRichpNormListJB.append(
                JB[1]
            )  # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(DomResids)
            sDompNormListJB.append(
                JB[1]
            )  # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(EvenResids)
            sEvenpNormListJB.append(
                JB[1]
            )  # Calculate residual skewness, kurtosis, and do the JB test for normality

            KS = smd.kstest_normal(RareResids)
            sRarepNormListKS.append(
                KS[1]
            )  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(RichResids)
            sRichpNormListKS.append(
                KS[1]
            )  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(DomResids)
            sDompNormListKS.append(
                KS[1]
            )  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(EvenResids)
            sEvenpNormListKS.append(
                KS[1]
            )  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance

            AD = smd.normal_ad(RareResids)
            sRarepNormListAD.append(
                AD[1]
            )  # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(RichResids)
            sRichpNormListAD.append(
                AD[1]
            )  # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(DomResids)
            sDompNormListAD.append(
                AD[1]
            )  # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(EvenResids)
            sEvenpNormListAD.append(
                AD[1]
            )  # Anderson-Darling test for normal distribution unknown mean and variance

            print 'Sample size:', SampSize, 'iteration:', iteration

        NLIST.append(SampSize)

        Rare_MacIntercept_pVals.append(np.mean(
            sRare_MacIntercept_pVals))  # List to hold coefficient p-values
        Rare_MacIntercept_Coeffs.append(
            np.mean(sRare_MacIntercept_Coeffs))  # List to hold coefficients

        Rich_MacIntercept_pVals.append(np.mean(
            sRich_MacIntercept_pVals))  # List to hold coefficient p-values
        Rich_MacIntercept_Coeffs.append(
            np.mean(sRich_MacIntercept_Coeffs))  # List to hold coefficients

        Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals))
        Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs))

        Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals))
        Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs))

        Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals))
        Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs))

        Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals))
        Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs))

        Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals))
        Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs))

        Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals))
        Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs))

        Rare_MacSlope_pVals.append(
            np.mean(sRare_MacSlope_pVals))  # List to hold coefficient p-values
        Rare_MacSlope_Coeffs.append(
            np.mean(sRare_MacSlope_Coeffs))  # List to hold coefficients

        Rich_MacSlope_pVals.append(
            np.mean(sRich_MacSlope_pVals))  # List to hold coefficient p-values
        Rich_MacSlope_Coeffs.append(
            np.mean(sRich_MacSlope_Coeffs))  # List to hold coefficients

        Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals))
        Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs))

        Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals))
        Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs))

        Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals))
        Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs))

        Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals))
        Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs))

        Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals))
        Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs))

        Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals))
        Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs))

        RareR2List.append(np.mean(sRareR2List))
        RarepFList.append(np.mean(sRarepFList))
        RichR2List.append(np.mean(sRichR2List))
        RichpFList.append(np.mean(sRichpFList))
        DomR2List.append(np.mean(sDomR2List))
        DompFList.append(np.mean(sDompFList))
        EvenR2List.append(np.mean(sEvenR2List))
        EvenpFList.append(np.mean(sEvenpFList))

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes

        # 3. The relationship is linear
        #RarepLinListHC.append(np.mean(sRarepLinListHC))
        RarepLinListRainB.append(np.mean(sRarepLinListRainB))
        RarepLinListLM.append(np.mean(sRarepLinListLM))
        #RichpLinListHC.append(np.mean(sRichpLinListHC))
        RichpLinListRainB.append(np.mean(sRichpLinListRainB))
        RichpLinListLM.append(np.mean(sRichpLinListLM))
        #DompLinListHC.append(np.mean(sDompLinListHC))
        DompLinListRainB.append(np.mean(sDompLinListRainB))
        DompLinListLM.append(np.mean(sDompLinListLM))
        #EvenpLinListHC.append(np.mean(sEvenpLinListHC))
        EvenpLinListRainB.append(np.mean(sEvenpLinListRainB))
        EvenpLinListLM.append(np.mean(sEvenpLinListLM))

        # 4. There are no significant outliers...need to find tests or measures

        # 5. Independence of observations (no serial correlation in residuals)
        RarepCorrListBG.append(np.mean(sRarepCorrListBG))
        RarepCorrListF.append(np.mean(sRarepCorrListF))
        RichpCorrListBG.append(np.mean(sRichpCorrListBG))
        RichpCorrListF.append(np.mean(sRichpCorrListF))
        DompCorrListBG.append(np.mean(sDompCorrListBG))
        DompCorrListF.append(np.mean(sDompCorrListF))
        EvenpCorrListBG.append(np.mean(sEvenpCorrListBG))
        EvenpCorrListF.append(np.mean(sEvenpCorrListF))

        # 6. Homoscedacticity
        RarepHomoHW.append(np.mean(sRarepHomoHW))
        RarepHomoHB.append(np.mean(sRarepHomoHB))
        RichpHomoHB.append(np.mean(sRichpHomoHB))
        RichpHomoHW.append(np.mean(sRichpHomoHW))
        DompHomoHW.append(np.mean(sDompHomoHW))
        DompHomoHB.append(np.mean(sDompHomoHB))
        EvenpHomoHW.append(np.mean(sEvenpHomoHW))
        EvenpHomoHB.append(np.mean(sEvenpHomoHB))

        # 7. Normally distributed residuals (errors)
        RarepNormListOmni.append(np.mean(sRarepNormListOmni))
        RarepNormListJB.append(np.mean(sRarepNormListJB))
        RarepNormListKS.append(np.mean(sRarepNormListKS))
        RarepNormListAD.append(np.mean(sRarepNormListAD))

        RichpNormListOmni.append(np.mean(sRichpNormListOmni))
        RichpNormListJB.append(np.mean(sRichpNormListJB))
        RichpNormListKS.append(np.mean(sRichpNormListKS))
        RichpNormListAD.append(np.mean(sRichpNormListAD))

        DompNormListOmni.append(np.mean(sDompNormListOmni))
        DompNormListJB.append(np.mean(sDompNormListJB))
        DompNormListKS.append(np.mean(sDompNormListKS))
        DompNormListAD.append(np.mean(sDompNormListAD))

        EvenpNormListOmni.append(np.mean(sEvenpNormListOmni))
        EvenpNormListJB.append(np.mean(sEvenpNormListJB))
        EvenpNormListKS.append(np.mean(sEvenpNormListKS))
        EvenpNormListAD.append(np.mean(sEvenpNormListAD))

    fig.add_subplot(4, 3, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0, 1)
    plt.xscale('log')
    # Rarity    R2 vs. Sample Size
    plt.plot(NLIST, RareR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 2)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.16)
    # Rarity    Coeffs vs. Sample Size
    plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rare_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 3)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0.0, 0.6)
    plt.xscale('log')
    # Rarity    p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(RarepLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST,RarepLinListRainB,  c='m')
    plt.plot(NLIST, RarepLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST,RarepCorrListBG,  c='c')
    plt.plot(NLIST, RarepCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST, RarepHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST,RarepHomoHB,  c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, RarepNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST,RarepNormListJB,  c='Lime', ls='-')
    #plt.plot(NLIST,RarepNormListKS,  c='Lime', ls='--', lw=3)
    #plt.plot(NLIST,RarepNormListAD,  c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')

    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 4)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Dominance     R2 vs. Sample Size
    plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16)

    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 5)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Dominance     Coeffs vs. Sample Size
    plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Dom_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')

    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 6)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    #plt.yscale('log')
    plt.ylim(0, 0.6)
    # Dominance     p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(DompLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST, DompLinListRainB, c='m')
    plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, DompCorrListBG, c='c')
    plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, DompHomoHB, c='r',ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, DompNormListJB, c='Lime', ls='-')
    #plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3)
    #plt.plot(NLIST, DompNormListAD, c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 7)
    plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Evenness      R2 vs. Sample Size
    plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 8)
    plt.ylim(-0.25, 0.0)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Evenness      Coeffs vs. Sample Size
    plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Even_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 9)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.3)
    # Evenness      p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(EvenpLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST, EvenpLinListRainB, c='m')
    plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, EvenpCorrListBG, c='c')
    plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, EvenpHomoHB, c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-')
    #plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3)
    #plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 10)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Dominance     R2 vs. Sample Size
    plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.xlabel('Sample size', fontsize=14)
    plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16)

    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 11)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Richness    Coeffs vs. Sample Size
    plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rich_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    plt.xlabel('Sample size', fontsize=14)

    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 12)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    # Richness    p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(RichpLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST,RichpLinListRainB,  c='m')
    plt.plot(NLIST, RichpLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST,RichpCorrListBG,  c='c')
    plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST, RichpHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST,RichpHomoHB,  c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, RichpNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST,RichpNormListJB,  c='Lime', ls='-')
    #plt.plot(NLIST,RichpNormListKS,  c='Lime', ls='--', lw=3)
    #plt.plot(NLIST,RichpNormListAD,  c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)
    #plt.tick_params(axis='both', which='major', labelsize=fs-3)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.savefig(mydir + 'figs/appendix/SampleSize/SampleSizeEffects.png',
                dpi=600,
                bbox_inches="tight")
    #plt.close()
    #plt.show()

    return
Example #6
0
# Load data
url = 'http://vincentarelbundock.github.io/Rdatasets/csv/HistData/Guerry.csv'
dat = pd.read_csv(url)

# Fit regression model (using the natural log of one of the regressaors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()

# Inspect the results
print(results.summary())

# ## Normality of the residuals

# Jarque-Bera test:

name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(results.resid)
zip(name, test)

# Omni test:

name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
zip(name, test)

# ## Influence tests
#
# Once created, an object of class ``OLSInfluence`` holds attributes and methods that allow users to assess the influence of each observation. For example, we can compute and extract the first few rows of DFbetas by:

from statsmodels.stats.outliers_influence import OLSInfluence
test_class = OLSInfluence(results)
test_class.dfbetas[:5, :]
print(session.s1.describe())
print(session.s12.describe())
print(session.s2.describe())
print(session.on.describe())

#各種統計量の算出、比較
import statsmodels.api as sm
import numpy as np
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
print(sm.tsa.adfuller(session.s1, regression='nc')[1])  #[1]はp値の検定結果
print(sm.tsa.adfuller(session.s1, regression='c')[1])  #[1]はp値の検定結果
print(sm.tsa.adfuller(session.s1, regression='ct')[1])  #[1]はp値の検定結果
print(session.s1.mean() / session.s1.std() * np.sqrt(session.s1.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s1)
print('s1: ', lzip(estimator, test))

print(sm.tsa.adfuller(session.s12, regression='nc')[1])  #[1]はp値の検定結果
print(sm.tsa.adfuller(session.s12, regression='c')[1])  #[1]はp値の検定結果
print(sm.tsa.adfuller(session.s12, regression='ct')[1])  #[1]はp値の検定結果
print(session.s1.mean() / session.s12.std() * np.sqrt(session.s1.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
test = sms.jarque_bera(session.s12)
print('s12: ', lzip(estimator, test))

print(sm.tsa.adfuller(session.s2, regression='nc')[1])  #[1]はp値の検定結果
print(sm.tsa.adfuller(session.s2, regression='c')[1])  #[1]はp値の検定結果
print(sm.tsa.adfuller(session.s2, regression='ct')[1])  #[1]はp値の検定結果
print(session.s2.mean() / session.s2.std() * np.sqrt(session.s2.count()))
estimator = ['JB', 'Chi-squared p-value', 'Skew', 'Kurtosis']
#create data frame for refined clicks for all channels: use debug to show how the data frame is built

refinedClicksDf = pd.DataFrame(index=groupedDf.index.get_level_values('date').unique())
for c in channels:
    channelDF = pd.DataFrame(df3[df3.channel == c])
    channelSeries = channelDF.groupby(['date']).sum()
    refind = pd.DataFrame(channelSeries.refined_click).rename(columns={'refined_click': c + '_refined_clicks'})
    refinedClicksDf = pd.concat([refinedClicksDf, refind], join='outer', axis=1)

#some statistical insights
#correlation
stat = pd.stats.moments

refinedClicksDf['Amazon_refined_clicks'].corr(refinedClicksDf['SEO_refined_clicks'])
movingAve = stat.rolling_mean(refinedClicksDf['Amazon_refined_clicks'],10)
plt.figure
movingAve.plot()
plt.show(block=False)
#regression analysis
#install statsmodels : pip install statsmodels (might ask for more packages to install: e.g. scipy, patsy...)
import statsmodels.formula.api as smf
model = smf.OLS(refinedClicksDf['Amazon_refined_clicks'], refinedClicksDf['SEO_refined_clicks'])
res = model.fit()
print res.summary()


#test residuals normality
import statsmodels.stats.api as ss
ss.jarque_bera(res.resid)
Example #9
0
def main(processed_path = "data/processed",
         models_path = "models"):
    
    """Nested 10-fold cross-validation for linear regression of
    ranking_log and score with with lasso regularization
    (inner CV for alpha tuning, outer for R^2 robustness)."""
    
    # logging
    logger = logging.getLogger(__name__)
    
    # normalize paths
    processed_path = os.path.normpath(processed_path)
    logger.debug("Path to processed data normalized: {}"
                 .format(processed_path))
    models_path = os.path.normpath(models_path)
    logger.debug("Path to models normalized: {}"
                 .format(models_path))
    
    # load selected_df
    selected_df = pd.read_pickle(os.path.join(processed_path,
                                              'selected_df.pkl'))
    logger.info("Loaded selected_df. Shape of df: {}"
                .format(selected_df.shape))
    
    #%% split df into dependent and independent variables
    teams_df = selected_df.iloc[:, :9]
    y = selected_df.iloc[:, 9:10]
    X = selected_df.iloc[:, 10:]
    X_columns = X.columns
    X_index = X.index
    
    #%% standardize
    
    scaler = StandardScaler()
    not_standardize = ['core',
                       'visualization',
                       'machine_learning',
                       'deep_learning']
    X_standardized = scaler.fit_transform(X
                                          .drop(columns=not_standardize)
                                          .values)
    X_standardized = pd.DataFrame(X_standardized,
                                  index = X_index,
                                  columns = X_columns.drop(not_standardize))
    X_not_standardized = X[not_standardize]
    X = pd.concat([X_standardized, X_not_standardized], axis=1)
    logger.debug("After Standardization:\n{}".format(X.describe().to_string))
    
    #%% define hyperparameter
    
    start = time()

    L1_RATIOS = [1.0, .95, .7, .5, .3, .1]
    EPS = 0.001
    N_ALPHAS = 100
    ALPHAS = None
    # normalize data
    # If True, the regressors X will be normalized before regression by
    # subtracting the mean (column-wise) and dividing by the l2-norm in
    # order for each feature to have norm = 1.
    NORMALIZE = False
    MAX_ITER = 10000
    TOL = 0.0001
    CV = 20
    N_JOBS = 1
    RS = 1
    SELECTION = 'cyclic'
    
    logger.info("l1_ratio={}, eps={}, n_alphas={}, alphas={}, normalize={}"
                 .format(L1_RATIOS, EPS, N_ALPHAS, ALPHAS, NORMALIZE))
    logger.info("max_iter={}, tol={}, cv={}, n_jobs={}, rs={}, selection={}"
                 .format(MAX_ITER, TOL, CV, N_JOBS, RS, SELECTION))
    logger.debug("Try following L1-ratios: {}".format(L1_RATIOS))
    
    # print R^2 values for bounding alphas 0 and 1 to make sense of alphas
    logger.info("Bounding score: R^2 for alpha=0 and l1_ratio=0.5: {}"
                .format(ElasticNet(alpha=0, l1_ratio=.5,
                                   normalize=NORMALIZE, random_state=RS)
                        .fit(X.values, y.values)
                        .score(X.values, y.values)))
    logger.info("Bounding score: R^2 for alpha=1 and l1_ratio=0.5: {}"
                .format(ElasticNet(alpha=1, l1_ratio=.5,
                                   normalize=NORMALIZE, random_state=RS)
                        .fit(X.values, y.values)
                        .score(X.values, y.values)))
    
    #%% train model
    
    mod = ElasticNetCV(l1_ratio = L1_RATIOS,
                       eps = EPS,
                       n_alphas = N_ALPHAS,
                       alphas = ALPHAS,
                       normalize = NORMALIZE,
                       max_iter = MAX_ITER,
                       tol = TOL,
                       cv = CV,
                       n_jobs = N_JOBS,
                       random_state = RS,
                       selection = SELECTION)\
          .fit(X.values, y.values)
    
    # log some statistics
    best_r2 = mod.score(X.values, y.values)
    logger.info("best R^2 score: {:.2f}%".format(best_r2*100))
    best_l1_ratio = mod.l1_ratio_
    logger.info("best l1_ratio: {}".format(best_l1_ratio))
    best_alpha = mod.alpha_
    logger.info("best alpha: {:.3f}".format(best_alpha))
    alphas = mod.alphas_
    logger.debug("tested alphas:\n{}".format(alphas))
    coef = pd.Series(data=mod.coef_, index=X_columns)
    logger.debug("best coefficients:\n{}".format(coef))
#    mse_path = mod.mse_path_
    
    #%% Nested Cross-Validation to test robustness of R^2
    
    cv_results = cross_validate(ElasticNetCV(l1_ratio = L1_RATIOS,
                                             eps = EPS,
                                             n_alphas = N_ALPHAS,
                                             alphas = ALPHAS,
                                             normalize = NORMALIZE,
                                             max_iter = MAX_ITER,
                                             tol = TOL,
                                             cv = CV,
                                             n_jobs = N_JOBS,
                                             random_state = RS,
                                             selection = SELECTION),
                                X.values, y.values, cv=CV,
                                return_train_score=True, n_jobs=N_JOBS)
    logger.info("95% confidence intervall: {:.2f} +/- {:.2f} (mean +/- 2*std)"
                .format(cv_results['test_score'].mean(),
                        cv_results['test_score'].std()*2))
    logger.debug("Nested cross-validation results:\n{}"
                .format(pd.DataFrame(data=cv_results)))
    
    #%% Elastic Net regression with statsmodels for summary
    
    mod_sm = sm.OLS(y.values, sm.add_constant(pd.DataFrame(data=X.values,
                                                    columns=X_columns,
                                                    index=X_index)))\
          .fit_regularized(method='elastic_net',
                           alpha=best_alpha,
                           L1_wt=best_l1_ratio,
                           refit=True)
    res = mod_sm.summary().as_text()
    logger.info("ElasticNet regression of selected_df regarding ranking_log")
    logger.info("with alpha={:.5f} and L1_wt={}:\n{}"
                .format(best_alpha, best_l1_ratio, res))
    
    # Normality of residuals
    # Jarque-Bera test:
    name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
    test = sms.jarque_bera(mod_sm.resid)
    logger.info("Jarque-Bera test: {}".format(lzip(name, test)))
    # Omni test:
    name = ['Chi^2', 'Two-tail probability']
    test = sms.omni_normtest(mod_sm.resid)
    logger.info("Omnibus test: {}".format(lzip(name, test)))
    
    # Multicollinearity
    # Conditional Number:
    logger.info("Conditional Number: {}"
                .format(np.linalg.cond(mod_sm.model.exog)))
    
    # Heteroskedasticity tests
    # Breush-Pagan test:
    name = ['Lagrange multiplier statistic', 'p-value', 
        'f-value', 'f p-value']
    test = sms.het_breuschpagan(mod_sm.resid, mod_sm.model.exog)
    logger.info("Breush-Pagan test: {}".format(lzip(name, test)))
    # Goldfeld-Quandt test
    name = ['F statistic', 'p-value']
    test = sms.het_goldfeldquandt(mod_sm.resid, mod_sm.model.exog)
    logger.info("Goldfeld-Quandt test: {}".format(lzip(name, test)))
    
    #%% export results as pickle file to models folder
    
    # pickle mod
    with open(os.path.join(models_path, 'sklearn_ElasticNetCV.pkl'),
              'wb') as handle:
        pickle.dump(mod, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info("Saved elastic net model of sklearn to {}."
                .format(os.path.join(models_path,
                                     'sklearn_ElasticNetCV.pkl')))
    
    # pickle mod_sm
    with open(os.path.join(models_path, 'sm_OLS_fit_regularized.pkl'),
              'wb') as handle:
        pickle.dump(mod_sm, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info("Saved elastic net model of statsmodels to {}."
                .format(os.path.join(models_path,
                                     'sm_OLS_fit_regularized.pkl')))
    
    # save res as .txt
    f = open(os.path.join(models_path,
                          'sm_OLS_fit_regularized_summary.txt'), "w+")
    f.write(res)
    f.close()
    
    
    #%% logging time passed
    end = time()
    time_passed = pd.Timedelta(seconds=end-start).round(freq='s')
    logger.info("Time needed to train Elastic Net Model: {}"
                .format(time_passed))