Example #1
0
def normality_test(y):
    """Run a Kolmogorov-Smirnov normality test on the endogenous variable
    and show a distribution plot of it.

    Parameters:
        y (Series): pandas Series holding the endogenous variable.

    Returns:
        None. The test result and the verdict are printed, and the
        distribution plot is displayed.
    """
    labels = ['KS Stat', 'KS-Test p-value']
    ks_result = kstest_normal(y, dist='norm')

    print("---Test de Kolmogorov-Smirnnof---")
    print(dict(zip(labels, ks_result)))

    # ks_result[1] is the p-value; 0.05 is the significance threshold used
    # to choose which verdict gets printed.
    if ks_result[1] > 0.05:
        verdict = "Aceptamos la hipótesis nula, ergo la variable se distribuye según una normal."
    else:
        verdict = "Fallamos al aceptar la hipótesis nula, ergo la variable no se distribuye según una normal."
    print(verdict)

    sns.distplot(y)
    plt.show()
Example #2
0
def check_normality():
    '''Check if the distribution is normal.

    Draws a seeded sample of 100 values from a normal distribution whose
    mean and standard deviation come from the names ``myMean`` and ``mySD``
    (defined outside this function), shows a histogram and a probability
    plot of the sample, and prints the p-values of four normality tests.
    Additionally prints a confirmation line when the omnibus p-value is
    above 0.05.

    Returns nothing.
    '''
    # Generate and show a distribution
    numData = 100

    # To get reproducible values, a fixed seed is provided
    np.random.seed(987654321)

    data = stats.norm.rvs(myMean, mySD, size=numData)
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    # Explicit dtype: a bare ``pd.Series()`` is deprecated in pandas 1.x and
    # falls back to object dtype, while only float p-values are stored here.
    pVals = pd.Series(dtype=float)

    # The scipy normaltest is based on D'Agostino and Pearson's test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['omnibus'] = stats.normaltest(data)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk'] = stats.shapiro(data)

    # Or you can check for normality with the Lilliefors test
    _, pVals['Lilliefors'] = kstest_normal(data)

    # Alternatively with the original Kolmogorov-Smirnov test, applied to the
    # sample standardized with its own mean and (ddof=1) standard deviation
    standardized = (data - np.mean(data)) / np.std(data, ddof=1)
    _, pVals['KS'] = stats.kstest(standardized, 'norm')

    print(pVals)
    if pVals['omnibus'] > 0.05:
        print('Data are normally distributed')
def kstestnormal(spike_features):
    """
    Order feature columns by their Lilliefors normality-test statistic.

    Parameters
    ----------
    spike_features : ndarray
        Two-dimensional array of spike features; every column is tested
        on its own.

    Returns
    -------
    index_features_sorted : list of int
        Column indices ordered by decreasing KS test statistic.
    """
    n_features = np.shape(spike_features)[1]
    from statsmodels.stats.diagnostic import kstest_normal

    ks_stat = np.zeros(n_features)
    for col in range(n_features):
        # Only the statistic drives the ranking; the p-value is discarded.
        ks_stat[col], _ = kstest_normal(spike_features[:, col], pvalmethod='approx')

    # Stable descending sort: columns with equal statistics keep their
    # ascending index order.
    return sorted(range(n_features), key=ks_stat.take, reverse=True)
Example #4
0
def main():
    """Interactive entry point for the multi-process sampling pipeline.

    Asks the user for the number of worker processes and the number of
    samples to generate, splits the iris data between the workers, collects
    one result per worker through a queue, merges the results, computes the
    list of silhouette coefficients and prints it together with two
    normality tests (Jarque-Bera and Kolmogorov-Smirnov) and timing
    information.

    Raises:
        ValueError: if either typed value cannot be parsed as an integer.
    """
    procs = int(input("Напишите количество процессов, которые вы хотите использовать для обработки данных: "))
    ests = int(input("Напишите количество выборок, которое вы хотите сгенерировать для проведения кластеризации: "))
    iris, _ = fisher_iris()
    start_time = time.time()

    # Prepend a column holding the row indices 0..149 to the data.
    iris = np.hstack((np.arange(150).reshape((150, 1)), iris))
    iris_samples = iris_sep(iris, procs)

    multiprocessing.set_start_method('fork')
    queue = multiprocessing.Queue()
    processes = [
        multiprocessing.Process(target=make_all, args=(iris_samples[proc], queue, ests, procs))
        for proc in range(procs)
    ]
    for process in processes:
        process.start()

    # Fetch exactly one result per worker *before* joining: joining a process
    # that still has items buffered for the queue can block forever.
    total_samples = [queue.get() for _ in range(procs)]
    for process in processes:
        process.join()

    # Workers are done; the sub-samples are now merged into one sample.
    print('Завершение работы процессов')
    total_data = get_total_sample_data(procs, ests, total_samples)

    # Start the timer before the silhouette computation it is meant to
    # measure (it used to start afterwards, timing only a print call).
    # NOTE(review): assumes the clustering happens inside
    # get_silhouette_list - move the timer earlier if it does not.
    sil_time_start = time.time()
    silhouette_list = get_silhouette_list(total_data)
    sil_elapsed = time.time() - sil_time_start  # end - start, never negative

    print('Список оценённых коэффициентов силуэта')
    print(silhouette_list)
    # The message speaks about processes, so report ``procs`` (not ``ests``).
    print('Время кластеризации и расчёта коэффициентов силуэта для {0} процессов составило: {1}'.format(procs, sil_elapsed))

    print('Оценка нормальности распределения коэффициентов силуэта')
    print('Тест Харке-Бера')
    print(jarque_bera(silhouette_list))
    print('Тест Колмогорова-Смирнова')
    print(kstest_normal(silhouette_list, dist='norm', pvalmethod='approx'))

    # Total wall-clock time since the data was loaded.
    print("%s seconds" % (time.time() - start_time))
Example #5
0
def check_2samples(pd_serie1, pd_serie2, title=None):
    """Visually and numerically compare two samples.

    For each of the two series this shows a 50-bin histogram and a Q-Q plot,
    then prints the Shapiro-Wilk and Kolmogorov-Smirnov (``kstest_normal``)
    p-values. Finally it prints the Pearson and Spearman correlations
    between the two series and shows a scatter plot of one against the
    other.

    Parameters:
        pd_serie1: first sample; must provide ``.hist`` (e.g. a pandas Series).
        pd_serie2: second sample, same requirements as ``pd_serie1``.
        title: title placed on both histograms.

    Returns:
        None.
    """
    from scipy.stats import pearsonr
    from scipy.stats import shapiro
    from scipy.stats import spearmanr
    from statsmodels.graphics.gofplots import qqplot
    from statsmodels.stats.diagnostic import kstest_normal

    samples = (('pd_serie1', pd_serie1), ('pd_serie2', pd_serie2))

    # Normality check: histograms
    for _, serie in samples:
        serie.hist(bins=50)
        plt.title(title)
        plt.show()

    # Normality check: Q-Q plots with a regression reference line
    for label, serie in samples:
        qqplot(serie, line='r')
        plt.title('qqplot ' + label)
        plt.show()

    # Normality check: test p-values
    print('results normal tests')
    for label, serie in samples:
        print(label)
        print('shapiro p-value', shapiro(serie)[1])
        print('KS p-value', kstest_normal(serie, dist='norm')[1])
        print()

    # Independence check: correlations and scatter plot
    print('')
    print('Pearson Correlation:', pearsonr(pd_serie1, pd_serie2)[0])
    print('Spearman Correlation:', spearmanr(pd_serie1, pd_serie2)[0])
    plt.plot(pd_serie1, pd_serie2, '.')
    plt.title('distribucion ')
    # Every other figure in this function is displayed explicitly; the
    # scatter plot was the only one left undisplayed.
    plt.show()
def test_residuals_normality(methods):
    """Print a Kolmogorov-Smirnov normality verdict for each method.

    Every element of ``methods`` must expose a ``residuals`` attribute (the
    data that is tested) and a ``name`` attribute (used as the label). One
    line per method is printed with the statistic, the p-value and a mark:
    a cross when the p-value is below 0.05, a check mark otherwise.
    """
    alpha = 0.05

    print("> Kolmogorov-Smirnov test")
    for method in methods:
        ksstat, pvalue = kstest_normal(
            method.residuals, dist='norm', pvalmethod='table'
        )
        if pvalue < alpha:
            mark = '❌'
        else:
            mark = '✅'
        print(f"{method.name}: ks={ksstat}, p={pvalue}. {mark}.")
Example #7
0
def check_normal(pd_serie, bins=None):
    """Plot a series and run two normality tests on it.

    Shows a distribution plot and a Q-Q plot of ``pd_serie``, prints the
    Shapiro-Wilk and ``kstest_normal`` p-values, and returns both raw test
    results.

    Parameters:
        pd_serie: data to check; its ``name`` attribute is used as the plot
            title / label when it is a string.
        bins: forwarded to ``sns.distplot``.

    Returns:
        dict with keys ``'shapiro'`` and ``'ktest'`` holding whatever the
        respective test functions returned.
    """
    # Non-string names (e.g. None or an integer label) are shown as empty.
    label = pd_serie.name
    if not isinstance(label, str):
        label = ''

    # Distribution plot, titled only when a usable name is available
    sns.distplot(pd_serie, bins=bins)
    if label:
        plt.title(label)
    plt.show()

    # Q-Q plot with a regression reference line
    from statsmodels.graphics.gofplots import qqplot
    qqplot(pd_serie, line='r')
    plt.title('qqplot ' + label)
    plt.show()

    # Normality tests
    from scipy.stats import shapiro
    from statsmodels.stats.diagnostic import kstest_normal

    shapiro_result = shapiro(pd_serie)
    ks_result = kstest_normal(pd_serie, dist='norm')

    print('results normal tests', label)
    print('shapiro p-value', shapiro_result[1])
    print('KS p-value', ks_result[1])
    print()

    print('Test used scipy.stats.shapiro and statsmodels.stats.diagnostic.ktest_normal')
    print()

    results = {'shapiro': shapiro_result, 'ktest': ks_result}
    return results
Example #8
0
def check_normal(pd_serie, bins=None):
    """Run Shapiro-Wilk and ``kstest_normal`` on a series and report them.

    Parameters:
        pd_serie: data to test; its ``name`` attribute labels the printed
            report when it is a string.
        bins: accepted but not used by this function.

    Returns:
        dict with keys ``'shapiro'`` and ``'ktest'`` holding whatever the
        respective test functions returned.
    """
    # Non-string names (e.g. None or an integer label) are shown as empty.
    label = pd_serie.name
    if not isinstance(label, str):
        label = ''

    shapiro_result = shapiro(pd_serie)
    ks_result = kstest_normal(pd_serie, dist='norm')

    # Index 1 of each result is the p-value.
    print('results normal tests', label)
    print('shapiro p-value', shapiro_result[1])
    print('KS p-value', ks_result[1])
    print()

    print('Test used scipy.stats.shapiro and statsmodels.stats.diagnostic.ktest_normal')
    print()

    results = {'shapiro': shapiro_result, 'ktest': ks_result}
    return results
Example #9
0
def entender_2017(df_2017):
    """Explore the height ('Altura') column of the 2017 running-back data.

    Steps, in order:
      1. Print mean and median height of active players ('Status' == 'ACT')
         with more than 100 yards ('Yardas').
      2. Draw a density histogram of all heights with a fitted normal pdf,
         plus a normal Q-Q plot, and show both figures.
      3. Run ``kstest_normal`` and Shapiro-Wilk on all heights and print the
         results; draw the cumulative histogram against the fitted normal
         cdf and show it.
      4. Draw an annotated boxplot of the heights of active players with
         more than 800 yards and print their mean, median and mode. This
         last figure is created but not shown here.

    Parameters:
        df_2017: DataFrame providing the 'Altura', 'Status' and 'Yardas'
            columns.

    Returns:
        The rows of ``df_2017`` for active players with more than 800 yards.
    """
    style.use('ggplot')
    altura = df_2017['Altura']

    # Active players with more than 100 yards
    activos_100 = altura.loc[(df_2017['Status']=='ACT') & (df_2017['Yardas'] > 100)]
    print('Media de la estatura de los runningbacks de la temporada 2017: ')
    print(activos_100.mean())
    print('Mediana de la estatura de los runningbacks de la temporada 2017: ')
    print(activos_100.median())

    espacio = ' '

    # --- Figure 1: density histogram with the fitted normal pdf ---
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # ``density`` replaces the ``normed`` keyword, which newer matplotlib
    # releases no longer accept.
    _, bins, _ = ax.hist(altura, bins = 9, color = 'orange', density = True)
    x_min, x_max = min(bins), max(bins)
    lnspc = np.linspace(x_min, x_max, len(altura))
    mu, sigma = stats.norm.fit(altura)
    pdf = stats.norm.pdf(lnspc, mu, sigma)
    ax.plot(lnspc, pdf, color = 'dodgerblue')
    fig.subplots_adjust(bottom = 0.10)
    ax.text(x = 165, y = -0.008,
            s = '   ©Mauricio Mani' + espacio*150 + 'Source: NFL: www.nfl.com/players   ',
            fontsize = 12, color = '#f0f0f0', backgroundcolor = 'grey')
    ax.text(x = 168, y = 0.087, s = "Estatura de runningbacks activos del 2017 - NFL ",
            fontsize = 23, weight = 'bold', alpha = .75)
    ax.text(x = 167, y = 0.083,
            s = 'Histograma de la estatura de corredores activos comparados con una distribución normal',
            fontsize = 16, alpha = .85)

    # --- Figure 2: normal Q-Q plot ---
    fig = plt.figure()
    ax = fig.add_subplot(111)
    stats.probplot(altura, plot=plt)
    ax.set_title('Normal Q-Q plot')
    plt.show()

    # Kolmogorov-Smirnov test for normality (goodness of fit), similar to
    # Lilliefors, but with the KS distribution.
    ks, p_v = kstest_normal(altura)
    print('El valor de la prueba Kolmogorov-Smirnov es: ' + str(ks))
    print('El valor p de la prueba es: ' + str(p_v))

    # --- Figure 3: cumulative histogram against the fitted normal cdf ---
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(altura, bins = 9, cumulative=True, density=True, histtype = 'step', linewidth=1.4)
    cdf = stats.norm.cdf(lnspc, mu, sigma)
    ax.plot(lnspc, cdf, color = 'dodgerblue')
    # Limits passed positionally: the ``xmin``/``xmax`` keywords are not
    # accepted by newer matplotlib releases.
    plt.xlim(x_min, x_max)
    fig.subplots_adjust(bottom = 0.10)
    ax.text(x = 166, y = -0.10,
            s = '   ©Mauricio Mani' + espacio*130 + 'Source: NFL: www.nfl.com/players   ',
            fontsize = 12, color = '#f0f0f0', backgroundcolor = 'grey')
    ax.text(x = 167, y = 1.12, s = "Distribución acumulada de la estatura de corredores 2017 - NFL",
            fontsize = 21, weight = 'bold', alpha = .75)
    ax.text(x = 170, y = 1.05,
            s = 'Para uso de la prueba de normalidad Kolmogorov-Smirnov y Lilliefors.',
            fontsize = 16, alpha = .85)
    ax.text(x = 169.5, y = 0.8,
            s = 'Prueba Kolmogorov-Smirnov: ' + str(ks),
            fontsize = 10)
    ax.text(x = 169.5, y = 0.75,
            s = 'P-value: ' + str(p_v),
            fontsize = 10)
    sw, sw_pv = stats.shapiro(altura)
    print('El valor de la prueba Shapiro-Wilk es: ' + str(sw))
    print('El valor p de la prueba es: ' + str(sw_pv))
    plt.show()

    # --- Figure 4: boxplot for active players with more than 800 yards ---
    mask_800 = (df_2017['Status']=='ACT') & (df_2017['Yardas'] > 800)
    altura_800 = df_2017['Altura'].loc[mask_800]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.boxplot(altura_800)
    print('Estadisticas básicas: ')
    print('Media: ')
    print(altura_800.mean())
    print('Meidana: ')
    print(altura_800.median())
    print('Moda: ')
    print(altura_800.mode())
    # The median and the mode are very similar.
    # The boxplot is also useful for checking the normality of a dataset.
    plt.xticks([1], ['Mas de 800 yardas'])
    fig.subplots_adjust(bottom = 0.10)
    ax.text(x = 0.5, y = 169.5,
            s = '   ©Mauricio Mani' + espacio*120 + 'Source: NFL: www.nfl.com/players   ',
            fontsize = 12, color = '#f0f0f0', backgroundcolor = 'grey')
    ax.text(x = 0.5, y = 195, s = "Entender como funcionan los diagramas de caja - Boxplot",
            fontsize = 22, weight = 'bold', alpha = .75)
    ax.text(x = 0.5, y = 193,
            s = 'Con un ejemplo sencillo de la National Football League con la estatura de los corredores con mas\nde 800 yardas en la temporada 2017.',
            fontsize = 15, alpha = .85)

    # (label, arrow tip, label position) for each boxplot annotation; the
    # coordinates are hard-coded for this particular dataset.
    anotaciones = [
        ('Esta es la mediana\n"NO es la media"', (1.075, 180), (1.15, 180)),
        ('Q1 o 25% de los datos', (1.075, 177), (1.15, 177)),
        ('Q3 o 75% de los datos', (0.925, 182.6), (0.72, 182.6)),
        ('Q3 + 1.5 * Rango Intercuatil', (1.04, 185.9), (1.1, 185.9)),
        ('Q1 - 1.5 * Rango Intercuartil', (0.96, 172.5), (0.7, 172.5)),
        ('Latavius Murray\nOutlier', (1, 191.8), (0.85, 189)),
    ]
    for texto, punta, posicion in anotaciones:
        # The label is passed positionally because its keyword name differs
        # between matplotlib versions (``s`` in older ones, ``text`` in
        # newer ones).
        ax.annotate(texto, xy = punta, xytext = posicion,
                    arrowprops=dict(facecolor='red', color='red'))

    return df_2017.loc[mask_800]
def Fig_OLS_Checks():

    #fs = 10 # font size used across figures
    #color = str()
    #OrC = 'open'

    SampSizes = [5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    Iterations = 100

    fig = plt.figure(figsize=(12, 8))

    # MODEL PARAMETERS
    Rare_MacIntercept_pVals = [] # List to hold coefficient p-values
    Rare_MacIntercept_Coeffs = [] # List to hold coefficients

    Rich_MacIntercept_pVals = []
    Rich_MacIntercept_Coeffs = []

    Dom_MacIntercept_pVals = []
    Dom_MacIntercept_Coeffs = []

    Even_MacIntercept_pVals = []
    Even_MacIntercept_Coeffs = []

    Rare_MicIntercept_pVals = []
    Rare_MicIntercept_Coeffs = []

    Rich_MicIntercept_pVals = []
    Rich_MicIntercept_Coeffs = []

    Dom_MicIntercept_pVals = []
    Dom_MicIntercept_Coeffs = []

    Even_MicIntercept_pVals = []
    Even_MicIntercept_Coeffs = []


    Rare_MacSlope_pVals = []
    Rare_MacSlope_Coeffs = []

    Rich_MacSlope_pVals = []
    Rich_MacSlope_Coeffs = []

    Dom_MacSlope_pVals = []
    Dom_MacSlope_Coeffs = []

    Even_MacSlope_pVals = []
    Even_MacSlope_Coeffs = []

    Rare_MicSlope_pVals = []
    Rare_MicSlope_Coeffs = []

    Rich_MicSlope_pVals = []
    Rich_MicSlope_Coeffs = []

    Dom_MicSlope_pVals = []
    Dom_MicSlope_Coeffs = []

    Even_MicSlope_pVals = []
    Even_MicSlope_Coeffs = []


    RareR2List = [] # List to hold model R2
    RarepFList = [] # List to hold significance of model R2
    RichR2List = [] # List to hold model R2
    RichpFList = [] # List to hold significance of model R2
    DomR2List = [] # List to hold model R2
    DompFList = [] # List to hold significance of model R2
    EvenR2List = [] # List to hold model R2
    EvenpFList = [] # List to hold significance of model R2

    # ASSUMPTIONS OF LINEAR REGRESSION
    # 1. Error in predictor variables is negligible...presumably yes
    # 2. Variables are measured at the continuous level...yes

    # 3. The relationship is linear
    #RarepLinListHC = []
    RarepLinListRainB = []
    RarepLinListLM = []
    #RichpLinListHC = []
    RichpLinListRainB = []
    RichpLinListLM = []
    #DompLinListHC = []
    DompLinListRainB = []
    DompLinListLM = []
    #EvenpLinListHC = []
    EvenpLinListRainB = []
    EvenpLinListLM = []

    # 4. There are no significant outliers...need to find tests or measures

    # 5. Independence of observations (no serial correlation in residuals)
    RarepCorrListBG = []
    RarepCorrListF = []
    RichpCorrListBG = []
    RichpCorrListF = []
    DompCorrListBG = []
    DompCorrListF = []
    EvenpCorrListBG = []
    EvenpCorrListF = []

    # 6. Homoscedacticity
    RarepHomoHW = []
    RarepHomoHB = []
    RichpHomoHW = []
    RichpHomoHB = []
    DompHomoHW = []
    DompHomoHB = []
    EvenpHomoHW = []
    EvenpHomoHB = []

    # 7. Normally distributed residuals (errors)
    RarepNormListOmni = [] # Omnibus test for normality
    RarepNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
    RarepNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    RarepNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

    RichpNormListOmni = [] # Omnibus test for normality
    RichpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
    RichpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    RichpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

    DompNormListOmni = [] # Omnibus test for normality
    DompNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
    DompNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    DompNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

    EvenpNormListOmni = [] # Omnibus test for normality
    EvenpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
    EvenpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    EvenpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

    NLIST = []

    for SampSize in SampSizes:

        sRare_MacIntercept_pVals = [] # List to hold coefficient p-values
        sRare_MacIntercept_Coeffs = [] # List to hold coefficients

        sRich_MacIntercept_pVals = [] # List to hold coefficient p-values
        sRich_MacIntercept_Coeffs = [] # List to hold coefficients

        sDom_MacIntercept_pVals = []
        sDom_MacIntercept_Coeffs = []

        sEven_MacIntercept_pVals = []
        sEven_MacIntercept_Coeffs = []

        sRare_MicIntercept_pVals = []
        sRare_MicIntercept_Coeffs = []

        sRich_MicIntercept_pVals = []
        sRich_MicIntercept_Coeffs = []

        sDom_MicIntercept_pVals = []
        sDom_MicIntercept_Coeffs = []

        sEven_MicIntercept_pVals = []
        sEven_MicIntercept_Coeffs = []


        sRare_MacSlope_pVals = []
        sRare_MacSlope_Coeffs = []

        sRich_MacSlope_pVals = []
        sRich_MacSlope_Coeffs = []

        sDom_MacSlope_pVals = []
        sDom_MacSlope_Coeffs = []

        sEven_MacSlope_pVals = []
        sEven_MacSlope_Coeffs = []

        sRare_MicSlope_pVals = []
        sRare_MicSlope_Coeffs = []

        sRich_MicSlope_pVals = []
        sRich_MicSlope_Coeffs = []

        sDom_MicSlope_pVals = []
        sDom_MicSlope_Coeffs = []

        sEven_MicSlope_pVals = []
        sEven_MicSlope_Coeffs = []


        sRareR2List = [] # List to hold model R2
        sRarepFList = [] # List to hold significance of model R2
        sRichR2List = [] # List to hold model R2
        sRichpFList = [] # List to hold significance of model R2
        sDomR2List = [] # List to hold model R2
        sDompFList = [] # List to hold significance of model R2
        sEvenR2List = [] # List to hold model R2
        sEvenpFList = [] # List to hold significance of model R2

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes

        # 3. The relationship is linear
        #sRarepLinListHC = []
        sRarepLinListRainB = []
        sRarepLinListLM = []
        #sRichpLinListHC = []
        sRichpLinListRainB = []
        sRichpLinListLM = []
        #sDompLinListHC = []
        sDompLinListRainB = []
        sDompLinListLM = []
        #sEvenpLinListHC = []
        sEvenpLinListRainB = []
        sEvenpLinListLM = []

        # 4. There are no significant outliers...need to find tests or measures

        # 5. Independence of observations (no serial correlation in residuals)
        sRarepCorrListBG = []
        sRarepCorrListF = []
        sRichpCorrListBG = []
        sRichpCorrListF = []
        sDompCorrListBG = []
        sDompCorrListF = []
        sEvenpCorrListBG = []
        sEvenpCorrListF = []

        # 6. Homoscedacticity
        sRarepHomoHW = []
        sRarepHomoHB = []
        sRichpHomoHW = []
        sRichpHomoHB = []
        sDompHomoHW = []
        sDompHomoHB = []
        sEvenpHomoHW = []
        sEvenpHomoHB = []

        # 7. Normally distributed residuals (errors)
        sRarepNormListOmni = [] # Omnibus test for normality
        sRarepNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
        sRarepNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sRarepNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

        sRichpNormListOmni = [] # Omnibus test for normality
        sRichpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
        sRichpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sRichpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

        sDompNormListOmni = [] # Omnibus test for normality
        sDompNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
        sDompNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sDompNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance

        sEvenpNormListOmni = [] # Omnibus test for normality
        sEvenpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality
        sEvenpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sEvenpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance


        for iteration in range(Iterations):

            Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [[], [], [], [], [], [], []]
            klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [[], [], [], [], [], [], []]
            NmaxList, rareSkews, KindList = [[], [], []]
            NSlist = []

            ct = 0
            radDATA = []
            datasets = []
            GoodNames = ['EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA'] # all microbe data is MGRAST


            mlist = ['micro', 'macro']
            for m in mlist:
                for name in os.listdir(mydir +'data/'+m):
                    if name in GoodNames: pass
                    else: continue
                    path = mydir+'data/'+m+'/'+name+'/'+name+'-SADMetricData.txt'
                    num_lines = sum(1 for line in open(path))
                    datasets.append([name, m, num_lines])

            numMac = 0
            numMic = 0

            radDATA = []

            for d in datasets:

                name, kind, numlines = d
                lines = []
                lines = np.random.choice(range(1, numlines+1), SampSize, replace=True)

                path = mydir+'data/'+kind+'/'+name+'/'+name+'-SADMetricData.txt'

                for line in lines:
                    data = linecache.getline(path, line)
                    radDATA.append(data)

                #print name, kind, numlines, len(radDATA)

            for data in radDATA:

                data = data.split()
                if len(data) == 0:
                    print 'no data'
                    continue

                name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data


                N = float(N)
                S = float(S)

                Nlist.append(float(np.log(N)))
                Slist.append(float(np.log(S)))
                NSlist.append(float(np.log(N/S)))

                Evarlist.append(float(np.log(float(Evar))))
                ESimplist.append(float(np.log(float(ESimp))))
                KindList.append(kind)

                BPlist.append(float(BP))
                NmaxList.append(float(np.log(float(BP)*float(N))))
                EHeiplist.append(float(EHeip))

                # lines for the log-modulo transformation of skewnness
                skew = float(skew)
                sign = 1
                if skew < 0: sign = -1

                lms = np.log(np.abs(skew) + 1)
                lms = lms * sign
                #if lms > 3: print name, N, S
                rareSkews.append(float(lms))

                if kind == 'macro': numMac += 1
                elif kind == 'micro': numMic += 1

                ct+=1


            #print 'Sample Size:',SampSize, ' Mic:', numMic,'Mac:', numMac

            # Multiple regression for Rarity
            d = pd.DataFrame({'N': list(Nlist)})
            d['Rarity'] = list(rareSkews)
            d['Kind'] = list(KindList)

            RarityResults = smf.ols('Rarity ~ N * Kind', d).fit() # Fit the dummy variable regression model
            #print RarityResults.summary(), '\n'

            # Multiple regression for Rarity
            d = pd.DataFrame({'N': list(Nlist)})
            d['Richness'] = list(Slist)
            d['Kind'] = list(KindList)

            RichnessResults = smf.ols('Richness ~ N * Kind', d).fit() # Fit the dummy variable regression model
            #print RichnessResults.summary(), '\n'

            # Multiple regression for Dominance
            d = pd.DataFrame({'N': list(Nlist)})
            d['Dominance'] = list(NmaxList)
            d['Kind'] = list(KindList)

            DomResults = smf.ols('Dominance ~ N * Kind', d).fit() # Fit the dummy variable regression model
            #print DomResults.summary(), '\n'

            # Multiple regression for Evenness
            d = pd.DataFrame({'N': list(Nlist)})
            d['Evenness'] = list(ESimplist)
            d['Kind'] = list(KindList)

            EvenResults = smf.ols('Evenness ~ N * Kind', d).fit() # Fit the dummy variable regression model
            #print RarityResults.summary(), '\n'

            RareResids = RarityResults.resid # residuals of the model
            RichResids = RichnessResults.resid # residuals of the model
            DomResids = DomResults.resid # residuals of the model
            EvenResids = EvenResults.resid # residuals of the model

            # MODEL RESULTS/FIT
            RareFpval = RarityResults.f_pvalue
            Rarer2 = RarityResults.rsquared # coefficient of determination
            #Adj_r2 = RareResults.rsquared_adj # adjusted
            RichFpval = RichnessResults.f_pvalue
            Richr2 = RichnessResults.rsquared # coefficient of determination
            #Adj_r2 = RichnessResults.rsquared_adj # adjusted

            DomFpval = DomResults.f_pvalue
            Domr2 = DomResults.rsquared # coefficient of determination
            #Adj_r2 = DomResults.rsquared_adj # adjusted
            EvenFpval = EvenResults.f_pvalue
            Evenr2 = EvenResults.rsquared # coefficient of determination
            #Adj_r2 = EvenResuls.rsquared_adj # adjusted

            # MODEL PARAMETERS and p-values
            Rareparams = RarityResults.params
            Rareparams = Rareparams.tolist()
            Rarepvals = RarityResults.pvalues
            Rarepvals = Rarepvals.tolist()

            Richparams = RichnessResults.params
            Richparams = Richparams.tolist()
            Richpvals = RichnessResults.pvalues
            Richpvals = Richpvals.tolist()

            Domparams = DomResults.params
            Domparams = Domparams.tolist()
            Dompvals = DomResults.pvalues
            Dompvals = Dompvals.tolist()

            Evenparams = EvenResults.params
            Evenparams = Evenparams.tolist()
            Evenpvals = EvenResults.pvalues
            Evenpvals = Evenpvals.tolist()


            sRare_MacIntercept_pVals.append(Rarepvals[0])
            sRare_MacIntercept_Coeffs.append(Rareparams[0])

            sRich_MacIntercept_pVals.append(Rarepvals[0])
            sRich_MacIntercept_Coeffs.append(Rareparams[0])

            sDom_MacIntercept_pVals.append(Dompvals[0])
            sDom_MacIntercept_Coeffs.append(Domparams[0])

            sEven_MacIntercept_pVals.append(Evenpvals[0])
            sEven_MacIntercept_Coeffs.append(Evenparams[0])

            sRare_MicIntercept_pVals.append(Rarepvals[1])
            if Rarepvals[1] > 0.05:
                sRare_MicIntercept_Coeffs.append(Rareparams[1])
            else:
                sRare_MicIntercept_Coeffs.append(Rareparams[1])

            sRich_MicIntercept_pVals.append(Richpvals[1])
            if Richpvals[1] > 0.05:
                sRich_MicIntercept_Coeffs.append(Richparams[1])
            else:
                sRich_MicIntercept_Coeffs.append(Richparams[1])

            sDom_MicIntercept_pVals.append(Dompvals[1])
            if Dompvals[1] > 0.05:
                sDom_MicIntercept_Coeffs.append(Domparams[1])
            else:
                sDom_MicIntercept_Coeffs.append(Domparams[1])

            sEven_MicIntercept_pVals.append(Evenpvals[1])
            if Evenpvals[1] > 0.05:
                sEven_MicIntercept_Coeffs.append(Evenparams[1])
            else:
                sEven_MicIntercept_Coeffs.append(Evenparams[1])


            sRare_MacSlope_pVals.append(Rarepvals[2])
            sRare_MacSlope_Coeffs.append(Rareparams[2])

            sRich_MacSlope_pVals.append(Richpvals[2])
            sRich_MacSlope_Coeffs.append(Richparams[2])

            sDom_MacSlope_pVals.append(Dompvals[2])
            sDom_MacSlope_Coeffs.append(Domparams[2])

            sEven_MacSlope_pVals.append(Evenpvals[2])
            sEven_MacSlope_Coeffs.append(Evenparams[2])


            sRare_MicSlope_pVals.append(Rarepvals[3])
            if Rarepvals[3] > 0.05:
                sRare_MicSlope_Coeffs.append(Rareparams[3])
            else:
                sRare_MicSlope_Coeffs.append(Rareparams[3])

            sRich_MicSlope_pVals.append(Richpvals[3])
            if Richpvals[3] > 0.05:
                sRich_MicSlope_Coeffs.append(Richparams[3])
            else:
                sRich_MicSlope_Coeffs.append(Richparams[3])

            sDom_MicSlope_pVals.append(Dompvals[3])
            if Dompvals[3] > 0.05:
                sDom_MicSlope_Coeffs.append(Domparams[3])
            else:
                sDom_MicSlope_Coeffs.append(Domparams[3])

            sEven_MicSlope_pVals.append(Evenpvals[3])
            if Evenpvals[3] > 0.05:
                sEven_MicSlope_Coeffs.append(Evenparams[3])
            else:
                sEven_MicSlope_Coeffs.append(Evenparams[3])

            sRareR2List.append(Rarer2)
            sRarepFList.append(RareFpval)
            sRichR2List.append(Richr2)
            sRichpFList.append(RichFpval)
            sDomR2List.append(Domr2)
            sDompFList.append(DomFpval)
            sEvenR2List.append(Evenr2)
            sEvenpFList.append(EvenFpval)

            # TESTS OF LINEAR REGRESSION ASSUMPTIONS
            # Error in predictor variables is negligible...Presumably Yes
            # Variables are measured at the continuous level...Definitely Yes

            # TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY MODELED AS LINEAR
            #HC = smd.linear_harvey_collier(RarityResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sRarepLinListHC.append(HC)
            #HC = smd.linear_harvey_collier(DomResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sDompLinListHC.append(HC)
            #HC = smd.linear_harvey_collier(EvenResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sEvenpLinListHC.append(HC)

            RB = smd.linear_rainbow(RarityResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sRarepLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(RichnessResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sRichpLinListRainB.append(RB[1])

            RB = smd.linear_rainbow(DomResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sDompLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(EvenResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sEvenpLinListRainB.append(RB[1])

            LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog) # Lagrangian multiplier test for linearity
            sRarepLinListLM.append(LM[1])
            LM = smd.linear_lm(RichnessResults.resid, RichnessResults.model.exog) # Lagrangian multiplier test for linearity
            sRichpLinListLM.append(LM[1])

            LM = smd.linear_lm(DomResults.resid, DomResults.model.exog) # Lagrangian multiplier test for linearity
            sDompLinListLM.append(LM[1])
            LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog) # Lagrangian multiplier test for linearity
            sEvenpLinListLM.append(LM[1])

            # INDEPENDENCE OF OBSERVATIONS (no serial correlation in residuals)
            BGtest = smd.acorr_breush_godfrey(RarityResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
                                # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(RareResids, lags=None, boxpierce=True)
            sRarepCorrListBG.append(BGtest[1])
            sRarepCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(RichnessResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
                                # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(RichResids, lags=None, boxpierce=True)
            sRichpCorrListBG.append(BGtest[1])
            sRichpCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(DomResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
                                # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(DomResids, lags=None, boxpierce=True)
            sDompCorrListBG.append(BGtest[1])
            sDompCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(EvenResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
                                # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(EvenResids, lags=None, boxpierce=True)
            sEvenpCorrListBG.append(BGtest[1])
            sEvenpCorrListF.append(BGtest[3])

            # There are no significant outliers...Need tests or measures/metrics

            # HOMOSCEDASTICITY

            # These tests return:
            # 1. lagrange multiplier statistic,
            # 2. p-value of lagrange multiplier test,
            # 3. f-statistic of the hypothesis that the error variance does not depend on x,
            # 4. p-value for the f-statistic

            HW = sms.het_white(RareResids, RarityResults.model.exog)
            sRarepHomoHW.append(HW[3])
            HW = sms.het_white(RichResids, RichnessResults.model.exog)
            sRichpHomoHW.append(HW[3])

            HW = sms.het_white(DomResids, DomResults.model.exog)
            sDompHomoHW.append(HW[3])
            HW = sms.het_white(EvenResids, EvenResults.model.exog)
            sEvenpHomoHW.append(HW[3])

            HB = sms.het_breushpagan(RareResids, RarityResults.model.exog)
            sRarepHomoHB.append(HB[3])
            HB = sms.het_breushpagan(RichResids, RichnessResults.model.exog)
            sRichpHomoHB.append(HB[3])

            HB = sms.het_breushpagan(DomResids, DomResults.model.exog)
            sDompHomoHB.append(HB[3])
            HB = sms.het_breushpagan(EvenResids, EvenResults.model.exog)
            sEvenpHomoHB.append(HB[3])

            # 7. NORMALITY OF ERROR TERMS
            O = sms.omni_normtest(RareResids)
            sRarepNormListOmni.append(O[1])
            O = sms.omni_normtest(RichResids)
            sRichpNormListOmni.append(O[1])
            O = sms.omni_normtest(DomResids)
            sDompNormListOmni.append(O[1])
            O = sms.omni_normtest(EvenResids)
            sEvenpNormListOmni.append(O[1])

            JB = sms.jarque_bera(RareResids)
            sRarepNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(RichResids)
            sRichpNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(DomResids)
            sDompNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(EvenResids)
            sEvenpNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality

            KS = smd.kstest_normal(RareResids)
            sRarepNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(RichResids)
            sRichpNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(DomResids)
            sDompNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(EvenResids)
            sEvenpNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance

            AD = smd.normal_ad(RareResids)
            sRarepNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(RichResids)
            sRichpNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(DomResids)
            sDompNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(EvenResids)
            sEvenpNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance

            print 'Sample size:',SampSize, 'iteration:',iteration

        NLIST.append(SampSize)

        Rare_MacIntercept_pVals.append(np.mean(sRare_MacIntercept_pVals)) # List to hold coefficient p-values
        Rare_MacIntercept_Coeffs.append(np.mean(sRare_MacIntercept_Coeffs)) # List to hold coefficients

        Rich_MacIntercept_pVals.append(np.mean(sRich_MacIntercept_pVals)) # List to hold coefficient p-values
        Rich_MacIntercept_Coeffs.append(np.mean(sRich_MacIntercept_Coeffs)) # List to hold coefficients

        Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals))
        Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs))

        Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals))
        Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs))

        Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals))
        Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs))

        Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals))
        Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs))

        Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals))
        Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs))

        Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals))
        Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs))

        Rare_MacSlope_pVals.append(np.mean(sRare_MacSlope_pVals)) # List to hold coefficient p-values
        Rare_MacSlope_Coeffs.append(np.mean(sRare_MacSlope_Coeffs)) # List to hold coefficients

        Rich_MacSlope_pVals.append(np.mean(sRich_MacSlope_pVals)) # List to hold coefficient p-values
        Rich_MacSlope_Coeffs.append(np.mean(sRich_MacSlope_Coeffs)) # List to hold coefficients

        Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals))
        Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs))

        Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals))
        Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs))

        Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals))
        Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs))

        Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals))
        Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs))

        Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals))
        Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs))

        Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals))
        Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs))


        RareR2List.append(np.mean(sRareR2List))
        RarepFList.append(np.mean(sRarepFList))
        RichR2List.append(np.mean(sRichR2List))
        RichpFList.append(np.mean(sRichpFList))
        DomR2List.append(np.mean(sDomR2List))
        DompFList.append(np.mean(sDompFList))
        EvenR2List.append(np.mean(sEvenR2List))
        EvenpFList.append(np.mean(sEvenpFList))

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes

        # 3. The relationship is linear
        #RarepLinListHC.append(np.mean(sRarepLinListHC))
        RarepLinListRainB.append(np.mean(sRarepLinListRainB))
        RarepLinListLM.append(np.mean(sRarepLinListLM))
        #RichpLinListHC.append(np.mean(sRichpLinListHC))
        RichpLinListRainB.append(np.mean(sRichpLinListRainB))
        RichpLinListLM.append(np.mean(sRichpLinListLM))
        #DompLinListHC.append(np.mean(sDompLinListHC))
        DompLinListRainB.append(np.mean(sDompLinListRainB))
        DompLinListLM.append(np.mean(sDompLinListLM))
        #EvenpLinListHC.append(np.mean(sEvenpLinListHC))
        EvenpLinListRainB.append(np.mean(sEvenpLinListRainB))
        EvenpLinListLM.append(np.mean(sEvenpLinListLM))

        # 4. There are no significant outliers...need to find tests or measures

        # 5. Independence of observations (no serial correlation in residuals)
        RarepCorrListBG.append(np.mean(sRarepCorrListBG))
        RarepCorrListF.append(np.mean(sRarepCorrListF))
        RichpCorrListBG.append(np.mean(sRichpCorrListBG))
        RichpCorrListF.append(np.mean(sRichpCorrListF))
        DompCorrListBG.append(np.mean(sDompCorrListBG))
        DompCorrListF.append(np.mean(sDompCorrListF))
        EvenpCorrListBG.append(np.mean(sEvenpCorrListBG))
        EvenpCorrListF.append(np.mean(sEvenpCorrListF))

        # 6. Homoscedasticity
        RarepHomoHW.append(np.mean(sRarepHomoHW))
        RarepHomoHB.append(np.mean(sRarepHomoHB))
        RichpHomoHB.append(np.mean(sRichpHomoHB))
        RichpHomoHW.append(np.mean(sRichpHomoHW))
        DompHomoHW.append(np.mean(sDompHomoHW))
        DompHomoHB.append(np.mean(sDompHomoHB))
        EvenpHomoHW.append(np.mean(sEvenpHomoHW))
        EvenpHomoHB.append(np.mean(sEvenpHomoHB))

        # 7. Normally distributed residuals (errors)
        RarepNormListOmni.append(np.mean(sRarepNormListOmni))
        RarepNormListJB.append(np.mean(sRarepNormListJB))
        RarepNormListKS.append(np.mean(sRarepNormListKS))
        RarepNormListAD.append(np.mean(sRarepNormListAD))

        RichpNormListOmni.append(np.mean(sRichpNormListOmni))
        RichpNormListJB.append(np.mean(sRichpNormListJB))
        RichpNormListKS.append(np.mean(sRichpNormListKS))
        RichpNormListAD.append(np.mean(sRichpNormListAD))

        DompNormListOmni.append(np.mean(sDompNormListOmni))
        DompNormListJB.append(np.mean(sDompNormListJB))
        DompNormListKS.append(np.mean(sDompNormListKS))
        DompNormListAD.append(np.mean(sDompNormListAD))

        EvenpNormListOmni.append(np.mean(sEvenpNormListOmni))
        EvenpNormListJB.append(np.mean(sEvenpNormListJB))
        EvenpNormListKS.append(np.mean(sEvenpNormListKS))
        EvenpNormListAD.append(np.mean(sEvenpNormListAD))


    fig.add_subplot(4, 3, 1)
    plt.xlim(min(SampSizes)-1,max(SampSizes)+10)
    plt.ylim(0,1)
    plt.xscale('log')
    # Rarity    R2 vs. Sample Size
    plt.plot(NLIST,RareR2List,  c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4,prop={'size':14})
    leg.draw_frame(False)


    fig.add_subplot(4, 3, 2)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    plt.xscale('log')
    plt.ylim(0.0, 0.16)
    # Rarity    Coeffs vs. Sample Size
    plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rare_MacSlope_Coeffs,  c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10,prop={'size':8})
    leg.draw_frame(False)


    fig.add_subplot(4, 3, 3)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    plt.ylim(0.0, 0.6)
    plt.xscale('log')
    # Rarity    p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(RarepLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST,RarepLinListRainB,  c='m')
    plt.plot(NLIST,RarepLinListLM,  c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST,RarepCorrListBG,  c='c')
    plt.plot(NLIST,RarepCorrListF,  c='c', ls='-', label='autocorrelation')

    # 6. Homoscedasticity
    plt.plot(NLIST,RarepHomoHW,  c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST,RarepHomoHB,  c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST,RarepNormListOmni,  c='Lime', ls='-', label='normality')
    #plt.plot(NLIST,RarepNormListJB,  c='Lime', ls='-')
    #plt.plot(NLIST,RarepNormListKS,  c='Lime', ls='--', lw=3)
    #plt.plot(NLIST,RarepNormListAD,  c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')

    leg = plt.legend(loc=1,prop={'size':8})
    leg.draw_frame(False)


    fig.add_subplot(4, 3, 4)
    plt.xscale('log')
    plt.ylim(0,1)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Dominance     R2 vs. Sample Size
    plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16)

    leg = plt.legend(loc=4,prop={'size':14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 5)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Dominance     Coeffs vs. Sample Size
    plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Dom_MacSlope_Coeffs,  c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')

    leg = plt.legend(loc=10,prop={'size':8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 6)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    plt.xscale('log')
    #plt.yscale('log')
    plt.ylim(0, 0.6)
    # Dominance     p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(DompLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST, DompLinListRainB, c='m')
    plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, DompCorrListBG, c='c')
    plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedasticity
    plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, DompHomoHB, c='r',ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, DompNormListJB, c='Lime', ls='-')
    #plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3)
    #plt.plot(NLIST, DompNormListAD, c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1,prop={'size':8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 7)
    plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16)
    plt.xscale('log')
    plt.ylim(0,1)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Evenness      R2 vs. Sample Size
    plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    leg = plt.legend(loc=4,prop={'size':14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 8)
    plt.ylim(-0.25, 0.0)
    plt.xscale('log')
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Evenness      Coeffs vs. Sample Size
    plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Even_MacSlope_Coeffs,  c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10,prop={'size':8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 9)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    plt.xscale('log')
    plt.ylim(0.0, 0.3)
    # Evenness      p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(EvenpLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST, EvenpLinListRainB, c='m')
    plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, EvenpCorrListBG, c='c')
    plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedasticity
    plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, EvenpHomoHB, c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-')
    #plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3)
    #plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1,prop={'size':8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 10)
    plt.xscale('log')
    plt.ylim(0,1)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Richness      R2 vs. Sample Size
    plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.xlabel('Sample size', fontsize=14)
    plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16)

    leg = plt.legend(loc=4,prop={'size':14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 11)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    # Richness    Coeffs vs. Sample Size
    plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rich_MacSlope_Coeffs,  c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    plt.xlabel('Sample size', fontsize=14)

    leg = plt.legend(loc=10,prop={'size':8})
    leg.draw_frame(False)


    fig.add_subplot(4, 3, 12)
    plt.xlim(min(SampSizes)-1, max(SampSizes)+10)
    plt.xscale('log')
    # Richness    p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(RichpLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST,RichpLinListRainB,  c='m')
    plt.plot(NLIST,RichpLinListLM,  c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST,RichpCorrListBG,  c='c')
    plt.plot(NLIST, EvenpCorrListF,  c='c', ls='-', label='autocorrelation')

    # 6. Homoscedasticity
    plt.plot(NLIST,RichpHomoHW,  c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST,RichpHomoHB,  c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST,RichpNormListOmni,  c='Lime', ls='-', label='normality')
    #plt.plot(NLIST,RichpNormListJB,  c='Lime', ls='-')
    #plt.plot(NLIST,RichpNormListKS,  c='Lime', ls='--', lw=3)
    #plt.plot(NLIST,RichpNormListAD,  c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=1,prop={'size':8})
    leg.draw_frame(False)
    #plt.tick_params(axis='both', which='major', labelsize=fs-3)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.savefig(mydir+'figs/appendix/SampleSize/SampleSizeEffects.png', dpi=600, bbox_inches = "tight")
    #plt.close()
    #plt.show()

    return
def Fig_OLS_Checks():

    #fs = 10 # font size used across figures
    #color = str()
    #OrC = 'open'

    SampSizes = [
        5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100
    ]
    Iterations = 100

    fig = plt.figure(figsize=(12, 8))

    # MODEL PARAMETERS
    Rare_MacIntercept_pVals = []  # List to hold coefficient p-values
    Rare_MacIntercept_Coeffs = []  # List to hold coefficients

    Rich_MacIntercept_pVals = []
    Rich_MacIntercept_Coeffs = []

    Dom_MacIntercept_pVals = []
    Dom_MacIntercept_Coeffs = []

    Even_MacIntercept_pVals = []
    Even_MacIntercept_Coeffs = []

    Rare_MicIntercept_pVals = []
    Rare_MicIntercept_Coeffs = []

    Rich_MicIntercept_pVals = []
    Rich_MicIntercept_Coeffs = []

    Dom_MicIntercept_pVals = []
    Dom_MicIntercept_Coeffs = []

    Even_MicIntercept_pVals = []
    Even_MicIntercept_Coeffs = []

    Rare_MacSlope_pVals = []
    Rare_MacSlope_Coeffs = []

    Rich_MacSlope_pVals = []
    Rich_MacSlope_Coeffs = []

    Dom_MacSlope_pVals = []
    Dom_MacSlope_Coeffs = []

    Even_MacSlope_pVals = []
    Even_MacSlope_Coeffs = []

    Rare_MicSlope_pVals = []
    Rare_MicSlope_Coeffs = []

    Rich_MicSlope_pVals = []
    Rich_MicSlope_Coeffs = []

    Dom_MicSlope_pVals = []
    Dom_MicSlope_Coeffs = []

    Even_MicSlope_pVals = []
    Even_MicSlope_Coeffs = []

    RareR2List = []  # List to hold model R2
    RarepFList = []  # List to hold significance of model R2
    RichR2List = []  # List to hold model R2
    RichpFList = []  # List to hold significance of model R2
    DomR2List = []  # List to hold model R2
    DompFList = []  # List to hold significance of model R2
    EvenR2List = []  # List to hold model R2
    EvenpFList = []  # List to hold significance of model R2

    # ASSUMPTIONS OF LINEAR REGRESSION
    # 1. Error in predictor variables is negligible...presumably yes
    # 2. Variables are measured at the continuous level...yes

    # 3. The relationship is linear
    #RarepLinListHC = []
    RarepLinListRainB = []
    RarepLinListLM = []
    #RichpLinListHC = []
    RichpLinListRainB = []
    RichpLinListLM = []
    #DompLinListHC = []
    DompLinListRainB = []
    DompLinListLM = []
    #EvenpLinListHC = []
    EvenpLinListRainB = []
    EvenpLinListLM = []

    # 4. There are no significant outliers...need to find tests or measures

    # 5. Independence of observations (no serial correlation in residuals)
    RarepCorrListBG = []
    RarepCorrListF = []
    RichpCorrListBG = []
    RichpCorrListF = []
    DompCorrListBG = []
    DompCorrListF = []
    EvenpCorrListBG = []
    EvenpCorrListF = []

    # 6. Homoscedasticity
    RarepHomoHW = []
    RarepHomoHB = []
    RichpHomoHW = []
    RichpHomoHB = []
    DompHomoHW = []
    DompHomoHB = []
    EvenpHomoHW = []
    EvenpHomoHB = []

    # 7. Normally distributed residuals (errors)
    RarepNormListOmni = []  # Omnibus test for normality
    RarepNormListJB = [
    ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
    RarepNormListKS = [
    ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    RarepNormListAD = [
    ]  # Anderson-Darling test for normal distribution unknown mean and variance

    RichpNormListOmni = []  # Omnibus test for normality
    RichpNormListJB = [
    ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
    RichpNormListKS = [
    ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    RichpNormListAD = [
    ]  # Anderson-Darling test for normal distribution unknown mean and variance

    DompNormListOmni = []  # Omnibus test for normality
    DompNormListJB = [
    ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
    DompNormListKS = [
    ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    DompNormListAD = [
    ]  # Anderson-Darling test for normal distribution unknown mean and variance

    EvenpNormListOmni = []  # Omnibus test for normality
    EvenpNormListJB = [
    ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
    EvenpNormListKS = [
    ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
    EvenpNormListAD = [
    ]  # Anderson-Darling test for normal distribution unknown mean and variance

    NLIST = []

    for SampSize in SampSizes:

        sRare_MacIntercept_pVals = []  # List to hold coefficient p-values
        sRare_MacIntercept_Coeffs = []  # List to hold coefficients

        sRich_MacIntercept_pVals = []  # List to hold coefficient p-values
        sRich_MacIntercept_Coeffs = []  # List to hold coefficients

        sDom_MacIntercept_pVals = []
        sDom_MacIntercept_Coeffs = []

        sEven_MacIntercept_pVals = []
        sEven_MacIntercept_Coeffs = []

        sRare_MicIntercept_pVals = []
        sRare_MicIntercept_Coeffs = []

        sRich_MicIntercept_pVals = []
        sRich_MicIntercept_Coeffs = []

        sDom_MicIntercept_pVals = []
        sDom_MicIntercept_Coeffs = []

        sEven_MicIntercept_pVals = []
        sEven_MicIntercept_Coeffs = []

        sRare_MacSlope_pVals = []
        sRare_MacSlope_Coeffs = []

        sRich_MacSlope_pVals = []
        sRich_MacSlope_Coeffs = []

        sDom_MacSlope_pVals = []
        sDom_MacSlope_Coeffs = []

        sEven_MacSlope_pVals = []
        sEven_MacSlope_Coeffs = []

        sRare_MicSlope_pVals = []
        sRare_MicSlope_Coeffs = []

        sRich_MicSlope_pVals = []
        sRich_MicSlope_Coeffs = []

        sDom_MicSlope_pVals = []
        sDom_MicSlope_Coeffs = []

        sEven_MicSlope_pVals = []
        sEven_MicSlope_Coeffs = []

        sRareR2List = []  # List to hold model R2
        sRarepFList = []  # List to hold significance of model R2
        sRichR2List = []  # List to hold model R2
        sRichpFList = []  # List to hold significance of model R2
        sDomR2List = []  # List to hold model R2
        sDompFList = []  # List to hold significance of model R2
        sEvenR2List = []  # List to hold model R2
        sEvenpFList = []  # List to hold significance of model R2

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes

        # 3. The relationship is linear
        #sRarepLinListHC = []
        sRarepLinListRainB = []
        sRarepLinListLM = []
        #sRichpLinListHC = []
        sRichpLinListRainB = []
        sRichpLinListLM = []
        #sDompLinListHC = []
        sDompLinListRainB = []
        sDompLinListLM = []
        #sEvenpLinListHC = []
        sEvenpLinListRainB = []
        sEvenpLinListLM = []

        # 4. There are no significant outliers...need to find tests or measures

        # 5. Independence of observations (no serial correlation in residuals)
        sRarepCorrListBG = []
        sRarepCorrListF = []
        sRichpCorrListBG = []
        sRichpCorrListF = []
        sDompCorrListBG = []
        sDompCorrListF = []
        sEvenpCorrListBG = []
        sEvenpCorrListF = []

        # 6. Homoscedasticity
        sRarepHomoHW = []
        sRarepHomoHB = []
        sRichpHomoHW = []
        sRichpHomoHB = []
        sDompHomoHW = []
        sDompHomoHB = []
        sEvenpHomoHW = []
        sEvenpHomoHB = []

        # 7. Normally distributed residuals (errors)
        sRarepNormListOmni = []  # Omnibus test for normality
        sRarepNormListJB = [
        ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
        sRarepNormListKS = [
        ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sRarepNormListAD = [
        ]  # Anderson-Darling test for normal distribution unknown mean and variance

        sRichpNormListOmni = []  # Omnibus test for normality
        sRichpNormListJB = [
        ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
        sRichpNormListKS = [
        ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sRichpNormListAD = [
        ]  # Anderson-Darling test for normal distribution unknown mean and variance

        sDompNormListOmni = []  # Omnibus test for normality
        sDompNormListJB = [
        ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
        sDompNormListKS = [
        ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sDompNormListAD = [
        ]  # Anderson-Darling test for normal distribution unknown mean and variance

        sEvenpNormListOmni = []  # Omnibus test for normality
        sEvenpNormListJB = [
        ]  # Calculate residual skewness, kurtosis, and do the JB test for normality
        sEvenpNormListKS = [
        ]  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
        sEvenpNormListAD = [
        ]  # Anderson-Darling test for normal distribution unknown mean and variance

        for iteration in range(Iterations):

            Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [
                [], [], [], [], [], [], []
            ]
            klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [
                [], [], [], [], [], [], []
            ]
            NmaxList, rareSkews, KindList = [[], [], []]
            NSlist = []

            ct = 0
            radDATA = []
            datasets = []
            GoodNames = [
                'EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB',
                'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS',
                'CBC', 'MCDB', 'GENTRY', 'FIA'
            ]  # all microbe data is MGRAST

            mlist = ['micro', 'macro']
            for m in mlist:
                for name in os.listdir(mydir + 'data/' + m):
                    if name in GoodNames: pass
                    else: continue
                    path = mydir + 'data/' + m + '/' + name + '/' + name + '-SADMetricData.txt'
                    num_lines = sum(1 for line in open(path))
                    datasets.append([name, m, num_lines])

            numMac = 0
            numMic = 0

            radDATA = []

            for d in datasets:

                name, kind, numlines = d
                lines = []
                lines = np.random.choice(range(1, numlines + 1),
                                         SampSize,
                                         replace=True)

                path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt'

                for line in lines:
                    data = linecache.getline(path, line)
                    radDATA.append(data)

                #print name, kind, numlines, len(radDATA)

            for data in radDATA:

                data = data.split()
                if len(data) == 0:
                    print 'no data'
                    continue

                name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data

                N = float(N)
                S = float(S)

                Nlist.append(float(np.log(N)))
                Slist.append(float(np.log(S)))
                NSlist.append(float(np.log(N / S)))

                Evarlist.append(float(np.log(float(Evar))))
                ESimplist.append(float(np.log(float(ESimp))))
                KindList.append(kind)

                BPlist.append(float(BP))
                NmaxList.append(float(np.log(float(BP) * float(N))))
                EHeiplist.append(float(EHeip))

                # lines for the log-modulo transformation of skewnness
                skew = float(skew)
                sign = 1
                if skew < 0: sign = -1

                lms = np.log(np.abs(skew) + 1)
                lms = lms * sign
                #if lms > 3: print name, N, S
                rareSkews.append(float(lms))

                if kind == 'macro': numMac += 1
                elif kind == 'micro': numMic += 1

                ct += 1

            #print 'Sample Size:',SampSize, ' Mic:', numMic,'Mac:', numMac

            # Multiple regression for Rarity
            d = pd.DataFrame({'N': list(Nlist)})
            d['Rarity'] = list(rareSkews)
            d['Kind'] = list(KindList)

            RarityResults = smf.ols(
                'Rarity ~ N * Kind',
                d).fit()  # Fit the dummy variable regression model
            #print RarityResults.summary(), '\n'

            # Multiple regression for Rarity
            d = pd.DataFrame({'N': list(Nlist)})
            d['Richness'] = list(Slist)
            d['Kind'] = list(KindList)

            RichnessResults = smf.ols(
                'Richness ~ N * Kind',
                d).fit()  # Fit the dummy variable regression model
            #print RichnessResults.summary(), '\n'

            # Multiple regression for Dominance
            d = pd.DataFrame({'N': list(Nlist)})
            d['Dominance'] = list(NmaxList)
            d['Kind'] = list(KindList)

            DomResults = smf.ols(
                'Dominance ~ N * Kind',
                d).fit()  # Fit the dummy variable regression model
            #print DomResults.summary(), '\n'

            # Multiple regression for Evenness
            d = pd.DataFrame({'N': list(Nlist)})
            d['Evenness'] = list(ESimplist)
            d['Kind'] = list(KindList)

            EvenResults = smf.ols(
                'Evenness ~ N * Kind',
                d).fit()  # Fit the dummy variable regression model
            #print RarityResults.summary(), '\n'

            RareResids = RarityResults.resid  # residuals of the model
            RichResids = RichnessResults.resid  # residuals of the model
            DomResids = DomResults.resid  # residuals of the model
            EvenResids = EvenResults.resid  # residuals of the model

            # MODEL RESULTS/FIT
            RareFpval = RarityResults.f_pvalue
            Rarer2 = RarityResults.rsquared  # coefficient of determination
            #Adj_r2 = RareResults.rsquared_adj # adjusted
            RichFpval = RichnessResults.f_pvalue
            Richr2 = RichnessResults.rsquared  # coefficient of determination
            #Adj_r2 = RichnessResults.rsquared_adj # adjusted

            DomFpval = DomResults.f_pvalue
            Domr2 = DomResults.rsquared  # coefficient of determination
            #Adj_r2 = DomResults.rsquared_adj # adjusted
            EvenFpval = EvenResults.f_pvalue
            Evenr2 = EvenResults.rsquared  # coefficient of determination
            #Adj_r2 = EvenResuls.rsquared_adj # adjusted

            # MODEL PARAMETERS and p-values
            Rareparams = RarityResults.params
            Rareparams = Rareparams.tolist()
            Rarepvals = RarityResults.pvalues
            Rarepvals = Rarepvals.tolist()

            Richparams = RichnessResults.params
            Richparams = Richparams.tolist()
            Richpvals = RichnessResults.pvalues
            Richpvals = Richpvals.tolist()

            Domparams = DomResults.params
            Domparams = Domparams.tolist()
            Dompvals = DomResults.pvalues
            Dompvals = Dompvals.tolist()

            Evenparams = EvenResults.params
            Evenparams = Evenparams.tolist()
            Evenpvals = EvenResults.pvalues
            Evenpvals = Evenpvals.tolist()

            sRare_MacIntercept_pVals.append(Rarepvals[0])
            sRare_MacIntercept_Coeffs.append(Rareparams[0])

            sRich_MacIntercept_pVals.append(Rarepvals[0])
            sRich_MacIntercept_Coeffs.append(Rareparams[0])

            sDom_MacIntercept_pVals.append(Dompvals[0])
            sDom_MacIntercept_Coeffs.append(Domparams[0])

            sEven_MacIntercept_pVals.append(Evenpvals[0])
            sEven_MacIntercept_Coeffs.append(Evenparams[0])

            sRare_MicIntercept_pVals.append(Rarepvals[1])
            if Rarepvals[1] > 0.05:
                sRare_MicIntercept_Coeffs.append(Rareparams[1])
            else:
                sRare_MicIntercept_Coeffs.append(Rareparams[1])

            sRich_MicIntercept_pVals.append(Richpvals[1])
            if Richpvals[1] > 0.05:
                sRich_MicIntercept_Coeffs.append(Richparams[1])
            else:
                sRich_MicIntercept_Coeffs.append(Richparams[1])

            sDom_MicIntercept_pVals.append(Dompvals[1])
            if Dompvals[1] > 0.05:
                sDom_MicIntercept_Coeffs.append(Domparams[1])
            else:
                sDom_MicIntercept_Coeffs.append(Domparams[1])

            sEven_MicIntercept_pVals.append(Evenpvals[1])
            if Evenpvals[1] > 0.05:
                sEven_MicIntercept_Coeffs.append(Evenparams[1])
            else:
                sEven_MicIntercept_Coeffs.append(Evenparams[1])

            sRare_MacSlope_pVals.append(Rarepvals[2])
            sRare_MacSlope_Coeffs.append(Rareparams[2])

            sRich_MacSlope_pVals.append(Richpvals[2])
            sRich_MacSlope_Coeffs.append(Richparams[2])

            sDom_MacSlope_pVals.append(Dompvals[2])
            sDom_MacSlope_Coeffs.append(Domparams[2])

            sEven_MacSlope_pVals.append(Evenpvals[2])
            sEven_MacSlope_Coeffs.append(Evenparams[2])

            sRare_MicSlope_pVals.append(Rarepvals[3])
            if Rarepvals[3] > 0.05:
                sRare_MicSlope_Coeffs.append(Rareparams[3])
            else:
                sRare_MicSlope_Coeffs.append(Rareparams[3])

            sRich_MicSlope_pVals.append(Richpvals[3])
            if Richpvals[3] > 0.05:
                sRich_MicSlope_Coeffs.append(Richparams[3])
            else:
                sRich_MicSlope_Coeffs.append(Richparams[3])

            sDom_MicSlope_pVals.append(Dompvals[3])
            if Dompvals[3] > 0.05:
                sDom_MicSlope_Coeffs.append(Domparams[3])
            else:
                sDom_MicSlope_Coeffs.append(Domparams[3])

            sEven_MicSlope_pVals.append(Evenpvals[3])
            if Evenpvals[3] > 0.05:
                sEven_MicSlope_Coeffs.append(Evenparams[3])
            else:
                sEven_MicSlope_Coeffs.append(Evenparams[3])

            sRareR2List.append(Rarer2)
            sRarepFList.append(RareFpval)
            sRichR2List.append(Richr2)
            sRichpFList.append(RichFpval)
            sDomR2List.append(Domr2)
            sDompFList.append(DomFpval)
            sEvenR2List.append(Evenr2)
            sEvenpFList.append(EvenFpval)

            # TESTS OF LINEAR REGRESSION ASSUMPTIONS
            # Error in predictor variables is negligible...Presumably Yes
            # Variables are measured at the continuous level...Definitely Yes

            # TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY MODELED AS LINEAR
            #HC = smd.linear_harvey_collier(RarityResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sRarepLinListHC.append(HC)
            #HC = smd.linear_harvey_collier(DomResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sDompLinListHC.append(HC)
            #HC = smd.linear_harvey_collier(EvenResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            #sEvenpLinListHC.append(HC)

            RB = smd.linear_rainbow(
                RarityResults
            )  # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sRarepLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(
                RichnessResults
            )  # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sRichpLinListRainB.append(RB[1])

            RB = smd.linear_rainbow(
                DomResults
            )  # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sDompLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(
                EvenResults
            )  # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear.
            sEvenpLinListRainB.append(RB[1])

            LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog
                               )  # Lagrangian multiplier test for linearity
            sRarepLinListLM.append(LM[1])
            LM = smd.linear_lm(RichnessResults.resid,
                               RichnessResults.model.exog
                               )  # Lagrangian multiplier test for linearity
            sRichpLinListLM.append(LM[1])

            LM = smd.linear_lm(DomResults.resid, DomResults.model.exog
                               )  # Lagrangian multiplier test for linearity
            sDompLinListLM.append(LM[1])
            LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog
                               )  # Lagrangian multiplier test for linearity
            sEvenpLinListLM.append(LM[1])

            # INDEPENDENCE OF OBSERVATIONS (no serial correlation in residuals)
            BGtest = smd.acorr_breush_godfrey(
                RarityResults, nlags=None, store=False
            )  # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
            # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(RareResids, lags=None, boxpierce=True)
            sRarepCorrListBG.append(BGtest[1])
            sRarepCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(
                RichnessResults, nlags=None, store=False
            )  # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
            # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(RichResids, lags=None, boxpierce=True)
            sRichpCorrListBG.append(BGtest[1])
            sRichpCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(
                DomResults, nlags=None, store=False
            )  # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
            # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(DomResids, lags=None, boxpierce=True)
            sDompCorrListBG.append(BGtest[1])
            sDompCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breush_godfrey(
                EvenResults, nlags=None, store=False
            )  # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation
            # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test
            #BGtest = smd.acorr_ljungbox(EvenResids, lags=None, boxpierce=True)
            sEvenpCorrListBG.append(BGtest[1])
            sEvenpCorrListF.append(BGtest[3])

            # There are no significant outliers...Need tests or measures/metrics

            # HOMOSCEDASTICITY

            # These tests return:
            # 1. lagrange multiplier statistic,
            # 2. p-value of lagrange multiplier test,
            # 3. f-statistic of the hypothesis that the error variance does not depend on x,
            # 4. p-value for the f-statistic

            HW = sms.het_white(RareResids, RarityResults.model.exog)
            sRarepHomoHW.append(HW[3])
            HW = sms.het_white(RichResids, RichnessResults.model.exog)
            sRichpHomoHW.append(HW[3])

            HW = sms.het_white(DomResids, DomResults.model.exog)
            sDompHomoHW.append(HW[3])
            HW = sms.het_white(EvenResids, EvenResults.model.exog)
            sEvenpHomoHW.append(HW[3])

            HB = sms.het_breushpagan(RareResids, RarityResults.model.exog)
            sRarepHomoHB.append(HB[3])
            HB = sms.het_breushpagan(RichResids, RichnessResults.model.exog)
            sRichpHomoHB.append(HB[3])

            HB = sms.het_breushpagan(DomResids, DomResults.model.exog)
            sDompHomoHB.append(HB[3])
            HB = sms.het_breushpagan(EvenResids, EvenResults.model.exog)
            sEvenpHomoHB.append(HB[3])

            # 7. NORMALITY OF ERROR TERMS
            O = sms.omni_normtest(RareResids)
            sRarepNormListOmni.append(O[1])
            O = sms.omni_normtest(RichResids)
            sRichpNormListOmni.append(O[1])
            O = sms.omni_normtest(DomResids)
            sDompNormListOmni.append(O[1])
            O = sms.omni_normtest(EvenResids)
            sEvenpNormListOmni.append(O[1])

            JB = sms.jarque_bera(RareResids)
            sRarepNormListJB.append(
                JB[1]
            )  # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(RichResids)
            sRichpNormListJB.append(
                JB[1]
            )  # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(DomResids)
            sDompNormListJB.append(
                JB[1]
            )  # Calculate residual skewness, kurtosis, and do the JB test for normality
            JB = sms.jarque_bera(EvenResids)
            sEvenpNormListJB.append(
                JB[1]
            )  # Calculate residual skewness, kurtosis, and do the JB test for normality

            KS = smd.kstest_normal(RareResids)
            sRarepNormListKS.append(
                KS[1]
            )  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(RichResids)
            sRichpNormListKS.append(
                KS[1]
            )  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(DomResids)
            sDompNormListKS.append(
                KS[1]
            )  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(EvenResids)
            sEvenpNormListKS.append(
                KS[1]
            )  # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance

            AD = smd.normal_ad(RareResids)
            sRarepNormListAD.append(
                AD[1]
            )  # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(RichResids)
            sRichpNormListAD.append(
                AD[1]
            )  # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(DomResids)
            sDompNormListAD.append(
                AD[1]
            )  # Anderson-Darling test for normal distribution unknown mean and variance
            AD = smd.normal_ad(EvenResids)
            sEvenpNormListAD.append(
                AD[1]
            )  # Anderson-Darling test for normal distribution unknown mean and variance

            print 'Sample size:', SampSize, 'iteration:', iteration

        NLIST.append(SampSize)

        Rare_MacIntercept_pVals.append(np.mean(
            sRare_MacIntercept_pVals))  # List to hold coefficient p-values
        Rare_MacIntercept_Coeffs.append(
            np.mean(sRare_MacIntercept_Coeffs))  # List to hold coefficients

        Rich_MacIntercept_pVals.append(np.mean(
            sRich_MacIntercept_pVals))  # List to hold coefficient p-values
        Rich_MacIntercept_Coeffs.append(
            np.mean(sRich_MacIntercept_Coeffs))  # List to hold coefficients

        Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals))
        Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs))

        Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals))
        Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs))

        Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals))
        Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs))

        Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals))
        Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs))

        Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals))
        Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs))

        Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals))
        Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs))

        Rare_MacSlope_pVals.append(
            np.mean(sRare_MacSlope_pVals))  # List to hold coefficient p-values
        Rare_MacSlope_Coeffs.append(
            np.mean(sRare_MacSlope_Coeffs))  # List to hold coefficients

        Rich_MacSlope_pVals.append(
            np.mean(sRich_MacSlope_pVals))  # List to hold coefficient p-values
        Rich_MacSlope_Coeffs.append(
            np.mean(sRich_MacSlope_Coeffs))  # List to hold coefficients

        Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals))
        Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs))

        Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals))
        Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs))

        Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals))
        Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs))

        Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals))
        Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs))

        Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals))
        Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs))

        Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals))
        Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs))

        RareR2List.append(np.mean(sRareR2List))
        RarepFList.append(np.mean(sRarepFList))
        RichR2List.append(np.mean(sRichR2List))
        RichpFList.append(np.mean(sRichpFList))
        DomR2List.append(np.mean(sDomR2List))
        DompFList.append(np.mean(sDompFList))
        EvenR2List.append(np.mean(sEvenR2List))
        EvenpFList.append(np.mean(sEvenpFList))

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes

        # 3. The relationship is linear
        #RarepLinListHC.append(np.mean(sRarepLinListHC))
        RarepLinListRainB.append(np.mean(sRarepLinListRainB))
        RarepLinListLM.append(np.mean(sRarepLinListLM))
        #RichpLinListHC.append(np.mean(sRichpLinListHC))
        RichpLinListRainB.append(np.mean(sRichpLinListRainB))
        RichpLinListLM.append(np.mean(sRichpLinListLM))
        #DompLinListHC.append(np.mean(sDompLinListHC))
        DompLinListRainB.append(np.mean(sDompLinListRainB))
        DompLinListLM.append(np.mean(sDompLinListLM))
        #EvenpLinListHC.append(np.mean(sEvenpLinListHC))
        EvenpLinListRainB.append(np.mean(sEvenpLinListRainB))
        EvenpLinListLM.append(np.mean(sEvenpLinListLM))

        # 4. There are no significant outliers...need to find tests or measures

        # 5. Independence of observations (no serial correlation in residuals)
        RarepCorrListBG.append(np.mean(sRarepCorrListBG))
        RarepCorrListF.append(np.mean(sRarepCorrListF))
        RichpCorrListBG.append(np.mean(sRichpCorrListBG))
        RichpCorrListF.append(np.mean(sRichpCorrListF))
        DompCorrListBG.append(np.mean(sDompCorrListBG))
        DompCorrListF.append(np.mean(sDompCorrListF))
        EvenpCorrListBG.append(np.mean(sEvenpCorrListBG))
        EvenpCorrListF.append(np.mean(sEvenpCorrListF))

        # 6. Homoscedacticity
        RarepHomoHW.append(np.mean(sRarepHomoHW))
        RarepHomoHB.append(np.mean(sRarepHomoHB))
        RichpHomoHB.append(np.mean(sRichpHomoHB))
        RichpHomoHW.append(np.mean(sRichpHomoHW))
        DompHomoHW.append(np.mean(sDompHomoHW))
        DompHomoHB.append(np.mean(sDompHomoHB))
        EvenpHomoHW.append(np.mean(sEvenpHomoHW))
        EvenpHomoHB.append(np.mean(sEvenpHomoHB))

        # 7. Normally distributed residuals (errors)
        RarepNormListOmni.append(np.mean(sRarepNormListOmni))
        RarepNormListJB.append(np.mean(sRarepNormListJB))
        RarepNormListKS.append(np.mean(sRarepNormListKS))
        RarepNormListAD.append(np.mean(sRarepNormListAD))

        RichpNormListOmni.append(np.mean(sRichpNormListOmni))
        RichpNormListJB.append(np.mean(sRichpNormListJB))
        RichpNormListKS.append(np.mean(sRichpNormListKS))
        RichpNormListAD.append(np.mean(sRichpNormListAD))

        DompNormListOmni.append(np.mean(sDompNormListOmni))
        DompNormListJB.append(np.mean(sDompNormListJB))
        DompNormListKS.append(np.mean(sDompNormListKS))
        DompNormListAD.append(np.mean(sDompNormListAD))

        EvenpNormListOmni.append(np.mean(sEvenpNormListOmni))
        EvenpNormListJB.append(np.mean(sEvenpNormListJB))
        EvenpNormListKS.append(np.mean(sEvenpNormListKS))
        EvenpNormListAD.append(np.mean(sEvenpNormListAD))

    fig.add_subplot(4, 3, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0, 1)
    plt.xscale('log')
    # Rarity    R2 vs. Sample Size
    plt.plot(NLIST, RareR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 2)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.16)
    # Rarity    Coeffs vs. Sample Size
    plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rare_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 3)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0.0, 0.6)
    plt.xscale('log')
    # Rarity    p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(RarepLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST,RarepLinListRainB,  c='m')
    plt.plot(NLIST, RarepLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST,RarepCorrListBG,  c='c')
    plt.plot(NLIST, RarepCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST, RarepHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST,RarepHomoHB,  c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, RarepNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST,RarepNormListJB,  c='Lime', ls='-')
    #plt.plot(NLIST,RarepNormListKS,  c='Lime', ls='--', lw=3)
    #plt.plot(NLIST,RarepNormListAD,  c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')

    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 4)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Dominance     R2 vs. Sample Size
    plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16)

    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 5)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Dominance     Coeffs vs. Sample Size
    plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Dom_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')

    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 6)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    #plt.yscale('log')
    plt.ylim(0, 0.6)
    # Dominance     p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(DompLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST, DompLinListRainB, c='m')
    plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, DompCorrListBG, c='c')
    plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, DompHomoHB, c='r',ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, DompNormListJB, c='Lime', ls='-')
    #plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3)
    #plt.plot(NLIST, DompNormListAD, c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 7)
    plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Evenness      R2 vs. Sample Size
    plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 8)
    plt.ylim(-0.25, 0.0)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Evenness      Coeffs vs. Sample Size
    plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Even_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 9)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.3)
    # Evenness      p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(EvenpLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST, EvenpLinListRainB, c='m')
    plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, EvenpCorrListBG, c='c')
    plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, EvenpHomoHB, c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-')
    #plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3)
    #plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 10)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Dominance     R2 vs. Sample Size
    plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.xlabel('Sample size', fontsize=14)
    plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16)

    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 11)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Richness    Coeffs vs. Sample Size
    plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rich_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    plt.xlabel('Sample size', fontsize=14)

    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 12)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    # Richness    p-vals vs. Sample Size

    # 3. The relationship is linear
    #plt.plot(RichpLinListHC, NLIST, c='m', alpha=0.8)
    #plt.plot(NLIST,RichpLinListRainB,  c='m')
    plt.plot(NLIST, RichpLinListLM, c='m', ls='-', label='linearity')

    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST,RichpCorrListBG,  c='c')
    plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation')

    # 6. Homoscedacticity
    plt.plot(NLIST, RichpHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST,RichpHomoHB,  c='r', ls='-')

    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, RichpNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST,RichpNormListJB,  c='Lime', ls='-')
    #plt.plot(NLIST,RichpNormListKS,  c='Lime', ls='--', lw=3)
    #plt.plot(NLIST,RichpNormListAD,  c='Lime', ls='--')

    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)
    #plt.tick_params(axis='both', which='major', labelsize=fs-3)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.savefig(mydir + 'figs/appendix/SampleSize/SampleSizeEffects.png',
                dpi=600,
                bbox_inches="tight")
    #plt.close()
    #plt.show()

    return
Example #12
0
def check_normality(df, variable, alpha=0.05):
    """Tell whether one column of ``df`` passes a normality test.

    The values in ``df[variable]`` are handed to ``kstest_normal`` and the
    p-value it returns is compared against ``alpha``.

    Args:
        df: Container indexable by ``variable`` (presumably a pandas
            DataFrame -- confirm against callers).
        variable: Key selecting the data to test.
        alpha: Significance threshold; defaults to 0.05.

    Returns:
        True when the p-value is strictly greater than ``alpha`` (normality
        is not rejected), False otherwise.
    """
    # Open question from the original author: rely on the KS test alone, or
    # also run an Anderson-Darling test?
    # The test statistic is not needed here; only the p-value is used.
    _, p_value = kstest_normal(df[variable])
    return bool(p_value > alpha)