def normality_test(y): # Realiza un test de normalidad de Kolmogorov-Smirnnof sobre la variable endógena y muestra una gráfica de # distribución # Parámetros: # y (Series): Objeto Series de pandas con la variable endógena. # Devuelve: # Nada. normality_test = kstest_normal(y, dist='norm') stats = ['KS Stat', 'KS-Test p-value'] print("---Test de Kolmogorov-Smirnnof---") print(dict(zip(stats, normality_test))) if (normality_test[1] > 0.05): print( "Aceptamos la hipótesis nula, ergo la variable se distribuye según una normal." ) else: print( "Fallamos al aceptar la hipótesis nula, ergo la variable no se distribuye según una normal." ) sns.distplot(y) plt.show()
def check_normality(): '''Check if the distribution is normal.''' # Generate and show a distribution numData = 100 # To get reproducable values, I provide a seed value np.random.seed(987654321) data = stats.norm.rvs(myMean, mySD, size=numData) plt.hist(data) plt.show() # --- >>> START stats <<< --- # Graphical test: if the data lie on a line, they are pretty much # normally distributed _ = stats.probplot(data, plot=plt) plt.show() pVals = pd.Series() # The scipy normaltest is based on D-Agostino and Pearsons test that # combines skew and kurtosis to produce an omnibus test of normality. _, pVals['omnibus'] = stats.normaltest(data) # Shapiro-Wilk test _, pVals['Shapiro-Wilk'] = stats.shapiro(data) # Or you can check for normality with Lilliefors-test ksStats, pVals['Lilliefors'] = kstest_normal(data) # Alternatively with original Kolmogorov-Smirnov test _, pVals['KS'] = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm') print(pVals) if pVals['omnibus'] > 0.05: print('Data are normally distributed')
def kstestnormal (spike_features): """ Ranks features using a Lilliefors test for normality Parameters ---------- spike_features : ndarray The spike features Returns ------- index_features_sorted : ndarray The indices of selected features in order of decreasing KS test statistic """ num_total_features = np.shape(spike_features)[1] from statsmodels.stats.diagnostic import kstest_normal ksD = np.zeros(num_total_features) ksp = np.zeros(num_total_features) for f in range(num_total_features): ksD[f], ksp[f] = kstest_normal(spike_features[:,f], pvalmethod='approx') index_features_sorted = heapq.nlargest(num_total_features, range(len(ksD)), ksD.take) return index_features_sorted
def main(): procs = int(input("Напишите количество процессов, которые вы хотите использовать для обработки данных: ")) ests = int(input("Напишите количество выборок, которое вы хотите сгенерировать для проведения кластеризации: ")) iris, _ = fisher_iris() start_time = time.time() iris = np.hstack((np.arange(150).reshape((150, 1)), iris)) iris_samples = iris_sep(iris, procs) multiprocessing.set_start_method('fork') queue = multiprocessing.Queue() processes = [] for proc in range(procs): processes.append(multiprocessing.Process(target=make_all, args=(iris_samples[proc], queue, ests, procs))) for process in processes: process.start() total_samples = [] for _ in range(procs): total_samples.append(queue.get()) for process in processes: process.join() # Завершена работа процессов, последующее объединение подвыборок в выборку print('Завершение работы процессов') total_data = get_total_sample_data(procs, ests, total_samples) silhouette_list = get_silhouette_list(total_data) print('Список оценённых коэффициентов силуэта') sil_time_start = time.time() print(silhouette_list) print('Время кластеризации и расчёта коэффициентов силуэта для {0} процессов составило: {1}'.format(ests, sil_time_start - time.time())) print('Оценка нормальности распределения коэффициентов силуэта') print('Тест Харке-Бера') print(jarque_bera(silhouette_list)) PrettyTable print('Тест Колмогорова-Смирнова') print(kstest_normal(silhouette_list, dist='norm', pvalmethod='approx')) # Объединение выборок из каждого потока, расчёт print("%s seconds" % (time.time() - start_time))
def check_2samples(pd_serie1, pd_serie2, title=None): # pd_serie1 = sag['potencia_ls_l1'] # pd_serie2 = sag['potencia_ls_l2'] ## check normalidad pd_serie1.hist(bins=50) plt.title(title) plt.show() pd_serie2.hist(bins=50) plt.title(title) plt.show() #qqplot from statsmodels.graphics.gofplots import qqplot qqplot(pd_serie1, line='r') plt.title('qqplot pd_serie1') plt.show() qqplot(pd_serie2, line='r') plt.title('qqplot pd_serie2') plt.show() # tests results from scipy.stats import shapiro from statsmodels.stats.diagnostic import kstest_normal print('results normal tests') print('pd_serie1') print('shapiro p-value', shapiro(pd_serie1)[1]) print('KS p-value', kstest_normal(pd_serie1, dist='norm')[1]) print() print('pd_serie2') print('shapiro p-value', shapiro(pd_serie2)[1]) print('KS p-value', kstest_normal(pd_serie2, dist='norm')[1]) print() ## chequeo independencia ## correlations from scipy.stats import pearsonr from scipy.stats import spearmanr print('') print('Pearson Correlation:', pearsonr(pd_serie1, pd_serie2)[0]) print('Spearman Correlation:', spearmanr(pd_serie1, pd_serie2)[0]) plt.plot(pd_serie1, pd_serie2, '.') plt.title('distribucion ')
def test_residuals_normality(methods): threshold_p = 0.05 print("> Kolmogorov-Smirnov test") for method in methods: ksstat, pvalue = kstest_normal(method.residuals, dist='norm', pvalmethod='table') print( f"{method.name}: ks={ksstat}, p={pvalue}. {'❌' if pvalue < threshold_p else '✅'}." )
def check_normal(pd_serie, bins=None): # pd_serie = pd.Series(mod1.resid_pearson) # pd_serie = sag_l1['bwi_ls_l1'] name = pd_serie.name if isinstance(name, (str)) != True: name = '' ## check normalidad sns.distplot(pd_serie, bins=bins) if name != '': plt.title(name) plt.show() #qqplot from statsmodels.graphics.gofplots import qqplot # import matplotlib.pyplot as plt qqplot(pd_serie, line='r') plt.title('qqplot ' + name) plt.show() # tests results from scipy.stats import shapiro from statsmodels.stats.diagnostic import kstest_normal shapiro_scipystats = shapiro(pd_serie) ktest_statsmodelsstatsdiagnostic = kstest_normal(pd_serie, dist='norm') print('results normal tests', name) print('shapiro p-value', shapiro_scipystats[1]) print('KS p-value', ktest_statsmodelsstatsdiagnostic[1]) print() print( 'Test used scipy.stats.shapiro and statsmodels.stats.diagnostic.ktest_normal' ) print() return { 'shapiro': shapiro_scipystats, 'ktest': ktest_statsmodelsstatsdiagnostic }
def check_normal(pd_serie, bins=None): name = pd_serie.name if isinstance(name, (str)) != True: name = '' shapiro_scipystats = shapiro(pd_serie) ktest_statsmodelsstatsdiagnostic = kstest_normal(pd_serie, dist='norm') print('results normal tests', name) print('shapiro p-value', shapiro_scipystats[1]) print('KS p-value', ktest_statsmodelsstatsdiagnostic[1]) print() print( 'Test used scipy.stats.shapiro and statsmodels.stats.diagnostic.ktest_normal' ) print() return { 'shapiro': shapiro_scipystats, 'ktest': ktest_statsmodelsstatsdiagnostic }
def entender_2017(df_2017): style.use('ggplot') altura = df_2017['Altura'] print('Media de la estatura de los runningbacks de la temporada 2017: ') print(altura.loc[(df_2017['Status']=='ACT') & (df_2017['Yardas'] > 100)].mean()) print('Mediana de la estatura de los runningbacks de la temporada 2017: ') print(altura.loc[(df_2017['Status']=='ACT') & (df_2017['Yardas'] > 100)].median()) espacio = ' ' fig = plt.figure() ax = fig.add_subplot(111) n, bins, patches = ax.hist(altura, bins = 9, color = 'orange', normed = True) x_min, x_max = min(bins), max(bins) lnspc = np.linspace(x_min, x_max, len(altura)) mu, sigma = stats.norm.fit(altura) pdf = stats.norm.pdf(lnspc, mu, sigma) ax.plot(lnspc, pdf, color = 'dodgerblue') fig.subplots_adjust(bottom = 0.10) ax.text(x = 165, y = -0.008, s = ' ©Mauricio Mani' + espacio*150 + 'Source: NFL: www.nfl.com/players ', fontsize = 12, color = '#f0f0f0', backgroundcolor = 'grey') ax.text(x = 168, y = 0.087, s = "Estatura de runningbacks activos del 2017 - NFL ", fontsize = 23, weight = 'bold', alpha = .75) ax.text(x = 167, y = 0.083, s = 'Histograma de la estatura de corredores activos comparados con una distribución normal', fontsize = 16, alpha = .85) fig = plt.figure() ax = fig.add_subplot(111) stats.probplot(altura, plot=plt) ax.set_title('Normal Q-Q plot') plt.show() #Prueba kolmogorov-smirnov para normailidad (bondad de ajuste), similar a Lilliefors, pero con distribución KS. ks, p_v = kstest_normal(altura) print('El valor de la prueba Kolmogorov-Smirnov es: ' + str(ks)) print('El valor p de la prueba es: ' + str(p_v)) fig = plt.figure() ax = fig.add_subplot(111) ax.hist(altura, bins = 9, cumulative=True, normed=True, histtype = 'step', linewidth=1.4) cdf = stats.norm.cdf(lnspc, mu, sigma) ax.plot(lnspc, cdf, color = 'dodgerblue') plt.xlim(xmin=x_min, xmax = x_max) fig.subplots_adjust(bottom = 0.10) ax.text(x = 166, y = -0.10, s = ' ©Mauricio Mani' + espacio*130 + 'Source: NFL: www.nfl.com/players ', fontsize = 12, color = '#f0f0f0', backgroundcolor = 'grey') ax.text(x = 167, y = 1.12, s = "Distribución acumulada de la estatura de corredores 2017 - NFL", fontsize = 21, weight = 'bold', alpha = .75) ax.text(x = 170, y = 1.05, s = 'Para uso de la prueba de normalidad Kolmogorov-Smirnov y Lilliefors.', fontsize = 16, alpha = .85) ax.text(x = 169.5, y = 0.8, s = 'Prueba Kolmogorov-Smirnov: ' + str(ks), fontsize = 10) ax.text(x = 169.5, y = 0.75, s = 'P-value: ' + str(p_v), fontsize = 10) sw, sw_pv = stats.shapiro(altura) print('El valor de la prueba Shapiro-Wilk es: ' + str(sw)) print('El valor p de la prueba es: ' + str(sw_pv)) plt.show() fig = plt.figure() ax = fig.add_subplot(111) ax.boxplot(df_2017['Altura'].loc[(df_2017['Status']=='ACT') & (df_2017['Yardas'] > 800)]) print('Estadisticas básicas: ') print('Media: ') print(df_2017['Altura'].loc[(df_2017['Status']=='ACT') & (df_2017['Yardas'] > 800)].mean()) print('Meidana: ') print(df_2017['Altura'].loc[(df_2017['Status']=='ACT') & (df_2017['Yardas'] > 800)].median()) print('Moda: ') print(df_2017['Altura'].loc[(df_2017['Status']=='ACT') & (df_2017['Yardas'] > 800)].mode()) #La mediana y la moda son muy similares. #El boxplot tamnien nos sirve para revisar la normalidad de un dataset. plt.xticks([1], ['Mas de 800 yardas']) fig.subplots_adjust(bottom = 0.10) ax.text(x = 0.5, y = 169.5, s = ' ©Mauricio Mani' + espacio*120 + 'Source: NFL: www.nfl.com/players ', fontsize = 12, color = '#f0f0f0', backgroundcolor = 'grey') ax.text(x = 0.5, y = 195, s = "Entender como funcionan los diagramas de caja - Boxplot", fontsize = 22, weight = 'bold', alpha = .75) ax.text(x = 0.5, y = 193, s = 'Con un ejemplo sencillo de la National Football League con la estatura de los corredores con mas\nde 800 yardas en la temporada 2017.', fontsize = 15, alpha = .85) ax.annotate(s = 'Esta es la mediana\n"NO es la media"', xy =(1.075, 180), xytext = (1.15, 180), arrowprops=dict(facecolor='red', color='red')) ax.annotate(s = 'Q1 o 25% de los datos', xy =(1.075, 177), xytext = (1.15, 177), arrowprops=dict(facecolor='red', color='red')) ax.annotate(s = 'Q3 o 75% de los datos', xy =(0.925, 182.6), xytext = (0.72, 182.6), arrowprops=dict(facecolor='red', color='red')) ax.annotate(s = 'Q3 + 1.5 * Rango Intercuatil', xy =(1.04, 185.9), xytext = (1.1, 185.9), arrowprops=dict(facecolor='red', color='red')) ax.annotate(s = 'Q1 - 1.5 * Rango Intercuartil', xy =(0.96, 172.5), xytext = (0.7, 172.5), arrowprops=dict(facecolor='red', color='red')) ax.annotate(s = 'Latavius Murray\nOutlier', xy =(1, 191.8), xytext = (0.85, 189), arrowprops=dict(facecolor='red', color='red')) return(df_2017.loc[(df_2017['Status']=='ACT') & (df_2017['Yardas'] > 800)])
def Fig_OLS_Checks(): #fs = 10 # font size used across figures #color = str() #OrC = 'open' SampSizes = [5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100] Iterations = 100 fig = plt.figure(figsize=(12, 8)) # MODEL PARAMETERS Rare_MacIntercept_pVals = [] # List to hold coefficient p-values Rare_MacIntercept_Coeffs = [] # List to hold coefficients Rich_MacIntercept_pVals = [] Rich_MacIntercept_Coeffs = [] Dom_MacIntercept_pVals = [] Dom_MacIntercept_Coeffs = [] Even_MacIntercept_pVals = [] Even_MacIntercept_Coeffs = [] Rare_MicIntercept_pVals = [] Rare_MicIntercept_Coeffs = [] Rich_MicIntercept_pVals = [] Rich_MicIntercept_Coeffs = [] Dom_MicIntercept_pVals = [] Dom_MicIntercept_Coeffs = [] Even_MicIntercept_pVals = [] Even_MicIntercept_Coeffs = [] Rare_MacSlope_pVals = [] Rare_MacSlope_Coeffs = [] Rich_MacSlope_pVals = [] Rich_MacSlope_Coeffs = [] Dom_MacSlope_pVals = [] Dom_MacSlope_Coeffs = [] Even_MacSlope_pVals = [] Even_MacSlope_Coeffs = [] Rare_MicSlope_pVals = [] Rare_MicSlope_Coeffs = [] Rich_MicSlope_pVals = [] Rich_MicSlope_Coeffs = [] Dom_MicSlope_pVals = [] Dom_MicSlope_Coeffs = [] Even_MicSlope_pVals = [] Even_MicSlope_Coeffs = [] RareR2List = [] # List to hold model R2 RarepFList = [] # List to hold significance of model R2 RichR2List = [] # List to hold model R2 RichpFList = [] # List to hold significance of model R2 DomR2List = [] # List to hold model R2 DompFList = [] # List to hold significance of model R2 EvenR2List = [] # List to hold model R2 EvenpFList = [] # List to hold significance of model R2 # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. The relationship is linear #RarepLinListHC = [] RarepLinListRainB = [] RarepLinListLM = [] #RichpLinListHC = [] RichpLinListRainB = [] RichpLinListLM = [] #DompLinListHC = [] DompLinListRainB = [] DompLinListLM = [] #EvenpLinListHC = [] EvenpLinListRainB = [] EvenpLinListLM = [] # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) RarepCorrListBG = [] RarepCorrListF = [] RichpCorrListBG = [] RichpCorrListF = [] DompCorrListBG = [] DompCorrListF = [] EvenpCorrListBG = [] EvenpCorrListF = [] # 6. Homoscedacticity RarepHomoHW = [] RarepHomoHB = [] RichpHomoHW = [] RichpHomoHB = [] DompHomoHW = [] DompHomoHB = [] EvenpHomoHW = [] EvenpHomoHB = [] # 7. Normally distributed residuals (errors) RarepNormListOmni = [] # Omnibus test for normality RarepNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality RarepNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance RarepNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance RichpNormListOmni = [] # Omnibus test for normality RichpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality RichpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance RichpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance DompNormListOmni = [] # Omnibus test for normality DompNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality DompNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance DompNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance EvenpNormListOmni = [] # Omnibus test for normality EvenpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality EvenpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance EvenpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance NLIST = [] for SampSize in SampSizes: sRare_MacIntercept_pVals = [] # List to hold coefficient p-values sRare_MacIntercept_Coeffs = [] # List to hold coefficients sRich_MacIntercept_pVals = [] # List to hold coefficient p-values sRich_MacIntercept_Coeffs = [] # List to hold coefficients sDom_MacIntercept_pVals = [] sDom_MacIntercept_Coeffs = [] sEven_MacIntercept_pVals = [] sEven_MacIntercept_Coeffs = [] sRare_MicIntercept_pVals = [] sRare_MicIntercept_Coeffs = [] sRich_MicIntercept_pVals = [] sRich_MicIntercept_Coeffs = [] sDom_MicIntercept_pVals = [] sDom_MicIntercept_Coeffs = [] sEven_MicIntercept_pVals = [] sEven_MicIntercept_Coeffs = [] sRare_MacSlope_pVals = [] sRare_MacSlope_Coeffs = [] sRich_MacSlope_pVals = [] sRich_MacSlope_Coeffs = [] sDom_MacSlope_pVals = [] sDom_MacSlope_Coeffs = [] sEven_MacSlope_pVals = [] sEven_MacSlope_Coeffs = [] sRare_MicSlope_pVals = [] sRare_MicSlope_Coeffs = [] sRich_MicSlope_pVals = [] sRich_MicSlope_Coeffs = [] sDom_MicSlope_pVals = [] sDom_MicSlope_Coeffs = [] sEven_MicSlope_pVals = [] sEven_MicSlope_Coeffs = [] sRareR2List = [] # List to hold model R2 sRarepFList = [] # List to hold significance of model R2 sRichR2List = [] # List to hold model R2 sRichpFList = [] # List to hold significance of model R2 sDomR2List = [] # List to hold model R2 sDompFList = [] # List to hold significance of model R2 sEvenR2List = [] # List to hold model R2 sEvenpFList = [] # List to hold significance of model R2 # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. The relationship is linear #sRarepLinListHC = [] sRarepLinListRainB = [] sRarepLinListLM = [] #sRichpLinListHC = [] sRichpLinListRainB = [] sRichpLinListLM = [] #sDompLinListHC = [] sDompLinListRainB = [] sDompLinListLM = [] #sEvenpLinListHC = [] sEvenpLinListRainB = [] sEvenpLinListLM = [] # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) sRarepCorrListBG = [] sRarepCorrListF = [] sRichpCorrListBG = [] sRichpCorrListF = [] sDompCorrListBG = [] sDompCorrListF = [] sEvenpCorrListBG = [] sEvenpCorrListF = [] # 6. Homoscedacticity sRarepHomoHW = [] sRarepHomoHB = [] sRichpHomoHW = [] sRichpHomoHB = [] sDompHomoHW = [] sDompHomoHB = [] sEvenpHomoHW = [] sEvenpHomoHB = [] # 7. Normally distributed residuals (errors) sRarepNormListOmni = [] # Omnibus test for normality sRarepNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality sRarepNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sRarepNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance sRichpNormListOmni = [] # Omnibus test for normality sRichpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality sRichpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sRichpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance sDompNormListOmni = [] # Omnibus test for normality sDompNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality sDompNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sDompNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance sEvenpNormListOmni = [] # Omnibus test for normality sEvenpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality sEvenpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sEvenpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance for iteration in range(Iterations): Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [[], [], [], [], [], [], []] klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [[], [], [], [], [], [], []] NmaxList, rareSkews, KindList = [[], [], []] NSlist = [] ct = 0 radDATA = [] datasets = [] GoodNames = ['EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA'] # all microbe data is MGRAST mlist = ['micro', 'macro'] for m in mlist: for name in os.listdir(mydir +'data/'+m): if name in GoodNames: pass else: continue path = mydir+'data/'+m+'/'+name+'/'+name+'-SADMetricData.txt' num_lines = sum(1 for line in open(path)) datasets.append([name, m, num_lines]) numMac = 0 numMic = 0 radDATA = [] for d in datasets: name, kind, numlines = d lines = [] lines = np.random.choice(range(1, numlines+1), SampSize, replace=True) path = mydir+'data/'+kind+'/'+name+'/'+name+'-SADMetricData.txt' for line in lines: data = linecache.getline(path, line) radDATA.append(data) #print name, kind, numlines, len(radDATA) for data in radDATA: data = data.split() if len(data) == 0: print 'no data' continue name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data N = float(N) S = float(S) Nlist.append(float(np.log(N))) Slist.append(float(np.log(S))) NSlist.append(float(np.log(N/S))) Evarlist.append(float(np.log(float(Evar)))) ESimplist.append(float(np.log(float(ESimp)))) KindList.append(kind) BPlist.append(float(BP)) NmaxList.append(float(np.log(float(BP)*float(N)))) EHeiplist.append(float(EHeip)) # lines for the log-modulo transformation of skewnness skew = float(skew) sign = 1 if skew < 0: sign = -1 lms = np.log(np.abs(skew) + 1) lms = lms * sign #if lms > 3: print name, N, S rareSkews.append(float(lms)) if kind == 'macro': numMac += 1 elif kind == 'micro': numMic += 1 ct+=1 #print 'Sample Size:',SampSize, ' Mic:', numMic,'Mac:', numMac # Multiple regression for Rarity d = pd.DataFrame({'N': list(Nlist)}) d['Rarity'] = list(rareSkews) d['Kind'] = list(KindList) RarityResults = smf.ols('Rarity ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RarityResults.summary(), '\n' # Multiple regression for Rarity d = pd.DataFrame({'N': list(Nlist)}) d['Richness'] = list(Slist) d['Kind'] = list(KindList) RichnessResults = smf.ols('Richness ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RichnessResults.summary(), '\n' # Multiple regression for Dominance d = pd.DataFrame({'N': list(Nlist)}) d['Dominance'] = list(NmaxList) d['Kind'] = list(KindList) DomResults = smf.ols('Dominance ~ N * Kind', d).fit() # Fit the dummy variable regression model #print DomResults.summary(), '\n' # Multiple regression for Evenness d = pd.DataFrame({'N': list(Nlist)}) d['Evenness'] = list(ESimplist) d['Kind'] = list(KindList) EvenResults = smf.ols('Evenness ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RarityResults.summary(), '\n' RareResids = RarityResults.resid # residuals of the model RichResids = RichnessResults.resid # residuals of the model DomResids = DomResults.resid # residuals of the model EvenResids = EvenResults.resid # residuals of the model # MODEL RESULTS/FIT RareFpval = RarityResults.f_pvalue Rarer2 = RarityResults.rsquared # coefficient of determination #Adj_r2 = RareResults.rsquared_adj # adjusted RichFpval = RichnessResults.f_pvalue Richr2 = RichnessResults.rsquared # coefficient of determination #Adj_r2 = RichnessResults.rsquared_adj # adjusted DomFpval = DomResults.f_pvalue Domr2 = DomResults.rsquared # coefficient of determination #Adj_r2 = DomResults.rsquared_adj # adjusted EvenFpval = EvenResults.f_pvalue Evenr2 = EvenResults.rsquared # coefficient of determination #Adj_r2 = EvenResuls.rsquared_adj # adjusted # MODEL PARAMETERS and p-values Rareparams = RarityResults.params Rareparams = Rareparams.tolist() Rarepvals = RarityResults.pvalues Rarepvals = Rarepvals.tolist() Richparams = RichnessResults.params Richparams = Richparams.tolist() Richpvals = RichnessResults.pvalues Richpvals = Richpvals.tolist() Domparams = DomResults.params Domparams = Domparams.tolist() Dompvals = DomResults.pvalues Dompvals = Dompvals.tolist() Evenparams = EvenResults.params Evenparams = Evenparams.tolist() Evenpvals = EvenResults.pvalues Evenpvals = Evenpvals.tolist() sRare_MacIntercept_pVals.append(Rarepvals[0]) sRare_MacIntercept_Coeffs.append(Rareparams[0]) sRich_MacIntercept_pVals.append(Rarepvals[0]) sRich_MacIntercept_Coeffs.append(Rareparams[0]) sDom_MacIntercept_pVals.append(Dompvals[0]) sDom_MacIntercept_Coeffs.append(Domparams[0]) sEven_MacIntercept_pVals.append(Evenpvals[0]) sEven_MacIntercept_Coeffs.append(Evenparams[0]) sRare_MicIntercept_pVals.append(Rarepvals[1]) if Rarepvals[1] > 0.05: sRare_MicIntercept_Coeffs.append(Rareparams[1]) else: sRare_MicIntercept_Coeffs.append(Rareparams[1]) sRich_MicIntercept_pVals.append(Richpvals[1]) if Richpvals[1] > 0.05: sRich_MicIntercept_Coeffs.append(Richparams[1]) else: sRich_MicIntercept_Coeffs.append(Richparams[1]) sDom_MicIntercept_pVals.append(Dompvals[1]) if Dompvals[1] > 0.05: sDom_MicIntercept_Coeffs.append(Domparams[1]) else: sDom_MicIntercept_Coeffs.append(Domparams[1]) sEven_MicIntercept_pVals.append(Evenpvals[1]) if Evenpvals[1] > 0.05: sEven_MicIntercept_Coeffs.append(Evenparams[1]) else: sEven_MicIntercept_Coeffs.append(Evenparams[1]) sRare_MacSlope_pVals.append(Rarepvals[2]) sRare_MacSlope_Coeffs.append(Rareparams[2]) sRich_MacSlope_pVals.append(Richpvals[2]) sRich_MacSlope_Coeffs.append(Richparams[2]) sDom_MacSlope_pVals.append(Dompvals[2]) sDom_MacSlope_Coeffs.append(Domparams[2]) sEven_MacSlope_pVals.append(Evenpvals[2]) sEven_MacSlope_Coeffs.append(Evenparams[2]) sRare_MicSlope_pVals.append(Rarepvals[3]) if Rarepvals[3] > 0.05: sRare_MicSlope_Coeffs.append(Rareparams[3]) else: sRare_MicSlope_Coeffs.append(Rareparams[3]) sRich_MicSlope_pVals.append(Richpvals[3]) if Richpvals[3] > 0.05: sRich_MicSlope_Coeffs.append(Richparams[3]) else: sRich_MicSlope_Coeffs.append(Richparams[3]) sDom_MicSlope_pVals.append(Dompvals[3]) if Dompvals[3] > 0.05: sDom_MicSlope_Coeffs.append(Domparams[3]) else: sDom_MicSlope_Coeffs.append(Domparams[3]) sEven_MicSlope_pVals.append(Evenpvals[3]) if Evenpvals[3] > 0.05: sEven_MicSlope_Coeffs.append(Evenparams[3]) else: sEven_MicSlope_Coeffs.append(Evenparams[3]) sRareR2List.append(Rarer2) sRarepFList.append(RareFpval) sRichR2List.append(Richr2) sRichpFList.append(RichFpval) sDomR2List.append(Domr2) sDompFList.append(DomFpval) sEvenR2List.append(Evenr2) sEvenpFList.append(EvenFpval) # TESTS OF LINEAR REGRESSION ASSUMPTIONS # Error in predictor variables is negligible...Presumably Yes # Variables are measured at the continuous level...Definitely Yes # TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY MODELED AS LINEAR #HC = smd.linear_harvey_collier(RarityResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sRarepLinListHC.append(HC) #HC = smd.linear_harvey_collier(DomResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sDompLinListHC.append(HC) #HC = smd.linear_harvey_collier(EvenResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sEvenpLinListHC.append(HC) RB = smd.linear_rainbow(RarityResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sRarepLinListRainB.append(RB[1]) RB = smd.linear_rainbow(RichnessResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sRichpLinListRainB.append(RB[1]) RB = smd.linear_rainbow(DomResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sDompLinListRainB.append(RB[1]) RB = smd.linear_rainbow(EvenResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sEvenpLinListRainB.append(RB[1]) LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog) # Lagrangian multiplier test for linearity sRarepLinListLM.append(LM[1]) LM = smd.linear_lm(RichnessResults.resid, RichnessResults.model.exog) # Lagrangian multiplier test for linearity sRichpLinListLM.append(LM[1]) LM = smd.linear_lm(DomResults.resid, DomResults.model.exog) # Lagrangian multiplier test for linearity sDompLinListLM.append(LM[1]) LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog) # Lagrangian multiplier test for linearity sEvenpLinListLM.append(LM[1]) # INDEPENDENCE OF OBSERVATIONS (no serial correlation in residuals) BGtest = smd.acorr_breush_godfrey(RarityResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(RareResids, lags=None, boxpierce=True) sRarepCorrListBG.append(BGtest[1]) sRarepCorrListF.append(BGtest[3]) BGtest = smd.acorr_breush_godfrey(RichnessResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(RichResids, lags=None, boxpierce=True) sRichpCorrListBG.append(BGtest[1]) sRichpCorrListF.append(BGtest[3]) BGtest = smd.acorr_breush_godfrey(DomResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(DomResids, lags=None, boxpierce=True) sDompCorrListBG.append(BGtest[1]) sDompCorrListF.append(BGtest[3]) BGtest = smd.acorr_breush_godfrey(EvenResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(EvenResids, lags=None, boxpierce=True) sEvenpCorrListBG.append(BGtest[1]) sEvenpCorrListF.append(BGtest[3]) # There are no significant outliers...Need tests or measures/metrics # HOMOSCEDASTICITY # These tests return: # 1. lagrange multiplier statistic, # 2. p-value of lagrange multiplier test, # 3. f-statistic of the hypothesis that the error variance does not depend on x, # 4. p-value for the f-statistic HW = sms.het_white(RareResids, RarityResults.model.exog) sRarepHomoHW.append(HW[3]) HW = sms.het_white(RichResids, RichnessResults.model.exog) sRichpHomoHW.append(HW[3]) HW = sms.het_white(DomResids, DomResults.model.exog) sDompHomoHW.append(HW[3]) HW = sms.het_white(EvenResids, EvenResults.model.exog) sEvenpHomoHW.append(HW[3]) HB = sms.het_breushpagan(RareResids, RarityResults.model.exog) sRarepHomoHB.append(HB[3]) HB = sms.het_breushpagan(RichResids, RichnessResults.model.exog) sRichpHomoHB.append(HB[3]) HB = sms.het_breushpagan(DomResids, DomResults.model.exog) sDompHomoHB.append(HB[3]) HB = sms.het_breushpagan(EvenResids, EvenResults.model.exog) sEvenpHomoHB.append(HB[3]) # 7. NORMALITY OF ERROR TERMS O = sms.omni_normtest(RareResids) sRarepNormListOmni.append(O[1]) O = sms.omni_normtest(RichResids) sRichpNormListOmni.append(O[1]) O = sms.omni_normtest(DomResids) sDompNormListOmni.append(O[1]) O = sms.omni_normtest(EvenResids) sEvenpNormListOmni.append(O[1]) JB = sms.jarque_bera(RareResids) sRarepNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(RichResids) sRichpNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(DomResids) sDompNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(EvenResids) sEvenpNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality KS = smd.kstest_normal(RareResids) sRarepNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(RichResids) sRichpNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(DomResids) sDompNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(EvenResids) sEvenpNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance AD = smd.normal_ad(RareResids) sRarepNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(RichResids) sRichpNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(DomResids) sDompNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(EvenResids) sEvenpNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance print 'Sample size:',SampSize, 'iteration:',iteration NLIST.append(SampSize) Rare_MacIntercept_pVals.append(np.mean(sRare_MacIntercept_pVals)) # List to hold coefficient p-values Rare_MacIntercept_Coeffs.append(np.mean(sRare_MacIntercept_Coeffs)) # List to hold coefficients Rich_MacIntercept_pVals.append(np.mean(sRich_MacIntercept_pVals)) # List to hold coefficient p-values Rich_MacIntercept_Coeffs.append(np.mean(sRich_MacIntercept_Coeffs)) # List to hold coefficients Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals)) Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs)) Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals)) Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs)) Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals)) Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs)) Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals)) Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs)) Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals)) Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs)) Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals)) Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs)) Rare_MacSlope_pVals.append(np.mean(sRare_MacSlope_pVals)) # List to hold coefficient p-values Rare_MacSlope_Coeffs.append(np.mean(sRare_MacSlope_Coeffs)) # List to hold coefficients Rich_MacSlope_pVals.append(np.mean(sRich_MacSlope_pVals)) # List to hold coefficient p-values Rich_MacSlope_Coeffs.append(np.mean(sRich_MacSlope_Coeffs)) # List to hold coefficients Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals)) Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs)) Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals)) Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs)) Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals)) Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs)) Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals)) Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs)) Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals)) Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs)) Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals)) Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs)) RareR2List.append(np.mean(sRareR2List)) RarepFList.append(np.mean(sRarepFList)) RichR2List.append(np.mean(sRichR2List)) RichpFList.append(np.mean(sRichpFList)) DomR2List.append(np.mean(sDomR2List)) DompFList.append(np.mean(sDompFList)) EvenR2List.append(np.mean(sEvenR2List)) EvenpFList.append(np.mean(sEvenpFList)) # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. The relationship is linear #RarepLinListHC.append(np.mean(sRarepLinListHC)) RarepLinListRainB.append(np.mean(sRarepLinListRainB)) RarepLinListLM.append(np.mean(sRarepLinListLM)) #RichpLinListHC.append(np.mean(sRichpLinListHC)) RichpLinListRainB.append(np.mean(sRichpLinListRainB)) RichpLinListLM.append(np.mean(sRichpLinListLM)) #DompLinListHC.append(np.mean(sDompLinListHC)) DompLinListRainB.append(np.mean(sDompLinListRainB)) DompLinListLM.append(np.mean(sDompLinListLM)) #EvenpLinListHC.append(np.mean(sEvenpLinListHC)) EvenpLinListRainB.append(np.mean(sEvenpLinListRainB)) EvenpLinListLM.append(np.mean(sEvenpLinListLM)) # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) RarepCorrListBG.append(np.mean(sRarepCorrListBG)) RarepCorrListF.append(np.mean(sRarepCorrListF)) RichpCorrListBG.append(np.mean(sRichpCorrListBG)) RichpCorrListF.append(np.mean(sRichpCorrListF)) DompCorrListBG.append(np.mean(sDompCorrListBG)) DompCorrListF.append(np.mean(sDompCorrListF)) EvenpCorrListBG.append(np.mean(sEvenpCorrListBG)) EvenpCorrListF.append(np.mean(sEvenpCorrListF)) # 6. Homoscedacticity RarepHomoHW.append(np.mean(sRarepHomoHW)) RarepHomoHB.append(np.mean(sRarepHomoHB)) RichpHomoHB.append(np.mean(sRichpHomoHB)) RichpHomoHW.append(np.mean(sRichpHomoHW)) DompHomoHW.append(np.mean(sDompHomoHW)) DompHomoHB.append(np.mean(sDompHomoHB)) EvenpHomoHW.append(np.mean(sEvenpHomoHW)) EvenpHomoHB.append(np.mean(sEvenpHomoHB)) # 7. Normally distributed residuals (errors) RarepNormListOmni.append(np.mean(sRarepNormListOmni)) RarepNormListJB.append(np.mean(sRarepNormListJB)) RarepNormListKS.append(np.mean(sRarepNormListKS)) RarepNormListAD.append(np.mean(sRarepNormListAD)) RichpNormListOmni.append(np.mean(sRichpNormListOmni)) RichpNormListJB.append(np.mean(sRichpNormListJB)) RichpNormListKS.append(np.mean(sRichpNormListKS)) RichpNormListAD.append(np.mean(sRichpNormListAD)) DompNormListOmni.append(np.mean(sDompNormListOmni)) DompNormListJB.append(np.mean(sDompNormListJB)) DompNormListKS.append(np.mean(sDompNormListKS)) DompNormListAD.append(np.mean(sDompNormListAD)) EvenpNormListOmni.append(np.mean(sEvenpNormListOmni)) EvenpNormListJB.append(np.mean(sEvenpNormListJB)) EvenpNormListKS.append(np.mean(sEvenpNormListKS)) EvenpNormListAD.append(np.mean(sEvenpNormListAD)) fig.add_subplot(4, 3, 1) plt.xlim(min(SampSizes)-1,max(SampSizes)+10) plt.ylim(0,1) plt.xscale('log') # Rarity R2 vs. Sample Size plt.plot(NLIST,RareR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16) leg = plt.legend(loc=4,prop={'size':14}) leg.draw_frame(False) fig.add_subplot(4, 3, 2) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) plt.xscale('log') plt.ylim(0.0, 0.16) # Rarity Coeffs vs. Sample Size plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Rare_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 3) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) plt.ylim(0.0, 0.6) plt.xscale('log') # Rarity p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(RarepLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST,RarepLinListRainB, c='m') plt.plot(NLIST,RarepLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST,RarepCorrListBG, c='c') plt.plot(NLIST,RarepCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST,RarepHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST,RarepHomoHB, c='r', ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST,RarepNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST,RarepNormListJB, c='Lime', ls='-') #plt.plot(NLIST,RarepNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST,RarepNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 4) plt.xscale('log') plt.ylim(0,1) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Dominance R2 vs. Sample Size plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16) leg = plt.legend(loc=4,prop={'size':14}) leg.draw_frame(False) fig.add_subplot(4, 3, 5) plt.ylim(-0.2, 1.2) plt.xscale('log') plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Dominance Coeffs vs. Sample Size plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Dom_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 6) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) plt.xscale('log') #plt.yscale('log') plt.ylim(0, 0.6) # Dominance p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(DompLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST, DompLinListRainB, c='m') plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST, DompCorrListBG, c='c') plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST, DompHomoHB, c='r',ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST, DompNormListJB, c='Lime', ls='-') #plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST, DompNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 7) plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16) plt.xscale('log') plt.ylim(0,1) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Evenness R2 vs. Sample Size plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) leg = plt.legend(loc=4,prop={'size':14}) leg.draw_frame(False) fig.add_subplot(4, 3, 8) plt.ylim(-0.25, 0.0) plt.xscale('log') plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Evenness Coeffs vs. Sample Size plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Even_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 9) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) plt.xscale('log') plt.ylim(0.0, 0.3) # Evenness p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(EvenpLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST, EvenpLinListRainB, c='m') plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST, EvenpCorrListBG, c='c') plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST, EvenpHomoHB, c='r', ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-') #plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3) #plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 10) plt.xscale('log') plt.ylim(0,1) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Dominance R2 vs. Sample Size plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.xlabel('Sample size', fontsize=14) plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16) leg = plt.legend(loc=4,prop={'size':14}) leg.draw_frame(False) fig.add_subplot(4, 3, 11) plt.ylim(-0.2, 1.2) plt.xscale('log') plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Richness Coeffs vs. Sample Size plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Rich_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') plt.xlabel('Sample size', fontsize=14) leg = plt.legend(loc=10,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 12) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) plt.xscale('log') # Richness p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(RichpLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST,RichpLinListRainB, c='m') plt.plot(NLIST,RichpLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST,RichpCorrListBG, c='c') plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST,RichpHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST,RichpHomoHB, c='r', ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST,RichpNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST,RichpNormListJB, c='Lime', ls='-') #plt.plot(NLIST,RichpNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST,RichpNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') plt.xlabel('Sample size', fontsize=14) leg = plt.legend(loc=1,prop={'size':8}) leg.draw_frame(False) #plt.tick_params(axis='both', which='major', labelsize=fs-3) plt.subplots_adjust(wspace=0.4, hspace=0.4) plt.savefig(mydir+'figs/appendix/SampleSize/SampleSizeEffects.png', dpi=600, bbox_inches = "tight") #plt.close() #plt.show() return
def Fig_OLS_Checks(): #fs = 10 # font size used across figures #color = str() #OrC = 'open' SampSizes = [ 5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100 ] Iterations = 100 fig = plt.figure(figsize=(12, 8)) # MODEL PARAMETERS Rare_MacIntercept_pVals = [] # List to hold coefficient p-values Rare_MacIntercept_Coeffs = [] # List to hold coefficients Rich_MacIntercept_pVals = [] Rich_MacIntercept_Coeffs = [] Dom_MacIntercept_pVals = [] Dom_MacIntercept_Coeffs = [] Even_MacIntercept_pVals = [] Even_MacIntercept_Coeffs = [] Rare_MicIntercept_pVals = [] Rare_MicIntercept_Coeffs = [] Rich_MicIntercept_pVals = [] Rich_MicIntercept_Coeffs = [] Dom_MicIntercept_pVals = [] Dom_MicIntercept_Coeffs = [] Even_MicIntercept_pVals = [] Even_MicIntercept_Coeffs = [] Rare_MacSlope_pVals = [] Rare_MacSlope_Coeffs = [] Rich_MacSlope_pVals = [] Rich_MacSlope_Coeffs = [] Dom_MacSlope_pVals = [] Dom_MacSlope_Coeffs = [] Even_MacSlope_pVals = [] Even_MacSlope_Coeffs = [] Rare_MicSlope_pVals = [] Rare_MicSlope_Coeffs = [] Rich_MicSlope_pVals = [] Rich_MicSlope_Coeffs = [] Dom_MicSlope_pVals = [] Dom_MicSlope_Coeffs = [] Even_MicSlope_pVals = [] Even_MicSlope_Coeffs = [] RareR2List = [] # List to hold model R2 RarepFList = [] # List to hold significance of model R2 RichR2List = [] # List to hold model R2 RichpFList = [] # List to hold significance of model R2 DomR2List = [] # List to hold model R2 DompFList = [] # List to hold significance of model R2 EvenR2List = [] # List to hold model R2 EvenpFList = [] # List to hold significance of model R2 # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. The relationship is linear #RarepLinListHC = [] RarepLinListRainB = [] RarepLinListLM = [] #RichpLinListHC = [] RichpLinListRainB = [] RichpLinListLM = [] #DompLinListHC = [] DompLinListRainB = [] DompLinListLM = [] #EvenpLinListHC = [] EvenpLinListRainB = [] EvenpLinListLM = [] # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) RarepCorrListBG = [] RarepCorrListF = [] RichpCorrListBG = [] RichpCorrListF = [] DompCorrListBG = [] DompCorrListF = [] EvenpCorrListBG = [] EvenpCorrListF = [] # 6. Homoscedacticity RarepHomoHW = [] RarepHomoHB = [] RichpHomoHW = [] RichpHomoHB = [] DompHomoHW = [] DompHomoHB = [] EvenpHomoHW = [] EvenpHomoHB = [] # 7. Normally distributed residuals (errors) RarepNormListOmni = [] # Omnibus test for normality RarepNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality RarepNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance RarepNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance RichpNormListOmni = [] # Omnibus test for normality RichpNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality RichpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance RichpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance DompNormListOmni = [] # Omnibus test for normality DompNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality DompNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance DompNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance EvenpNormListOmni = [] # Omnibus test for normality EvenpNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality EvenpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance EvenpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance NLIST = [] for SampSize in SampSizes: sRare_MacIntercept_pVals = [] # List to hold coefficient p-values sRare_MacIntercept_Coeffs = [] # List to hold coefficients sRich_MacIntercept_pVals = [] # List to hold coefficient p-values sRich_MacIntercept_Coeffs = [] # List to hold coefficients sDom_MacIntercept_pVals = [] sDom_MacIntercept_Coeffs = [] sEven_MacIntercept_pVals = [] sEven_MacIntercept_Coeffs = [] sRare_MicIntercept_pVals = [] sRare_MicIntercept_Coeffs = [] sRich_MicIntercept_pVals = [] sRich_MicIntercept_Coeffs = [] sDom_MicIntercept_pVals = [] sDom_MicIntercept_Coeffs = [] sEven_MicIntercept_pVals = [] sEven_MicIntercept_Coeffs = [] sRare_MacSlope_pVals = [] sRare_MacSlope_Coeffs = [] sRich_MacSlope_pVals = [] sRich_MacSlope_Coeffs = [] sDom_MacSlope_pVals = [] sDom_MacSlope_Coeffs = [] sEven_MacSlope_pVals = [] sEven_MacSlope_Coeffs = [] sRare_MicSlope_pVals = [] sRare_MicSlope_Coeffs = [] sRich_MicSlope_pVals = [] sRich_MicSlope_Coeffs = [] sDom_MicSlope_pVals = [] sDom_MicSlope_Coeffs = [] sEven_MicSlope_pVals = [] sEven_MicSlope_Coeffs = [] sRareR2List = [] # List to hold model R2 sRarepFList = [] # List to hold significance of model R2 sRichR2List = [] # List to hold model R2 sRichpFList = [] # List to hold significance of model R2 sDomR2List = [] # List to hold model R2 sDompFList = [] # List to hold significance of model R2 sEvenR2List = [] # List to hold model R2 sEvenpFList = [] # List to hold significance of model R2 # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. The relationship is linear #sRarepLinListHC = [] sRarepLinListRainB = [] sRarepLinListLM = [] #sRichpLinListHC = [] sRichpLinListRainB = [] sRichpLinListLM = [] #sDompLinListHC = [] sDompLinListRainB = [] sDompLinListLM = [] #sEvenpLinListHC = [] sEvenpLinListRainB = [] sEvenpLinListLM = [] # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) sRarepCorrListBG = [] sRarepCorrListF = [] sRichpCorrListBG = [] sRichpCorrListF = [] sDompCorrListBG = [] sDompCorrListF = [] sEvenpCorrListBG = [] sEvenpCorrListF = [] # 6. Homoscedacticity sRarepHomoHW = [] sRarepHomoHB = [] sRichpHomoHW = [] sRichpHomoHB = [] sDompHomoHW = [] sDompHomoHB = [] sEvenpHomoHW = [] sEvenpHomoHB = [] # 7. Normally distributed residuals (errors) sRarepNormListOmni = [] # Omnibus test for normality sRarepNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality sRarepNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sRarepNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance sRichpNormListOmni = [] # Omnibus test for normality sRichpNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality sRichpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sRichpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance sDompNormListOmni = [] # Omnibus test for normality sDompNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality sDompNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sDompNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance sEvenpNormListOmni = [] # Omnibus test for normality sEvenpNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality sEvenpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sEvenpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance for iteration in range(Iterations): Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [ [], [], [], [], [], [], [] ] klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [ [], [], [], [], [], [], [] ] NmaxList, rareSkews, KindList = [[], [], []] NSlist = [] ct = 0 radDATA = [] datasets = [] GoodNames = [ 'EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA' ] # all microbe data is MGRAST mlist = ['micro', 'macro'] for m in mlist: for name in os.listdir(mydir + 'data/' + m): if name in GoodNames: pass else: continue path = mydir + 'data/' + m + '/' + name + '/' + name + '-SADMetricData.txt' num_lines = sum(1 for line in open(path)) datasets.append([name, m, num_lines]) numMac = 0 numMic = 0 radDATA = [] for d in datasets: name, kind, numlines = d lines = [] lines = np.random.choice(range(1, numlines + 1), SampSize, replace=True) path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt' for line in lines: data = linecache.getline(path, line) radDATA.append(data) #print name, kind, numlines, len(radDATA) for data in radDATA: data = data.split() if len(data) == 0: print 'no data' continue name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data N = float(N) S = float(S) Nlist.append(float(np.log(N))) Slist.append(float(np.log(S))) NSlist.append(float(np.log(N / S))) Evarlist.append(float(np.log(float(Evar)))) ESimplist.append(float(np.log(float(ESimp)))) KindList.append(kind) BPlist.append(float(BP)) NmaxList.append(float(np.log(float(BP) * float(N)))) EHeiplist.append(float(EHeip)) # lines for the log-modulo transformation of skewnness skew = float(skew) sign = 1 if skew < 0: sign = -1 lms = np.log(np.abs(skew) + 1) lms = lms * sign #if lms > 3: print name, N, S rareSkews.append(float(lms)) if kind == 'macro': numMac += 1 elif kind == 'micro': numMic += 1 ct += 1 #print 'Sample Size:',SampSize, ' Mic:', numMic,'Mac:', numMac # Multiple regression for Rarity d = pd.DataFrame({'N': list(Nlist)}) d['Rarity'] = list(rareSkews) d['Kind'] = list(KindList) RarityResults = smf.ols( 'Rarity ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RarityResults.summary(), '\n' # Multiple regression for Rarity d = pd.DataFrame({'N': list(Nlist)}) d['Richness'] = list(Slist) d['Kind'] = list(KindList) RichnessResults = smf.ols( 'Richness ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RichnessResults.summary(), '\n' # Multiple regression for Dominance d = pd.DataFrame({'N': list(Nlist)}) d['Dominance'] = list(NmaxList) d['Kind'] = list(KindList) DomResults = smf.ols( 'Dominance ~ N * Kind', d).fit() # Fit the dummy variable regression model #print DomResults.summary(), '\n' # Multiple regression for Evenness d = pd.DataFrame({'N': list(Nlist)}) d['Evenness'] = list(ESimplist) d['Kind'] = list(KindList) EvenResults = smf.ols( 'Evenness ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RarityResults.summary(), '\n' RareResids = RarityResults.resid # residuals of the model RichResids = RichnessResults.resid # residuals of the model DomResids = DomResults.resid # residuals of the model EvenResids = EvenResults.resid # residuals of the model # MODEL RESULTS/FIT RareFpval = RarityResults.f_pvalue Rarer2 = RarityResults.rsquared # coefficient of determination #Adj_r2 = RareResults.rsquared_adj # adjusted RichFpval = RichnessResults.f_pvalue Richr2 = RichnessResults.rsquared # coefficient of determination #Adj_r2 = RichnessResults.rsquared_adj # adjusted DomFpval = DomResults.f_pvalue Domr2 = DomResults.rsquared # coefficient of determination #Adj_r2 = DomResults.rsquared_adj # adjusted EvenFpval = EvenResults.f_pvalue Evenr2 = EvenResults.rsquared # coefficient of determination #Adj_r2 = EvenResuls.rsquared_adj # adjusted # MODEL PARAMETERS and p-values Rareparams = RarityResults.params Rareparams = Rareparams.tolist() Rarepvals = RarityResults.pvalues Rarepvals = Rarepvals.tolist() Richparams = RichnessResults.params Richparams = Richparams.tolist() Richpvals = RichnessResults.pvalues Richpvals = Richpvals.tolist() Domparams = DomResults.params Domparams = Domparams.tolist() Dompvals = DomResults.pvalues Dompvals = Dompvals.tolist() Evenparams = EvenResults.params Evenparams = Evenparams.tolist() Evenpvals = EvenResults.pvalues Evenpvals = Evenpvals.tolist() sRare_MacIntercept_pVals.append(Rarepvals[0]) sRare_MacIntercept_Coeffs.append(Rareparams[0]) sRich_MacIntercept_pVals.append(Rarepvals[0]) sRich_MacIntercept_Coeffs.append(Rareparams[0]) sDom_MacIntercept_pVals.append(Dompvals[0]) sDom_MacIntercept_Coeffs.append(Domparams[0]) sEven_MacIntercept_pVals.append(Evenpvals[0]) sEven_MacIntercept_Coeffs.append(Evenparams[0]) sRare_MicIntercept_pVals.append(Rarepvals[1]) if Rarepvals[1] > 0.05: sRare_MicIntercept_Coeffs.append(Rareparams[1]) else: sRare_MicIntercept_Coeffs.append(Rareparams[1]) sRich_MicIntercept_pVals.append(Richpvals[1]) if Richpvals[1] > 0.05: sRich_MicIntercept_Coeffs.append(Richparams[1]) else: sRich_MicIntercept_Coeffs.append(Richparams[1]) sDom_MicIntercept_pVals.append(Dompvals[1]) if Dompvals[1] > 0.05: sDom_MicIntercept_Coeffs.append(Domparams[1]) else: sDom_MicIntercept_Coeffs.append(Domparams[1]) sEven_MicIntercept_pVals.append(Evenpvals[1]) if Evenpvals[1] > 0.05: sEven_MicIntercept_Coeffs.append(Evenparams[1]) else: sEven_MicIntercept_Coeffs.append(Evenparams[1]) sRare_MacSlope_pVals.append(Rarepvals[2]) sRare_MacSlope_Coeffs.append(Rareparams[2]) sRich_MacSlope_pVals.append(Richpvals[2]) sRich_MacSlope_Coeffs.append(Richparams[2]) sDom_MacSlope_pVals.append(Dompvals[2]) sDom_MacSlope_Coeffs.append(Domparams[2]) sEven_MacSlope_pVals.append(Evenpvals[2]) sEven_MacSlope_Coeffs.append(Evenparams[2]) sRare_MicSlope_pVals.append(Rarepvals[3]) if Rarepvals[3] > 0.05: sRare_MicSlope_Coeffs.append(Rareparams[3]) else: sRare_MicSlope_Coeffs.append(Rareparams[3]) sRich_MicSlope_pVals.append(Richpvals[3]) if Richpvals[3] > 0.05: sRich_MicSlope_Coeffs.append(Richparams[3]) else: sRich_MicSlope_Coeffs.append(Richparams[3]) sDom_MicSlope_pVals.append(Dompvals[3]) if Dompvals[3] > 0.05: sDom_MicSlope_Coeffs.append(Domparams[3]) else: sDom_MicSlope_Coeffs.append(Domparams[3]) sEven_MicSlope_pVals.append(Evenpvals[3]) if Evenpvals[3] > 0.05: sEven_MicSlope_Coeffs.append(Evenparams[3]) else: sEven_MicSlope_Coeffs.append(Evenparams[3]) sRareR2List.append(Rarer2) sRarepFList.append(RareFpval) sRichR2List.append(Richr2) sRichpFList.append(RichFpval) sDomR2List.append(Domr2) sDompFList.append(DomFpval) sEvenR2List.append(Evenr2) sEvenpFList.append(EvenFpval) # TESTS OF LINEAR REGRESSION ASSUMPTIONS # Error in predictor variables is negligible...Presumably Yes # Variables are measured at the continuous level...Definitely Yes # TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY MODELED AS LINEAR #HC = smd.linear_harvey_collier(RarityResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sRarepLinListHC.append(HC) #HC = smd.linear_harvey_collier(DomResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sDompLinListHC.append(HC) #HC = smd.linear_harvey_collier(EvenResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sEvenpLinListHC.append(HC) RB = smd.linear_rainbow( RarityResults ) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sRarepLinListRainB.append(RB[1]) RB = smd.linear_rainbow( RichnessResults ) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sRichpLinListRainB.append(RB[1]) RB = smd.linear_rainbow( DomResults ) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sDompLinListRainB.append(RB[1]) RB = smd.linear_rainbow( EvenResults ) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sEvenpLinListRainB.append(RB[1]) LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog ) # Lagrangian multiplier test for linearity sRarepLinListLM.append(LM[1]) LM = smd.linear_lm(RichnessResults.resid, RichnessResults.model.exog ) # Lagrangian multiplier test for linearity sRichpLinListLM.append(LM[1]) LM = smd.linear_lm(DomResults.resid, DomResults.model.exog ) # Lagrangian multiplier test for linearity sDompLinListLM.append(LM[1]) LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog ) # Lagrangian multiplier test for linearity sEvenpLinListLM.append(LM[1]) # INDEPENDENCE OF OBSERVATIONS (no serial correlation in residuals) BGtest = smd.acorr_breush_godfrey( RarityResults, nlags=None, store=False ) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(RareResids, lags=None, boxpierce=True) sRarepCorrListBG.append(BGtest[1]) sRarepCorrListF.append(BGtest[3]) BGtest = smd.acorr_breush_godfrey( RichnessResults, nlags=None, store=False ) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(RichResids, lags=None, boxpierce=True) sRichpCorrListBG.append(BGtest[1]) sRichpCorrListF.append(BGtest[3]) BGtest = smd.acorr_breush_godfrey( DomResults, nlags=None, store=False ) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(DomResids, lags=None, boxpierce=True) sDompCorrListBG.append(BGtest[1]) sDompCorrListF.append(BGtest[3]) BGtest = smd.acorr_breush_godfrey( EvenResults, nlags=None, store=False ) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(EvenResids, lags=None, boxpierce=True) sEvenpCorrListBG.append(BGtest[1]) sEvenpCorrListF.append(BGtest[3]) # There are no significant outliers...Need tests or measures/metrics # HOMOSCEDASTICITY # These tests return: # 1. lagrange multiplier statistic, # 2. p-value of lagrange multiplier test, # 3. f-statistic of the hypothesis that the error variance does not depend on x, # 4. p-value for the f-statistic HW = sms.het_white(RareResids, RarityResults.model.exog) sRarepHomoHW.append(HW[3]) HW = sms.het_white(RichResids, RichnessResults.model.exog) sRichpHomoHW.append(HW[3]) HW = sms.het_white(DomResids, DomResults.model.exog) sDompHomoHW.append(HW[3]) HW = sms.het_white(EvenResids, EvenResults.model.exog) sEvenpHomoHW.append(HW[3]) HB = sms.het_breushpagan(RareResids, RarityResults.model.exog) sRarepHomoHB.append(HB[3]) HB = sms.het_breushpagan(RichResids, RichnessResults.model.exog) sRichpHomoHB.append(HB[3]) HB = sms.het_breushpagan(DomResids, DomResults.model.exog) sDompHomoHB.append(HB[3]) HB = sms.het_breushpagan(EvenResids, EvenResults.model.exog) sEvenpHomoHB.append(HB[3]) # 7. NORMALITY OF ERROR TERMS O = sms.omni_normtest(RareResids) sRarepNormListOmni.append(O[1]) O = sms.omni_normtest(RichResids) sRichpNormListOmni.append(O[1]) O = sms.omni_normtest(DomResids) sDompNormListOmni.append(O[1]) O = sms.omni_normtest(EvenResids) sEvenpNormListOmni.append(O[1]) JB = sms.jarque_bera(RareResids) sRarepNormListJB.append( JB[1] ) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(RichResids) sRichpNormListJB.append( JB[1] ) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(DomResids) sDompNormListJB.append( JB[1] ) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(EvenResids) sEvenpNormListJB.append( JB[1] ) # Calculate residual skewness, kurtosis, and do the JB test for normality KS = smd.kstest_normal(RareResids) sRarepNormListKS.append( KS[1] ) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(RichResids) sRichpNormListKS.append( KS[1] ) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(DomResids) sDompNormListKS.append( KS[1] ) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(EvenResids) sEvenpNormListKS.append( KS[1] ) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance AD = smd.normal_ad(RareResids) sRarepNormListAD.append( AD[1] ) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(RichResids) sRichpNormListAD.append( AD[1] ) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(DomResids) sDompNormListAD.append( AD[1] ) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(EvenResids) sEvenpNormListAD.append( AD[1] ) # Anderson-Darling test for normal distribution unknown mean and variance print 'Sample size:', SampSize, 'iteration:', iteration NLIST.append(SampSize) Rare_MacIntercept_pVals.append(np.mean( sRare_MacIntercept_pVals)) # List to hold coefficient p-values Rare_MacIntercept_Coeffs.append( np.mean(sRare_MacIntercept_Coeffs)) # List to hold coefficients Rich_MacIntercept_pVals.append(np.mean( sRich_MacIntercept_pVals)) # List to hold coefficient p-values Rich_MacIntercept_Coeffs.append( np.mean(sRich_MacIntercept_Coeffs)) # List to hold coefficients Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals)) Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs)) Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals)) Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs)) Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals)) Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs)) Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals)) Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs)) Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals)) Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs)) Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals)) Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs)) Rare_MacSlope_pVals.append( np.mean(sRare_MacSlope_pVals)) # List to hold coefficient p-values Rare_MacSlope_Coeffs.append( np.mean(sRare_MacSlope_Coeffs)) # List to hold coefficients Rich_MacSlope_pVals.append( np.mean(sRich_MacSlope_pVals)) # List to hold coefficient p-values Rich_MacSlope_Coeffs.append( np.mean(sRich_MacSlope_Coeffs)) # List to hold coefficients Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals)) Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs)) Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals)) Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs)) Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals)) Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs)) Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals)) Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs)) Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals)) Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs)) Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals)) Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs)) RareR2List.append(np.mean(sRareR2List)) RarepFList.append(np.mean(sRarepFList)) RichR2List.append(np.mean(sRichR2List)) RichpFList.append(np.mean(sRichpFList)) DomR2List.append(np.mean(sDomR2List)) DompFList.append(np.mean(sDompFList)) EvenR2List.append(np.mean(sEvenR2List)) EvenpFList.append(np.mean(sEvenpFList)) # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. The relationship is linear #RarepLinListHC.append(np.mean(sRarepLinListHC)) RarepLinListRainB.append(np.mean(sRarepLinListRainB)) RarepLinListLM.append(np.mean(sRarepLinListLM)) #RichpLinListHC.append(np.mean(sRichpLinListHC)) RichpLinListRainB.append(np.mean(sRichpLinListRainB)) RichpLinListLM.append(np.mean(sRichpLinListLM)) #DompLinListHC.append(np.mean(sDompLinListHC)) DompLinListRainB.append(np.mean(sDompLinListRainB)) DompLinListLM.append(np.mean(sDompLinListLM)) #EvenpLinListHC.append(np.mean(sEvenpLinListHC)) EvenpLinListRainB.append(np.mean(sEvenpLinListRainB)) EvenpLinListLM.append(np.mean(sEvenpLinListLM)) # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) RarepCorrListBG.append(np.mean(sRarepCorrListBG)) RarepCorrListF.append(np.mean(sRarepCorrListF)) RichpCorrListBG.append(np.mean(sRichpCorrListBG)) RichpCorrListF.append(np.mean(sRichpCorrListF)) DompCorrListBG.append(np.mean(sDompCorrListBG)) DompCorrListF.append(np.mean(sDompCorrListF)) EvenpCorrListBG.append(np.mean(sEvenpCorrListBG)) EvenpCorrListF.append(np.mean(sEvenpCorrListF)) # 6. Homoscedacticity RarepHomoHW.append(np.mean(sRarepHomoHW)) RarepHomoHB.append(np.mean(sRarepHomoHB)) RichpHomoHB.append(np.mean(sRichpHomoHB)) RichpHomoHW.append(np.mean(sRichpHomoHW)) DompHomoHW.append(np.mean(sDompHomoHW)) DompHomoHB.append(np.mean(sDompHomoHB)) EvenpHomoHW.append(np.mean(sEvenpHomoHW)) EvenpHomoHB.append(np.mean(sEvenpHomoHB)) # 7. Normally distributed residuals (errors) RarepNormListOmni.append(np.mean(sRarepNormListOmni)) RarepNormListJB.append(np.mean(sRarepNormListJB)) RarepNormListKS.append(np.mean(sRarepNormListKS)) RarepNormListAD.append(np.mean(sRarepNormListAD)) RichpNormListOmni.append(np.mean(sRichpNormListOmni)) RichpNormListJB.append(np.mean(sRichpNormListJB)) RichpNormListKS.append(np.mean(sRichpNormListKS)) RichpNormListAD.append(np.mean(sRichpNormListAD)) DompNormListOmni.append(np.mean(sDompNormListOmni)) DompNormListJB.append(np.mean(sDompNormListJB)) DompNormListKS.append(np.mean(sDompNormListKS)) DompNormListAD.append(np.mean(sDompNormListAD)) EvenpNormListOmni.append(np.mean(sEvenpNormListOmni)) EvenpNormListJB.append(np.mean(sEvenpNormListJB)) EvenpNormListKS.append(np.mean(sEvenpNormListKS)) EvenpNormListAD.append(np.mean(sEvenpNormListAD)) fig.add_subplot(4, 3, 1) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.ylim(0, 1) plt.xscale('log') # Rarity R2 vs. Sample Size plt.plot(NLIST, RareR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16) leg = plt.legend(loc=4, prop={'size': 14}) leg.draw_frame(False) fig.add_subplot(4, 3, 2) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.xscale('log') plt.ylim(0.0, 0.16) # Rarity Coeffs vs. Sample Size plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Rare_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 3) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.ylim(0.0, 0.6) plt.xscale('log') # Rarity p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(RarepLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST,RarepLinListRainB, c='m') plt.plot(NLIST, RarepLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST,RarepCorrListBG, c='c') plt.plot(NLIST, RarepCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, RarepHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST,RarepHomoHB, c='r', ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST, RarepNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST,RarepNormListJB, c='Lime', ls='-') #plt.plot(NLIST,RarepNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST,RarepNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 4) plt.xscale('log') plt.ylim(0, 1) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Dominance R2 vs. Sample Size plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16) leg = plt.legend(loc=4, prop={'size': 14}) leg.draw_frame(False) fig.add_subplot(4, 3, 5) plt.ylim(-0.2, 1.2) plt.xscale('log') plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Dominance Coeffs vs. Sample Size plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Dom_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 6) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.xscale('log') #plt.yscale('log') plt.ylim(0, 0.6) # Dominance p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(DompLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST, DompLinListRainB, c='m') plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST, DompCorrListBG, c='c') plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST, DompHomoHB, c='r',ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST, DompNormListJB, c='Lime', ls='-') #plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST, DompNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 7) plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16) plt.xscale('log') plt.ylim(0, 1) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Evenness R2 vs. Sample Size plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) leg = plt.legend(loc=4, prop={'size': 14}) leg.draw_frame(False) fig.add_subplot(4, 3, 8) plt.ylim(-0.25, 0.0) plt.xscale('log') plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Evenness Coeffs vs. Sample Size plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Even_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 9) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.xscale('log') plt.ylim(0.0, 0.3) # Evenness p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(EvenpLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST, EvenpLinListRainB, c='m') plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST, EvenpCorrListBG, c='c') plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST, EvenpHomoHB, c='r', ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-') #plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3) #plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 10) plt.xscale('log') plt.ylim(0, 1) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Dominance R2 vs. Sample Size plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.xlabel('Sample size', fontsize=14) plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16) leg = plt.legend(loc=4, prop={'size': 14}) leg.draw_frame(False) fig.add_subplot(4, 3, 11) plt.ylim(-0.2, 1.2) plt.xscale('log') plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Richness Coeffs vs. Sample Size plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Rich_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') plt.xlabel('Sample size', fontsize=14) leg = plt.legend(loc=10, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 12) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.xscale('log') # Richness p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(RichpLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST,RichpLinListRainB, c='m') plt.plot(NLIST, RichpLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST,RichpCorrListBG, c='c') plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, RichpHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST,RichpHomoHB, c='r', ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST, RichpNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST,RichpNormListJB, c='Lime', ls='-') #plt.plot(NLIST,RichpNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST,RichpNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') plt.xlabel('Sample size', fontsize=14) leg = plt.legend(loc=1, prop={'size': 8}) leg.draw_frame(False) #plt.tick_params(axis='both', which='major', labelsize=fs-3) plt.subplots_adjust(wspace=0.4, hspace=0.4) plt.savefig(mydir + 'figs/appendix/SampleSize/SampleSizeEffects.png', dpi=600, bbox_inches="tight") #plt.close() #plt.show() return
def check_normality(df, variable, alpha=0.05): ##use only KS or Anderson Starling also? ks, p = kstest_normal(df[variable]) if p>alpha: return True return False