def test_normality(self):
    res = self.res

    #> library(nortest)  # Lilliefors (Kolmogorov-Smirnov) normality test
    #> lt = lillie.test(residuals(fm))
    #> mkhtest(lt, "lilliefors", "-")
    lilliefors1 = dict(statistic=0.0723390908786589,
                       pvalue=0.01204113540102896, parameters=(), distr='-')

    #> lt = lillie.test(residuals(fm)**2)
    #> mkhtest(lt, "lilliefors", "-")
    lilliefors2 = dict(statistic=0.301311621898024,
                       pvalue=1.004305736618051e-51, parameters=(), distr='-')

    #> lt = lillie.test(residuals(fm)[1:20])
    #> mkhtest(lt, "lilliefors", "-")
    lilliefors3 = dict(statistic=0.1333956004203103,
                       pvalue=0.455683, parameters=(), distr='-')

    lf1 = smsdia.lilliefors(res.resid)
    lf2 = smsdia.lilliefors(res.resid**2)
    lf3 = smsdia.lilliefors(res.resid[:20])

    compare_t_est(lf1, lilliefors1, decimal=(14, 14))
    compare_t_est(lf2, lilliefors2, decimal=(14, 14))  # pvalue very small
    assert_allclose(lf2[1], lilliefors2['pvalue'], rtol=1e-10)
    compare_t_est(lf3, lilliefors3, decimal=(14, 1))
    # R uses a different approximation for the pvalue in the last case

    #> ad = ad.test(residuals(fm))
    #> mkhtest(ad, "ad3", "-")
    adr1 = dict(statistic=1.602209621518313,
                pvalue=0.0003937979149362316, parameters=(), distr='-')

    #> ad = ad.test(residuals(fm)**2)
    #> mkhtest(ad, "ad3", "-")
    adr2 = dict(statistic=np.inf, pvalue=np.nan, parameters=(), distr='-')

    #> ad = ad.test(residuals(fm)[1:20])
    #> mkhtest(ad, "ad3", "-")
    adr3 = dict(statistic=0.3017073732210775,
                pvalue=0.5443499281265933, parameters=(), distr='-')

    ad1 = smsdia.normal_ad(res.resid)
    compare_t_est(ad1, adr1, decimal=(11, 13))
    ad2 = smsdia.normal_ad(res.resid**2)
    assert_(np.isinf(ad2[0]))
    ad3 = smsdia.normal_ad(res.resid[:20])
    compare_t_est(ad3, adr3, decimal=(11, 12))
def compute(cls, widget):
    """
    Anderson-Darling test for one sample

    :param widget: widget whose column holds the sample values
    :return: p-value of the Anderson-Darling test
    """
    # Run the test once instead of recomputing it in every branch
    pvalue = ande.normal_ad(np.array([a[0] for a in widget.column]))[1]
    if isinstance(pvalue, float):
        return pvalue
    return pvalue[0]
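# A minimal sketch (an assumption, not part of the snippet above) of what
# statsmodels' normal_ad returns, which explains the float-vs-array check:
import numpy as np
from statsmodels.stats.diagnostic import normal_ad

rng = np.random.default_rng(0)

# For a 1-D sample, normal_ad returns a scalar (statistic, p-value) pair
stat, pvalue = normal_ad(rng.normal(size=100))
print(stat, pvalue)

# For a 2-D array the test runs column-wise and both values come back as arrays
stats, pvalues = normal_ad(rng.normal(size=(100, 3)), axis=0)
print(pvalues)  # one p-value per column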
def test_normality(self):
    res = self.res

    #> library(nortest)  # Lilliefors (Kolmogorov-Smirnov) normality test
    #> lt = lillie.test(residuals(fm))
    #> mkhtest(lt, "lillifors", "-")
    lillifors1 = dict(statistic=0.0723390908786589,
                      pvalue=0.01204113540102896, parameters=(), distr='-')

    #> lt = lillie.test(residuals(fm)**2)
    #> mkhtest(lt, "lillifors", "-")
    lillifors2 = dict(statistic=0.301311621898024,
                      pvalue=1.004305736618051e-51, parameters=(), distr='-')

    #> lt = lillie.test(residuals(fm)[1:20])
    #> mkhtest(lt, "lillifors", "-")
    lillifors3 = dict(statistic=0.1333956004203103,
                      pvalue=0.4618672180799566, parameters=(), distr='-')

    lf1 = smsdia.lillifors(res.resid)
    lf2 = smsdia.lillifors(res.resid**2)
    lf3 = smsdia.lillifors(res.resid[:20])

    compare_t_est(lf1, lillifors1, decimal=(15, 14))
    compare_t_est(lf2, lillifors2, decimal=(15, 15))  # pvalue very small
    assert_approx_equal(lf2[1], lillifors2['pvalue'], significant=10)
    compare_t_est(lf3, lillifors3, decimal=(15, 1))
    # R uses a different approximation for the pvalue in the last case

    #> ad = ad.test(residuals(fm))
    #> mkhtest(ad, "ad3", "-")
    adr1 = dict(statistic=1.602209621518313,
                pvalue=0.0003937979149362316, parameters=(), distr='-')

    #> ad = ad.test(residuals(fm)**2)
    #> mkhtest(ad, "ad3", "-")
    adr2 = dict(statistic=np.inf, pvalue=np.nan, parameters=(), distr='-')

    #> ad = ad.test(residuals(fm)[1:20])
    #> mkhtest(ad, "ad3", "-")
    adr3 = dict(statistic=0.3017073732210775,
                pvalue=0.5443499281265933, parameters=(), distr='-')

    ad1 = smsdia.normal_ad(res.resid)
    compare_t_est(ad1, adr1, decimal=(12, 15))
    ad2 = smsdia.normal_ad(res.resid**2)
    assert_(np.isinf(ad2[0]))
    ad3 = smsdia.normal_ad(res.resid[:20])
    compare_t_est(ad3, adr3, decimal=(13, 13))
def normal_errors_assumption(df, p_value_thresh=0.05):
    # Residuals for the Anderson-Darling test
    df_results = df

    print('Using the Anderson-Darling test for normal distribution')

    # Performing the test on the residuals
    p_value = normal_ad(df_results['residuals'])[1]
    print('p-value from the test - below 0.05 generally means non-normal:', p_value)

    # Reporting the normality of the residuals
    if p_value < p_value_thresh:
        print('Residuals are not normally distributed')
    else:
        print('Residuals are normally distributed')

    # Plotting the residuals distribution
    plt.subplots(figsize=(12, 6))
    plt.title('Distribution of Residuals')
    sns.distplot(df_results['residuals'])
    plt.show()
    print()

    if p_value > p_value_thresh:
        print('Assumption satisfied')
    else:
        print('Assumption not satisfied')
        print('Confidence intervals will likely be affected')
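# A hedged usage example for the function above; the residuals are synthetic
# stand-ins for a fitted model's residuals, and normal_ad, plt, and sns are
# assumed to be imported as in the snippet:
import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
demo = pd.DataFrame({'residuals': rng.normal(scale=2.0, size=500)})
normal_errors_assumption(demo)  # prints the AD p-value and a verdict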
def multivariate_normal_assumption(features, feature_names, p_value_thresh=0.05):
    '''
    Normality: Assumes that the predictors have normal distributions. If they
    are not normal, a non-linear transformation such as a log or Box-Cox
    transformation can be applied to the non-normal variable.
    '''
    from statsmodels.stats.diagnostic import normal_ad

    print('\n=======================================================================================')
    print('Assumption 2: All variables are multivariate normal')
    print('Using the Anderson-Darling test for normal distribution')
    print('p-values from the test - below 0.05 generally means non-normal:')
    print()

    non_normal_variables = 0

    # Performing the Anderson-Darling test on each variable
    for feature in range(features.shape[1]):
        p_value = normal_ad(features[:, feature])[1]

        # Counting the variable as non-normal if the p-value falls below the threshold
        if p_value < p_value_thresh:
            non_normal_variables += 1

        # Printing p-values from the test
        print('{0}: {1}'.format(feature_names[feature], p_value))

    print('\n{0} non-normal variables'.format(non_normal_variables))
    print()

    if non_normal_variables == 0:
        print('Assumption satisfied')
    else:
        print('Assumption not satisfied')
def normal_errors_assumption(dataframe, color, p_value_thresh=0.05):
    residuals = dataframe["residuals"]

    print("Using the Anderson-Darling test for normal distribution")
    p_value = normal_ad(residuals)[1]
    print("p-value from the test - below 0.05 generally means non-normal:", p_value)

    if p_value < p_value_thresh:
        print("Residuals are not normally distributed")
    else:
        print("Residuals are normally distributed")

    # Plotting the residuals distribution
    sns.distplot(dataframe["residuals"], color=color)
    plt.title("Normal distribution of residuals")
    plt.show()
    print()

    if p_value > p_value_thresh:
        print("Assumption satisfied")
    else:
        print("Assumption not satisfied")
        print()
        print("Confidence intervals will likely be affected")
        print("Try performing nonlinear transformations on variables")
def compare_results(name1, name2, r1, r2):
    print("---------------------")
    print(f"{name1},{name2}> Comparison statistics")
    print(normal_ad(np.array(r1)))
    print(normal_ad(np.array(r2)))

    d1 = DescrStatsW(r1)
    print(d1.get_compare(r2).summary(use_t=True, usevar='unequal'))
    print(f"{name1},{name2}>", ranksums(r1, r2))

    f1 = frame_of(r1, name1)
    f2 = frame_of(r2, name2)
    frame = pd.concat([f1, f2])

    plt.figure()
    sns.boxplot(data=frame, x="source", y="time")
    plt.savefig(f"output/fig{name1}x{name2}.png")
def normality(self):
    """
    First we plot the residuals, then we test normality with the
    Anderson-Darling test. The p-value should be > 0.05.
    """
    plt.subplots(figsize=(12, 6))
    plt.title('Distribution of Residuals')
    sns.distplot(self.results['Residuals'])
    plt.show()

    p_value = normal_ad(self.results['Residuals'])[1]  # second value is the p-value
    if p_value < 0.05:
        return "Assumption not met", p_value
    return "Assumption met", p_value
def normal_errors_assumption(model, features, label, p_value_thresh=0.05):
    """
    Normality: Assumes that the error terms are normally distributed. If they
    are not, nonlinear transformations of variables may solve this.

    This assumption being violated primarily causes issues with the
    confidence intervals.
    """
    from statsmodels.stats.diagnostic import normal_ad

    print('Assumption 2: The error terms are normally distributed', '\n')

    # Calculating residuals for the Anderson-Darling test
    # Other tests could be used here as well, e.g. D'Agostino's K^2 test or the
    # Shapiro-Wilk test. Although, according to
    # https://stackoverflow.com/questions/7781798/seeing-if-data-is-normally-distributed-in-r/7788452#7788452
    # failing to reject H0 does not necessarily indicate that the data are
    # normally distributed, so a test where H1 is the hypothesis that the data
    # are normal would be preferable.
    df_results = calculate_residuals(model, features, label)

    print('Using the Anderson-Darling test for normal distribution')

    # Performing the test on the residuals
    p_value = normal_ad(df_results['Residuals'])[1]
    print('p-value from the test - below 0.05 generally means non-normal:', p_value)

    # Reporting the normality of the residuals
    if p_value < p_value_thresh:
        print('Residuals are not normally distributed')
    else:
        print('Residuals are normally distributed')

    # Plotting the residuals distribution
    plt.subplots(figsize=(12, 6))
    plt.title('Distribution of Residuals')
    sns.distplot(df_results['Residuals'])
    plt.show()
    print()

    if p_value > p_value_thresh:
        print('Assumption satisfied')
    else:
        print('Assumption not satisfied')
        print()
        print('Confidence intervals will likely be affected')
        print('Try performing nonlinear transformations on variables')
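# The translated comment above mentions D'Agostino's K^2 and the Shapiro-Wilk
# test as alternatives; a minimal sketch using scipy.stats (the residuals here
# are placeholders):
import numpy as np
from scipy import stats

residuals = np.random.default_rng(1).normal(size=200)

k2_stat, k2_p = stats.normaltest(residuals)  # D'Agostino's K^2 test
sw_stat, sw_p = stats.shapiro(residuals)     # Shapiro-Wilk test
print(k2_p, sw_p)                            # small p-values reject normality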
def myResiduals(y, pred, name):
    res = pd.Series(y.flatten() - pred)
    print('Anderson-Darling Normal: ', normal_ad(res)[1], '\n\n')  # if < 0.05 -> bad

    fig = plt.figure()
    plt.hist(res)
    plt.title(name + ' Residuals')
    fig.savefig(my_path + '/Figures/' + name + '_ResidualHistogram' + '.png')
    plt.close()
    # plt.show()

    fig = plt.figure()
    plt.scatter(pred, res, color='green', s=50, alpha=.6)
    plt.hlines(y=0, xmin=min(pred), xmax=max(pred), color='black')
    plt.ylabel(name + ' Residuals')
    plt.xlabel(name + ' Prediction')
    plt.title('Residuals vs Predictions (' + name + ')')
    fig.savefig(my_path + '/Figures/' + name + '_ResidualGraph' + '.png')
    plt.close()
def normal_errors_assumption(df_results, p_value_thresh=0.05):
    """
    Normality: Assumes that the error terms are normally distributed. If they
    are not, nonlinear transformations of variables may solve this.

    This assumption being violated primarily causes issues with the
    confidence intervals.
    """
    from statsmodels.stats.diagnostic import normal_ad

    print('\n=======================================================================================')
    print('Assumption 2: The error terms are normally distributed')
    print()
    print('Using the Anderson-Darling test for normal distribution')

    # Performing the test on the residuals
    p_value = normal_ad(df_results['Residuals'])[1]
    print('p-value from the test - below 0.05 generally means non-normal:', p_value)

    # Reporting the normality of the residuals
    if p_value < p_value_thresh:
        print('Residuals are not normally distributed')
    else:
        print('Residuals are normally distributed')

    # Plotting the residuals distribution
    plt.subplots(figsize=(12, 6))
    plt.title('Distribution of Residuals')
    sns.distplot(df_results['Residuals'])
    plt.show()
    print()

    if p_value > p_value_thresh:
        print('Assumption satisfied')
    else:
        print('Assumption not satisfied')
        print()
        print('Confidence intervals will likely be affected')
        print('Try performing nonlinear transformations on variables')
def normal_errors_assumption(model, features, label, p_value_thresh=0.05):
    """
    Normality: Assumes that the error terms are normally distributed. If they
    are not, nonlinear transformations of variables may solve this.

    This assumption being violated primarily causes issues with the
    confidence intervals.
    """
    print("Assumption 2: The error terms are normally distributed", "\n")

    # Calculating residuals for the Anderson-Darling test
    df_results = calculate_residuals(model, features, label)

    print("Using the Anderson-Darling test for normal distribution")

    # Performing the test on the residuals
    p_value = normal_ad(df_results["Residuals"])[1]
    print("p-value from the test - below 0.05 generally means non-normal:", p_value)

    # Reporting the normality of the residuals
    if p_value < p_value_thresh:
        print("Residuals are not normally distributed")
    else:
        print("Residuals are normally distributed")

    # Plotting the residuals distribution
    plt.subplots(figsize=(12, 6))
    plt.title("Distribution of Residuals")
    sns.distplot(df_results["Residuals"])
    plt.show()
    print()

    if p_value > p_value_thresh:
        print("Assumption satisfied")
    else:
        print("Assumption not satisfied")
        print()
        print("Confidence intervals will likely be affected")
        print("Try performing nonlinear transformations on variables")
def normality_resid(self, p_value_thresh=0.05):
    """
    Normality: Assumes that the error terms are normally distributed. If they
    are not, nonlinear transformations of variables may solve this.

    This assumption being violated primarily causes issues with the
    confidence intervals.
    """
    from statsmodels.stats.diagnostic import normal_ad
    from scipy.stats import probplot
    import pylab
    import matplotlib.pyplot as plt
    import seaborn as sns
    from numpy import quantile, logical_or

    sns.set()
    if isinstance(self.model, str):
        self.fit_model()

    print('\n=======================================================================================')
    print('Assumption 5: The error terms are approximately normally distributed')
    print()
    print('Using the Anderson-Darling test for normal distribution')

    # Performing the test on the residuals
    p_value = normal_ad(self.resid)[1]
    print('p-value from the test - below 0.05 generally means non-normal:', p_value)

    # Reporting the normality of the residuals
    if p_value < p_value_thresh:
        print('Residuals are not normally distributed')
    else:
        print('Residuals are normally distributed')

    # Plotting the residuals distribution
    plt.subplots(figsize=(12, 6))
    plt.title('Distribution of Residuals')
    sns.distplot(self.resid)
    plt.show()
    print()

    if p_value > p_value_thresh:
        print('Assumption satisfied')
        self.results['Satisfied'].append('Normality')
    else:
        print('Assumption not satisfied')
        self.results['Violated'].append('Normality')
        print()
        print('Confidence intervals will likely be affected')
        print('Try performing nonlinear transformations on variables')

    print('Building a probability plot')
    quantiles = probplot(self.resid, dist='norm', plot=pylab)
    plt.show()

    # Standardize the ordered residuals and compare them with the theoretical
    # quantiles; large deviations flag potential outliers in the tails
    qqq = (quantiles[0][1] - quantiles[0][1].mean()) / quantiles[0][1].std() - quantiles[0][0]
    q75 = quantile(qqq, 0.75)
    q25 = quantile(qqq, 0.25)
    outliers_share = (logical_or(qqq > q75 + (q75 - q25) * 1.7,
                                 qqq < q25 - (q75 - q25) * 1.7).sum() / qqq.shape[0]).round(3)

    if outliers_share < 0.005:
        print('Assumption can be considered as satisfied.')
        self.results['Satisfied'].append('Sub-Normality')
    elif outliers_share < 0.05:
        self.results['Potentially'].append('Sub-Normality')
        print(f'\nYour dataset has quite fat tails: {outliers_share} of points are potential outliers '
              f'({logical_or(qqq > q75 + (q75 - q25) * 1.7, qqq < q25 - (q75 - q25) * 1.7).sum()} rows)')
    else:
        print('\nThe outliers are highly significant. It may be better to split your dataset into two separate ones.')
        self.results['Violated'].append('Sub-Normality')
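# A small standalone sketch of the IQR-based tail heuristic used above
# (synthetic fat-tailed data; the 1.7 multiplier is taken from the snippet):
import numpy as np
from scipy.stats import probplot

resid = np.random.default_rng(3).standard_t(df=3, size=1000)

quantiles = probplot(resid, dist='norm')
qqq = (quantiles[0][1] - quantiles[0][1].mean()) / quantiles[0][1].std() - quantiles[0][0]
q75, q25 = np.quantile(qqq, 0.75), np.quantile(qqq, 0.25)
iqr = q75 - q25
outliers_share = np.logical_or(qqq > q75 + iqr * 1.7, qqq < q25 - iqr * 1.7).mean().round(3)
print(outliers_share)  # share of points flagged as potential tail outliers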
def Fig_OLS_Checks():
    # fs = 10  # font size used across figures
    # color = str()
    # OrC = 'open'

    SampSizes = [5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    Iterations = 100

    fig = plt.figure(figsize=(12, 8))

    # MODEL PARAMETERS
    Rare_MacIntercept_pVals = []   # List to hold coefficient p-values
    Rare_MacIntercept_Coeffs = []  # List to hold coefficients
    Rich_MacIntercept_pVals = []
    Rich_MacIntercept_Coeffs = []
    Dom_MacIntercept_pVals = []
    Dom_MacIntercept_Coeffs = []
    Even_MacIntercept_pVals = []
    Even_MacIntercept_Coeffs = []

    Rare_MicIntercept_pVals = []
    Rare_MicIntercept_Coeffs = []
    Rich_MicIntercept_pVals = []
    Rich_MicIntercept_Coeffs = []
    Dom_MicIntercept_pVals = []
    Dom_MicIntercept_Coeffs = []
    Even_MicIntercept_pVals = []
    Even_MicIntercept_Coeffs = []

    Rare_MacSlope_pVals = []
    Rare_MacSlope_Coeffs = []
    Rich_MacSlope_pVals = []
    Rich_MacSlope_Coeffs = []
    Dom_MacSlope_pVals = []
    Dom_MacSlope_Coeffs = []
    Even_MacSlope_pVals = []
    Even_MacSlope_Coeffs = []

    Rare_MicSlope_pVals = []
    Rare_MicSlope_Coeffs = []
    Rich_MicSlope_pVals = []
    Rich_MicSlope_Coeffs = []
    Dom_MicSlope_pVals = []
    Dom_MicSlope_Coeffs = []
    Even_MicSlope_pVals = []
    Even_MicSlope_Coeffs = []

    RareR2List = []  # List to hold model R2
    RarepFList = []  # List to hold significance of model R2
    RichR2List = []
    RichpFList = []
    DomR2List = []
    DompFList = []
    EvenR2List = []
    EvenpFList = []

    # ASSUMPTIONS OF LINEAR REGRESSION
    # 1. Error in predictor variables is negligible...presumably yes
    # 2. Variables are measured at the continuous level...yes

    # 3. The relationship is linear
    # RarepLinListHC = []
    RarepLinListRainB = []
    RarepLinListLM = []
    # RichpLinListHC = []
    RichpLinListRainB = []
    RichpLinListLM = []
    # DompLinListHC = []
    DompLinListRainB = []
    DompLinListLM = []
    # EvenpLinListHC = []
    EvenpLinListRainB = []
    EvenpLinListLM = []

    # 4. There are no significant outliers...need to find tests or measures

    # 5. Independence of observations (no serial correlation in residuals)
    RarepCorrListBG = []
    RarepCorrListF = []
    RichpCorrListBG = []
    RichpCorrListF = []
    DompCorrListBG = []
    DompCorrListF = []
    EvenpCorrListBG = []
    EvenpCorrListF = []

    # 6. Homoscedasticity
    RarepHomoHW = []
    RarepHomoHB = []
    RichpHomoHW = []
    RichpHomoHB = []
    DompHomoHW = []
    DompHomoHB = []
    EvenpHomoHW = []
    EvenpHomoHB = []
    # 7. Normally distributed residuals (errors)
    RarepNormListOmni = []  # Omnibus test for normality
    RarepNormListJB = []    # Residual skewness, kurtosis, and the JB test for normality
    RarepNormListKS = []    # Lilliefors test for normality: Kolmogorov-Smirnov test with estimated mean and variance
    RarepNormListAD = []    # Anderson-Darling test for a normal distribution with unknown mean and variance

    RichpNormListOmni = []
    RichpNormListJB = []
    RichpNormListKS = []
    RichpNormListAD = []

    DompNormListOmni = []
    DompNormListJB = []
    DompNormListKS = []
    DompNormListAD = []

    EvenpNormListOmni = []
    EvenpNormListJB = []
    EvenpNormListKS = []
    EvenpNormListAD = []

    NLIST = []

    for SampSize in SampSizes:

        sRare_MacIntercept_pVals = []   # List to hold coefficient p-values
        sRare_MacIntercept_Coeffs = []  # List to hold coefficients
        sRich_MacIntercept_pVals = []
        sRich_MacIntercept_Coeffs = []
        sDom_MacIntercept_pVals = []
        sDom_MacIntercept_Coeffs = []
        sEven_MacIntercept_pVals = []
        sEven_MacIntercept_Coeffs = []

        sRare_MicIntercept_pVals = []
        sRare_MicIntercept_Coeffs = []
        sRich_MicIntercept_pVals = []
        sRich_MicIntercept_Coeffs = []
        sDom_MicIntercept_pVals = []
        sDom_MicIntercept_Coeffs = []
        sEven_MicIntercept_pVals = []
        sEven_MicIntercept_Coeffs = []

        sRare_MacSlope_pVals = []
        sRare_MacSlope_Coeffs = []
        sRich_MacSlope_pVals = []
        sRich_MacSlope_Coeffs = []
        sDom_MacSlope_pVals = []
        sDom_MacSlope_Coeffs = []
        sEven_MacSlope_pVals = []
        sEven_MacSlope_Coeffs = []

        sRare_MicSlope_pVals = []
        sRare_MicSlope_Coeffs = []
        sRich_MicSlope_pVals = []
        sRich_MicSlope_Coeffs = []
        sDom_MicSlope_pVals = []
        sDom_MicSlope_Coeffs = []
        sEven_MicSlope_pVals = []
        sEven_MicSlope_Coeffs = []

        sRareR2List = []  # List to hold model R2
        sRarepFList = []  # List to hold significance of model R2
        sRichR2List = []
        sRichpFList = []
        sDomR2List = []
        sDompFList = []
        sEvenR2List = []
        sEvenpFList = []

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes

        # 3. The relationship is linear
        # sRarepLinListHC = []
        sRarepLinListRainB = []
        sRarepLinListLM = []
        # sRichpLinListHC = []
        sRichpLinListRainB = []
        sRichpLinListLM = []
        # sDompLinListHC = []
        sDompLinListRainB = []
        sDompLinListLM = []
        # sEvenpLinListHC = []
        sEvenpLinListRainB = []
        sEvenpLinListLM = []

        # 4. There are no significant outliers...need to find tests or measures
        # 5. Independence of observations (no serial correlation in residuals)
        sRarepCorrListBG = []
        sRarepCorrListF = []
        sRichpCorrListBG = []
        sRichpCorrListF = []
        sDompCorrListBG = []
        sDompCorrListF = []
        sEvenpCorrListBG = []
        sEvenpCorrListF = []

        # 6. Homoscedasticity
        sRarepHomoHW = []
        sRarepHomoHB = []
        sRichpHomoHW = []
        sRichpHomoHB = []
        sDompHomoHW = []
        sDompHomoHB = []
        sEvenpHomoHW = []
        sEvenpHomoHB = []

        # 7. Normally distributed residuals (errors)
        sRarepNormListOmni = []  # Omnibus test for normality
        sRarepNormListJB = []    # Residual skewness, kurtosis, and the JB test for normality
        sRarepNormListKS = []    # Lilliefors test for normality: Kolmogorov-Smirnov test with estimated mean and variance
        sRarepNormListAD = []    # Anderson-Darling test for a normal distribution with unknown mean and variance

        sRichpNormListOmni = []
        sRichpNormListJB = []
        sRichpNormListKS = []
        sRichpNormListAD = []

        sDompNormListOmni = []
        sDompNormListJB = []
        sDompNormListKS = []
        sDompNormListAD = []

        sEvenpNormListOmni = []
        sEvenpNormListJB = []
        sEvenpNormListKS = []
        sEvenpNormListAD = []

        for iteration in range(Iterations):

            Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [
                [], [], [], [], [], [], []]
            klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [
                [], [], [], [], [], [], []]
            NmaxList, rareSkews, KindList = [[], [], []]
            NSlist = []

            ct = 0
            radDATA = []
            datasets = []
            GoodNames = ['EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN',
                         'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI',
                         'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA']
            # all microbe data is MGRAST

            mlist = ['micro', 'macro']
            for m in mlist:
                for name in os.listdir(mydir + 'data/' + m):
                    if name not in GoodNames:
                        continue

                    path = mydir + 'data/' + m + '/' + name + '/' + name + '-SADMetricData.txt'
                    num_lines = sum(1 for line in open(path))
                    datasets.append([name, m, num_lines])

            numMac = 0
            numMic = 0
            radDATA = []

            for d in datasets:
                name, kind, numlines = d
                lines = np.random.choice(range(1, numlines + 1), SampSize, replace=True)

                path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt'

                for line in lines:
                    data = linecache.getline(path, line)
                    radDATA.append(data)

                # print(name, kind, numlines, len(radDATA))

            for data in radDATA:
                data = data.split()
                if len(data) == 0:
                    print('no data')
                    continue

                (name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip,
                 BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1,
                 jknife2, margalef, menhinick, preston_a, preston_S) = data

                N = float(N)
                S = float(S)

                Nlist.append(float(np.log(N)))
                Slist.append(float(np.log(S)))
                NSlist.append(float(np.log(N / S)))

                Evarlist.append(float(np.log(float(Evar))))
                ESimplist.append(float(np.log(float(ESimp))))
                KindList.append(kind)
                BPlist.append(float(BP))
                NmaxList.append(float(np.log(float(BP) * float(N))))
                EHeiplist.append(float(EHeip))

                # lines for the log-modulo transformation of skewness
                skew = float(skew)
                sign = 1
                if skew < 0:
                    sign = -1
                lms = np.log(np.abs(skew) + 1)
                lms = lms * sign
                # if lms > 3: print(name, N, S)
                rareSkews.append(float(lms))

                if kind == 'macro':
                    numMac += 1
                elif kind == 'micro':
                    numMic += 1

                ct += 1

            # print('Sample Size:', SampSize, ' Mic:', numMic, 'Mac:', numMac)

            # Multiple regression for Rarity
            d = pd.DataFrame({'N': list(Nlist)})
            d['Rarity'] = list(rareSkews)
            d['Kind'] = list(KindList)
            RarityResults = smf.ols('Rarity ~ N * Kind', d).fit()  # Fit the dummy variable regression model
            # print(RarityResults.summary(), '\n')

            # Multiple regression for Richness
            d = pd.DataFrame({'N': list(Nlist)})
            d['Richness'] = list(Slist)
            d['Kind'] = list(KindList)
            RichnessResults = smf.ols('Richness ~ N * Kind', d).fit()  # Fit the dummy variable regression model
            # print(RichnessResults.summary(), '\n')

            # Multiple regression for Dominance
            d = pd.DataFrame({'N': list(Nlist)})
            d['Dominance'] = list(NmaxList)
            d['Kind'] = list(KindList)
            DomResults = smf.ols('Dominance ~ N * Kind', d).fit()  # Fit the dummy variable regression model
            # print(DomResults.summary(), '\n')

            # Multiple regression for Evenness
            d = pd.DataFrame({'N': list(Nlist)})
            d['Evenness'] = list(ESimplist)
            d['Kind'] = list(KindList)
            EvenResults = smf.ols('Evenness ~ N * Kind', d).fit()  # Fit the dummy variable regression model
            # print(EvenResults.summary(), '\n')

            RareResids = RarityResults.resid    # residuals of the model
            RichResids = RichnessResults.resid  # residuals of the model
            DomResids = DomResults.resid        # residuals of the model
            EvenResids = EvenResults.resid      # residuals of the model

            # MODEL RESULTS/FIT
            RareFpval = RarityResults.f_pvalue
            Rarer2 = RarityResults.rsquared  # coefficient of determination
            # Adj_r2 = RarityResults.rsquared_adj  # adjusted

            RichFpval = RichnessResults.f_pvalue
            Richr2 = RichnessResults.rsquared
            # Adj_r2 = RichnessResults.rsquared_adj

            DomFpval = DomResults.f_pvalue
            Domr2 = DomResults.rsquared
            # Adj_r2 = DomResults.rsquared_adj

            EvenFpval = EvenResults.f_pvalue
            Evenr2 = EvenResults.rsquared
            # Adj_r2 = EvenResults.rsquared_adj

            # MODEL PARAMETERS and p-values
            Rareparams = RarityResults.params.tolist()
            Rarepvals = RarityResults.pvalues.tolist()
            Richparams = RichnessResults.params.tolist()
            Richpvals = RichnessResults.pvalues.tolist()
            Domparams = DomResults.params.tolist()
            Dompvals = DomResults.pvalues.tolist()
            Evenparams = EvenResults.params.tolist()
            Evenpvals = EvenResults.pvalues.tolist()

            sRare_MacIntercept_pVals.append(Rarepvals[0])
            sRare_MacIntercept_Coeffs.append(Rareparams[0])
            sRich_MacIntercept_pVals.append(Richpvals[0])
            sRich_MacIntercept_Coeffs.append(Richparams[0])
            sDom_MacIntercept_pVals.append(Dompvals[0])
            sDom_MacIntercept_Coeffs.append(Domparams[0])
            sEven_MacIntercept_pVals.append(Evenpvals[0])
            sEven_MacIntercept_Coeffs.append(Evenparams[0])

            sRare_MicIntercept_pVals.append(Rarepvals[1])
            sRare_MicIntercept_Coeffs.append(Rareparams[1])
            sRich_MicIntercept_pVals.append(Richpvals[1])
            sRich_MicIntercept_Coeffs.append(Richparams[1])
            sDom_MicIntercept_pVals.append(Dompvals[1])
            sDom_MicIntercept_Coeffs.append(Domparams[1])
            sEven_MicIntercept_pVals.append(Evenpvals[1])
            sEven_MicIntercept_Coeffs.append(Evenparams[1])

            sRare_MacSlope_pVals.append(Rarepvals[2])
            sRare_MacSlope_Coeffs.append(Rareparams[2])
            sRich_MacSlope_pVals.append(Richpvals[2])
            sRich_MacSlope_Coeffs.append(Richparams[2])
            sDom_MacSlope_pVals.append(Dompvals[2])
            sDom_MacSlope_Coeffs.append(Domparams[2])
            sEven_MacSlope_pVals.append(Evenpvals[2])
            sEven_MacSlope_Coeffs.append(Evenparams[2])

            sRare_MicSlope_pVals.append(Rarepvals[3])
            sRare_MicSlope_Coeffs.append(Rareparams[3])
            sRich_MicSlope_pVals.append(Richpvals[3])
            sRich_MicSlope_Coeffs.append(Richparams[3])
            sDom_MicSlope_pVals.append(Dompvals[3])
            sDom_MicSlope_Coeffs.append(Domparams[3])
            sEven_MicSlope_pVals.append(Evenpvals[3])
            sEven_MicSlope_Coeffs.append(Evenparams[3])

            sRareR2List.append(Rarer2)
            sRarepFList.append(RareFpval)
            sRichR2List.append(Richr2)
            sRichpFList.append(RichFpval)
            sDomR2List.append(Domr2)
            sDompFList.append(DomFpval)
            sEvenR2List.append(Evenr2)
            sEvenpFList.append(EvenFpval)

            # TESTS OF LINEAR REGRESSION ASSUMPTIONS
            # Error in predictor variables is negligible...presumably yes
            # Variables are measured at the continuous level...definitely yes

            # TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY MODELED AS LINEAR
            # The Harvey-Collier test (null hypothesis: the regression is
            # correctly modeled as linear) is left commented out:
            # HC = smd.linear_harvey_collier(RarityResults)
            # sRarepLinListHC.append(HC)
            # HC = smd.linear_harvey_collier(DomResults)
            # sDompLinListHC.append(HC)
            # HC = smd.linear_harvey_collier(EvenResults)
            # sEvenpLinListHC.append(HC)

            # Rainbow test for linearity; the null hypothesis is that the
            # regression is correctly modeled as linear
            RB = smd.linear_rainbow(RarityResults)
            sRarepLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(RichnessResults)
            sRichpLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(DomResults)
            sDompLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(EvenResults)
            sEvenpLinListRainB.append(RB[1])

            # Lagrange multiplier test for linearity
            LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog)
            sRarepLinListLM.append(LM[1])
            LM = smd.linear_lm(RichnessResults.resid, RichnessResults.model.exog)
            sRichpLinListLM.append(LM[1])
            LM = smd.linear_lm(DomResults.resid, DomResults.model.exog)
            sDompLinListLM.append(LM[1])
            LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog)
            sEvenpLinListLM.append(LM[1])

            # INDEPENDENCE OF OBSERVATIONS (no serial correlation in residuals)
            # Breusch-Godfrey Lagrange multiplier test for residual autocorrelation;
            # returns the LM statistic, the LM p-value, the F statistic, and the F p-value
            BGtest = smd.acorr_breusch_godfrey(RarityResults, nlags=None, store=False)
            # BGtest = smd.acorr_ljungbox(RareResids, lags=None, boxpierce=True)
            sRarepCorrListBG.append(BGtest[1])
            sRarepCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breusch_godfrey(RichnessResults, nlags=None, store=False)
            # BGtest = smd.acorr_ljungbox(RichResids, lags=None, boxpierce=True)
            sRichpCorrListBG.append(BGtest[1])
            sRichpCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breusch_godfrey(DomResults, nlags=None, store=False)
            # BGtest = smd.acorr_ljungbox(DomResids, lags=None, boxpierce=True)
            sDompCorrListBG.append(BGtest[1])
            sDompCorrListF.append(BGtest[3])

            BGtest = smd.acorr_breusch_godfrey(EvenResults, nlags=None, store=False)
            # BGtest = smd.acorr_ljungbox(EvenResids, lags=None, boxpierce=True)
            sEvenpCorrListBG.append(BGtest[1])
            sEvenpCorrListF.append(BGtest[3])

            # There are no significant outliers...need tests or measures/metrics

            # HOMOSCEDASTICITY
            # These tests return:
            # 1. the Lagrange multiplier statistic,
            # 2. the p-value of the Lagrange multiplier test,
            # 3. the F statistic of the hypothesis that the error variance does not depend on x,
            # 4. the p-value of the F statistic
            HW = sms.het_white(RareResids, RarityResults.model.exog)
            sRarepHomoHW.append(HW[3])
            HW = sms.het_white(RichResids, RichnessResults.model.exog)
            sRichpHomoHW.append(HW[3])
            HW = sms.het_white(DomResids, DomResults.model.exog)
            sDompHomoHW.append(HW[3])
            HW = sms.het_white(EvenResids, EvenResults.model.exog)
            sEvenpHomoHW.append(HW[3])

            HB = sms.het_breuschpagan(RareResids, RarityResults.model.exog)
            sRarepHomoHB.append(HB[3])
            HB = sms.het_breuschpagan(RichResids, RichnessResults.model.exog)
            sRichpHomoHB.append(HB[3])
            HB = sms.het_breuschpagan(DomResids, DomResults.model.exog)
            sDompHomoHB.append(HB[3])
            HB = sms.het_breuschpagan(EvenResids, EvenResults.model.exog)
            sEvenpHomoHB.append(HB[3])
            # 7. NORMALITY OF ERROR TERMS
            # Omnibus test for normality
            O = sms.omni_normtest(RareResids)
            sRarepNormListOmni.append(O[1])
            O = sms.omni_normtest(RichResids)
            sRichpNormListOmni.append(O[1])
            O = sms.omni_normtest(DomResids)
            sDompNormListOmni.append(O[1])
            O = sms.omni_normtest(EvenResids)
            sEvenpNormListOmni.append(O[1])

            # Residual skewness, kurtosis, and the Jarque-Bera test for normality
            JB = sms.jarque_bera(RareResids)
            sRarepNormListJB.append(JB[1])
            JB = sms.jarque_bera(RichResids)
            sRichpNormListJB.append(JB[1])
            JB = sms.jarque_bera(DomResids)
            sDompNormListJB.append(JB[1])
            JB = sms.jarque_bera(EvenResids)
            sEvenpNormListJB.append(JB[1])

            # Lilliefors test for normality: Kolmogorov-Smirnov test with estimated mean and variance
            KS = smd.kstest_normal(RareResids)
            sRarepNormListKS.append(KS[1])
            KS = smd.kstest_normal(RichResids)
            sRichpNormListKS.append(KS[1])
            KS = smd.kstest_normal(DomResids)
            sDompNormListKS.append(KS[1])
            KS = smd.kstest_normal(EvenResids)
            sEvenpNormListKS.append(KS[1])

            # Anderson-Darling test for a normal distribution with unknown mean and variance
            AD = smd.normal_ad(RareResids)
            sRarepNormListAD.append(AD[1])
            AD = smd.normal_ad(RichResids)
            sRichpNormListAD.append(AD[1])
            AD = smd.normal_ad(DomResids)
            sDompNormListAD.append(AD[1])
            AD = smd.normal_ad(EvenResids)
            sEvenpNormListAD.append(AD[1])

            print('Sample size:', SampSize, 'iteration:', iteration)

        NLIST.append(SampSize)

        Rare_MacIntercept_pVals.append(np.mean(sRare_MacIntercept_pVals))    # mean coefficient p-value
        Rare_MacIntercept_Coeffs.append(np.mean(sRare_MacIntercept_Coeffs))  # mean coefficient
        Rich_MacIntercept_pVals.append(np.mean(sRich_MacIntercept_pVals))
        Rich_MacIntercept_Coeffs.append(np.mean(sRich_MacIntercept_Coeffs))
        Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals))
        Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs))
        Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals))
        Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs))

        Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals))
        Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs))
        Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals))
        Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs))
        Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals))
        Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs))
        Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals))
        Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs))

        Rare_MacSlope_pVals.append(np.mean(sRare_MacSlope_pVals))
        Rare_MacSlope_Coeffs.append(np.mean(sRare_MacSlope_Coeffs))
        Rich_MacSlope_pVals.append(np.mean(sRich_MacSlope_pVals))
        Rich_MacSlope_Coeffs.append(np.mean(sRich_MacSlope_Coeffs))
        Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals))
        Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs))
        Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals))
        Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs))

        Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals))
        Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs))
        Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals))
        Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs))
        Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals))
        Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs))
        Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals))
        Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs))

        RareR2List.append(np.mean(sRareR2List))
        RarepFList.append(np.mean(sRarepFList))
        RichR2List.append(np.mean(sRichR2List))
        RichpFList.append(np.mean(sRichpFList))
        DomR2List.append(np.mean(sDomR2List))
        DompFList.append(np.mean(sDompFList))
        EvenR2List.append(np.mean(sEvenR2List))
        EvenpFList.append(np.mean(sEvenpFList))

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes

        # 3. The relationship is linear
        # RarepLinListHC.append(np.mean(sRarepLinListHC))
        RarepLinListRainB.append(np.mean(sRarepLinListRainB))
        RarepLinListLM.append(np.mean(sRarepLinListLM))
        # RichpLinListHC.append(np.mean(sRichpLinListHC))
        RichpLinListRainB.append(np.mean(sRichpLinListRainB))
        RichpLinListLM.append(np.mean(sRichpLinListLM))
        # DompLinListHC.append(np.mean(sDompLinListHC))
        DompLinListRainB.append(np.mean(sDompLinListRainB))
        DompLinListLM.append(np.mean(sDompLinListLM))
        # EvenpLinListHC.append(np.mean(sEvenpLinListHC))
        EvenpLinListRainB.append(np.mean(sEvenpLinListRainB))
        EvenpLinListLM.append(np.mean(sEvenpLinListLM))

        # 4. There are no significant outliers...need to find tests or measures

        # 5. Independence of observations (no serial correlation in residuals)
        RarepCorrListBG.append(np.mean(sRarepCorrListBG))
        RarepCorrListF.append(np.mean(sRarepCorrListF))
        RichpCorrListBG.append(np.mean(sRichpCorrListBG))
        RichpCorrListF.append(np.mean(sRichpCorrListF))
        DompCorrListBG.append(np.mean(sDompCorrListBG))
        DompCorrListF.append(np.mean(sDompCorrListF))
        EvenpCorrListBG.append(np.mean(sEvenpCorrListBG))
        EvenpCorrListF.append(np.mean(sEvenpCorrListF))

        # 6. Homoscedasticity
        RarepHomoHW.append(np.mean(sRarepHomoHW))
        RarepHomoHB.append(np.mean(sRarepHomoHB))
        RichpHomoHW.append(np.mean(sRichpHomoHW))
        RichpHomoHB.append(np.mean(sRichpHomoHB))
        DompHomoHW.append(np.mean(sDompHomoHW))
        DompHomoHB.append(np.mean(sDompHomoHB))
        EvenpHomoHW.append(np.mean(sEvenpHomoHW))
        EvenpHomoHB.append(np.mean(sEvenpHomoHB))
        # 7. Normally distributed residuals (errors)
        RarepNormListOmni.append(np.mean(sRarepNormListOmni))
        RarepNormListJB.append(np.mean(sRarepNormListJB))
        RarepNormListKS.append(np.mean(sRarepNormListKS))
        RarepNormListAD.append(np.mean(sRarepNormListAD))

        RichpNormListOmni.append(np.mean(sRichpNormListOmni))
        RichpNormListJB.append(np.mean(sRichpNormListJB))
        RichpNormListKS.append(np.mean(sRichpNormListKS))
        RichpNormListAD.append(np.mean(sRichpNormListAD))

        DompNormListOmni.append(np.mean(sDompNormListOmni))
        DompNormListJB.append(np.mean(sDompNormListJB))
        DompNormListKS.append(np.mean(sDompNormListKS))
        DompNormListAD.append(np.mean(sDompNormListAD))

        EvenpNormListOmni.append(np.mean(sEvenpNormListOmni))
        EvenpNormListJB.append(np.mean(sEvenpNormListJB))
        EvenpNormListKS.append(np.mean(sEvenpNormListKS))
        EvenpNormListAD.append(np.mean(sEvenpNormListAD))

    fig.add_subplot(4, 3, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0, 1)
    plt.xscale('log')

    # Rarity R2 vs. Sample Size
    plt.plot(NLIST, RareR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 2)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.16)

    # Rarity Coeffs vs. Sample Size
    plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rare_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    # plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 3)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0.0, 0.6)
    plt.xscale('log')

    # Rarity p-vals vs. Sample Size
    # 3. The relationship is linear
    # plt.plot(RarepLinListHC, NLIST, c='m', alpha=0.8)
    # plt.plot(NLIST, RarepLinListRainB, c='m')
    plt.plot(NLIST, RarepLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    # plt.plot(NLIST, RarepCorrListBG, c='c')
    plt.plot(NLIST, RarepCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, RarepHomoHW, c='orange', ls='-', label='homoscedasticity')
    # plt.plot(NLIST, RarepHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, RarepNormListOmni, c='Lime', ls='-', label='normality')
    # plt.plot(NLIST, RarepNormListJB, c='Lime', ls='-')
    # plt.plot(NLIST, RarepNormListKS, c='Lime', ls='--', lw=3)
    # plt.plot(NLIST, RarepNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 4)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)

    # Dominance R2 vs. Sample Size
    plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 5)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Dominance Coeffs vs. Sample Size
    plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Dom_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    # plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 6)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    # plt.yscale('log')
    plt.ylim(0, 0.6)

    # Dominance p-vals vs. Sample Size
    # 3. The relationship is linear
    # plt.plot(DompLinListHC, NLIST, c='m', alpha=0.8)
    # plt.plot(NLIST, DompLinListRainB, c='m')
    plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    # plt.plot(NLIST, DompCorrListBG, c='c')
    plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity')
    # plt.plot(NLIST, DompHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality')
    # plt.plot(NLIST, DompNormListJB, c='Lime', ls='-')
    # plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3)
    # plt.plot(NLIST, DompNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 7)
    plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)

    # Evenness R2 vs. Sample Size
    plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 8)
    plt.ylim(-0.25, 0.0)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)

    # Evenness Coeffs vs. Sample Size
    plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Even_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    # plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 9)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.3)

    # Evenness p-vals vs. Sample Size
    # 3. The relationship is linear
    # plt.plot(EvenpLinListHC, NLIST, c='m', alpha=0.8)
    # plt.plot(NLIST, EvenpLinListRainB, c='m')
    plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    # plt.plot(NLIST, EvenpCorrListBG, c='c')
    plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity')
    # plt.plot(NLIST, EvenpHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality')
    # plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-')
    # plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3)
    # plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 10)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    # Richness R2 vs. Sample Size
    plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.xlabel('Sample size', fontsize=14)
    plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 11)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)

    # Richness Coeffs vs. Sample Size
    plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rich_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    # plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    fig.add_subplot(4, 3, 12)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')

    # Richness p-vals vs. Sample Size
    # 3. The relationship is linear
    # plt.plot(RichpLinListHC, NLIST, c='m', alpha=0.8)
    # plt.plot(NLIST, RichpLinListRainB, c='m')
    plt.plot(NLIST, RichpLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    # plt.plot(NLIST, RichpCorrListBG, c='c')
    plt.plot(NLIST, RichpCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, RichpHomoHW, c='orange', ls='-', label='homoscedasticity')
    # plt.plot(NLIST, RichpHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, RichpNormListOmni, c='Lime', ls='-', label='normality')
    # plt.plot(NLIST, RichpNormListJB, c='Lime', ls='-')
    # plt.plot(NLIST, RichpNormListKS, c='Lime', ls='--', lw=3)
    # plt.plot(NLIST, RichpNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    # plt.tick_params(axis='both', which='major', labelsize=fs-3)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.savefig(mydir + 'figs/appendix/SampleSize/SampleSizeEffects.png',
                dpi=600, bbox_inches="tight")
    # plt.close()
    # plt.show()
    return
def test(self) -> AndersonDarlingTestResult:
    df_results = self.__calculate_residuals()
    statistics, p_value = normal_ad(df_results['Residuals'])
    return AndersonDarlingTestResult(p_value, statistics)
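# A self-contained sketch of the wrapper above; AndersonDarlingTestResult and
# the residuals frame are assumptions reconstructed from the snippet, not the
# original project's definitions:
from dataclasses import dataclass

import pandas as pd
from statsmodels.stats.diagnostic import normal_ad


@dataclass
class AndersonDarlingTestResult:
    p_value: float
    statistics: float


def run_ad_test(df_results: pd.DataFrame) -> AndersonDarlingTestResult:
    # normal_ad returns (statistic, p-value); the wrapper stores both
    statistics, p_value = normal_ad(df_results['Residuals'])
    return AndersonDarlingTestResult(p_value, statistics)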
def find_group_DW(tree, dist_matrix, mes=0, seed_max=0, l_significance=0.1):
    '''
    For more details, see Kim et al. 2008.

    tree : Tree structure returned from the Pycluster module.
    dist_matrix : Distance matrix (= 1. - correlation matrix)
    mes = 0 : Total number of measurements of each light curve.
    seed_max = 0 : To get a tighter seed. 1 ~ 10 are good values;
        '10' gets tighter clusters than '1'.

    return : List of clusters.
    '''
    # r.library('nortest')

    clusters = []
    # print(tree, len(tree))

    density_list = []
    for i in range(len(dist_matrix) - 1):
        for j in range(i + 1, len(dist_matrix)):
            density_list.append(dist_matrix[i][j])
    density_list_clip = sigma_clipping(density_list, sigma=3.)
    overall_density = (max(density_list_clip) - min(density_list_clip)) / len(dist_matrix)
    # print(overall_density, mean(density_list_clip), std(density_list_clip))

    # get highly correlated pairs of elements
    initial_seed = []
    for i in range(len(tree)):
        # both the left and right element have to be stars, not links to other clusters
        if tree[i].left >= 0 and tree[i].right >= 0:
            # to get tighter elements
            if dist_matrix[tree[i].left][tree[i].right] <= median(density_list_clip) / seed_max:
                if mes == 0:
                    initial_seed.append(i)
                elif dist_matrix[tree[i].left][tree[i].right] <= (1. - 3. / math.sqrt(mes)):
                    initial_seed.append(i)
    # print(initial_seed)

    # start from the highly correlated initial pairs
    for i in initial_seed:
        # print(tree[i])
        current_node = i
        while current_node < len(tree) - 1:
            cluster_1 = []
            cluster_2 = []

            # find the base cluster --> cluster_1
            simplify_group(find_group_with_node_index(tree, current_node), cluster_1)

            # find the cluster that will be merged --> cluster_2
            dummy = find_one_side_group(tree, (current_node + 1) * -1)
            current_node = dummy[0]
            simplify_group(dummy[1], cluster_2)

            # check the density changes against the overall density

            # initial density
            d_1 = []
            for ele_i in range(len(cluster_1) - 1):
                for ele_j in range(ele_i + 1, len(cluster_1)):
                    if ele_i != ele_j:
                        d_1.append(dist_matrix[cluster_1[ele_i]][cluster_1[ele_j]])

            # density after merging
            d_merge = []
            cluster_3 = hstack([cluster_1, cluster_2])
            for ele_i in range(len(cluster_3) - 1):
                for ele_j in range(ele_i + 1, len(cluster_3)):
                    if ele_i != ele_j:
                        d_merge.append(dist_matrix[cluster_3[ele_i]][cluster_3[ele_j]])

            d_1 = array(d_1)
            d_merge = array(d_merge)
            if len(d_merge) < 8:
                continue
            else:
                # the resulting clusters are almost identical; not used anymore:
                # d_merge = .5 * log((1. + d_merge) / (1. - d_merge))
                # ad = r.ad_test(d_merge)
                ad = normal_ad(d_merge)
                ad_p = ad[1]
                p_value = ad_p

                # check the level of significance; if normality is rejected,
                # the previous cluster is the final cluster
                if p_value < l_significance:
                    # because the AD test needs at least 8 elements
                    if len(cluster_1) >= 5:
                        # print(cluster_1)
                        clusters.append(cluster_1)
                    break
                # it is still Gaussian, but if outliers come into the cluster, stop;
                # the resulting clusters are almost identical; not used anymore:
                # elif len(d_1[where(d_1 > mean(density_list_clip))]) > 0:
                #     if len(cluster_1) >= 5:
                #         clusters.append(cluster_1)
                #     break

    return clusters
def test_normal_errors_distribution(model_path, data_path):
    from statsmodels.stats.diagnostic import normal_ad

    df_results = calculate_residuals(model_path, data_path)
    p_value = normal_ad(df_results.Residuals)[1]
    # A small p-value rejects normality, so this asserts non-normal residuals
    assert p_value < 0.05
print("----------------------------------------------") col_count = self.data.dropna().value_counts() col_count_percent = self.data.dropna() \ .value_counts(normalize=True) \ .apply(lambda x: str(round(x * 100, 2)) + '%') print(pd.concat([col_count, col_count_percent], axis=1)) except ValueError as err: print('An error occurred while performing range analysis: ', err) def _distribution_analysis(self): """ output : boolean Outputs whether the data follows a normal distribution or not and also plots the histogram and Normal Probability plot""" print("\nDistribution Analysis") print("----------------------------------------------") try: ad_test_statistic, pvalue = normal_ad(self.data[~np.isnan(self.data)]) print('Anderson-Darling Test Coefficient: ', ad_test_statistic) if pvalue > 0.05: print('Normal Distribution: ', True) elif pvalue < 0.05: print('Normal Distribution: ', False) else: print('We need to collect more data to check if the data follows a normal distribution') plt.figure(1) # histogram of the data plt.subplot(2, 1, 1) plt.hist(self.data[~np.isnan(self.data)], bins=30) plt.title('Histogram') plt.xlabel('Values') plt.ylabel('Frequency') # Normal Probability Plot
data1 = np.loadtxt('Data1.txt')
data2 = np.loadtxt('Data2.txt')

# Calculate bandwidth with Cross-Validation Least Squares
dens1 = KDEMultivariate(data=[data1], var_type='c', bw='cv_ls')
dens2 = KDEMultivariate(data=[data2], var_type='c', bw='cv_ls')

# Calculate bandwidth with Silverman's rule of thumb
bw1 = np.std(data1) * (4. / (3. * len(data1)))**(1. / 5.)
bw2 = np.std(data2) * (4. / (3. * len(data2)))**(1. / 5.)

# Analyzing Data 1: KDE, parent distribution, std and mean
x_grid1 = np.linspace(0, 70, 1000)
pdf1 = dens1.pdf(x_grid1)
mean1, std1 = np.mean(data1), np.std(data1)
x1, y1 = gauss(std1, mean1)
p1 = normal_ad(data1)[1]
mean_kde1, std_kde1 = np.mean(pdf1), np.std(pdf1)

# Analyzing Data 2: KDE, parent distribution, std and mean
x_grid2 = np.linspace(0, 70, 1000)
pdf2 = dens2.pdf(x_grid2)
mean2, std2 = np.mean(data2), np.std(data2)
x2, y2 = lognormal(0.2, 1.0)
p2 = ks_2samp(y2, data2)[1]
mean_kde2, std_kde2 = np.mean(pdf2), np.std(pdf2)

# Plot the histograms, the parent distributions, and the KDEs
plt.ion()
plt.clf()
fig = plt.figure(00, figsize=(15, 5))
ax1 = fig.add_subplot(121)
def gauss(sig=1, x0=0):
    x = np.linspace(x0 - 10 * sig, x0 + 10 * sig, 1000)
    y = 1.0 / (np.sqrt(2 * np.pi) * sig) * np.exp(-(x - x0)**2 / (2 * sig**2))
    return x, y


# Histogram of CO2 emissions
plt.figure(2)
hist = plt.hist(data_2015_simple.CO2_emissions, bins=50, density=True)

x_gauss, y_gauss = gauss(sig=std, x0=mean)
plt.figure(3)
plt.plot(x_gauss, y_gauss, 'r--')
# plt.savefig('Attempted_Gaussian_Fit')

ad, p = normal_ad(data_2015_simple.CO2_emissions)  # A-D test
print(p)  # Basically 0

# KDE estimation
kde = gaussian_kde(data_2015_simple.CO2_emissions)
xvals = np.linspace(0, 50, 10000)
plt.figure(4)
plt.plot(xvals, kde.pdf(xvals), 'r--')
plt.xlim(0, 50)
plt.hist(data_2015_simple.CO2_emissions, bins=np.arange(50), density=True)
plt.ylabel('Frequency', fontsize=12)
plt.xlabel('CO2 Emissions Per Capita', fontsize=12)
plt.title('KDE Plot')
# plt.savefig('CO2_Emissions_KDE.png')
ds.nonparametric_summary(df['y'])  # alphap=0.33, betap=0.33 by default
ds.nonparametric_summary(df['y'], alphap=0, betap=0)  # Minitab

df.min()
df.max()
df.mean()
df.std()
df.var()
df.skew()
df.kurt()
df.count()

adresult = anderson(df['y'], dist='norm')
adresult.statistic
smd.normal_ad(df['y'])

sm.norm.interval(0.95, loc=np.mean(df['y']), scale=sm.sem(df['y']))
sm.t.interval(0.95, len(df['y']) - 1, loc=np.mean(df['y']), scale=sm.sem(df['y']))
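# The two Anderson-Darling entry points used above differ: scipy's anderson
# reports the statistic with critical values, while statsmodels' normal_ad
# reports a p-value. A minimal comparison on placeholder data:
import numpy as np
from scipy.stats import anderson
from statsmodels.stats import diagnostic as smd

y = np.random.default_rng(42).normal(size=150)  # placeholder for df['y']

adresult = anderson(y, dist='norm')
print(adresult.statistic)        # compare against the critical values...
print(adresult.critical_values)  # ...at adresult.significance_level percent

stat, pvalue = smd.normal_ad(y)  # same statistic family, but with a p-value
print(stat, pvalue)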
fig = sm.graphics.tsa.plot_acf(sunspot_data.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(sunspot_data, lags=40, ax=ax2)

arma_mod20 = sm.tsa.ARMA(sunspot_data, (2, 0)).fit()
arma_mod30 = sm.tsa.ARMA(sunspot_data, (3, 0)).fit()

stattools.durbin_watson(arma_mod30.resid.values)

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
ax = arma_mod30.resid.plot(ax=ax)

resid = arma_mod30.resid
diag.normal_ad(resid)

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)

fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)

r, q, p = sm.tsa.acf(resid.values.squeeze(), qstat=True)
data = np.c_[range(1, 41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))
visualizer = ResidualsPlot(regressor, hist=False)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

# Residuals of predicted vs. observed
sns.residplot(x=y_test, y=y_pred)
np.mean(y_test - y_pred)

# Statistical check of normality (Anderson-Darling test for a normal distribution)
from statsmodels.stats.diagnostic import normal_ad
p_value_thresh = .05

# Performing the test on the residuals
p_value = normal_ad(y_test - y_pred)[1]
print('p-value from the test - below 0.05 generally means non-normal:', p_value)

# Reporting the normality of the residuals
if p_value < p_value_thresh:
    print('Residuals are not normally distributed')
else:
    print('Residuals are normally distributed')

# Plotting the residual distribution as a histogram to see how the residuals are spread
plt.subplots(figsize=(12, 6))
plt.title('Distribution of Residuals')
sns.distplot(y_test - y_pred)
plt.show()
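This residual check recurs across these scripts, so it is worth factoring into a helper; a minimal sketch, with the function name and signature chosen for illustration:

import numpy as np
from statsmodels.stats.diagnostic import normal_ad

def residuals_look_normal(y_true, y_pred, alpha=0.05):
    """Anderson-Darling test on residuals; returns (p_value, looks_normal)."""
    p_value = normal_ad(np.asarray(y_true) - np.asarray(y_pred))[1]
    return p_value, p_value >= alpha

# Usage: p, ok = residuals_look_normal(y_test, y_pred)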
def validate(self, all=False):
    '''validate
    Method that validates the main assumptions of linear regression.
    Thanks to https://jeffmacaluso.github.io/post/LinearRegressionAssumptions/
    '''
    p_value_thresh = 0.05  # (5% double sided)
    self.err = self.y_pred - self.y
    display(HTML("<h1>Linear Regression Validation Report</h1>"))
    display(HTML("<h3>Metrics</h3>"))
    display(HTML(
        f"<b>R2:</b> {r2_score(self.y, self.y_pred)} <br> \
        <b>RMSE:</b> {mean_squared_error(self.y, self.y_pred, squared=False)}"
    ))

    display(HTML("<h3>Linear Assumption</h3>"))
    # Predicted vs Real
    fig = plt.figure(figsize=(4, 3))
    df_ = pd.DataFrame(self.y.rename("y"), index=self.y.index).join(
        pd.DataFrame(self.y_pred.rename("y_pred"), index=self.y_pred.index))
    sns.scatterplot(x='y', y='y_pred', data=df_)
    # Plotting the diagonal line over the overlapping range of the two series
    min_ = max(self.y.min(), self.y_pred.min())
    max_ = max(self.y.max(), self.y_pred.max())
    plt.plot((min_, max_), (min_, max_), color='darkorange', linestyle='--')
    plt.title('Real vs. Predicted')
    plt.show()

    display(HTML("<h3>Normal Assumption of errors - Anderson-Darling</h3>"))
    # Using the Anderson-Darling test for normal distribution
    p_value = normal_ad(self.err)[1]
    if p_value <= p_value_thresh:
        display(HTML(f'<span style="color:red;font-weight:bold">Normal assumption not satisfied (p_value: {p_value})</span> --> confidence intervals will likely be affected. Try to perform nonlinear transformations on variables. Info on QQPlot: <a href="https://seankross.com/2016/02/29/A-Q-Q-Plot-Dissection-Kit.html">here</a>.'))
    # Normal error distribution
    fig, axes = plt.subplots(1, 2, figsize=(8, 3))
    plt.tight_layout(pad=0.3)
    # Plot the errors themselves (the original passed self.y here, which
    # contradicted the panel title)
    sns.distplot(self.err, fit=norm, ax=axes[0])
    axes[0].set_title(f"Error normality (skw: {round(self.err.skew(), 2)})")
    # qqplot
    sm.qqplot(self.err, line='q', ax=axes[1])
    axes[1].set_title("qqplot")
    plt.show()

    display(HTML("<h3>Assumption of non-autocorrelation</h3>"))
    # Assumes that there is no autocorrelation in the residuals. If there is
    # autocorrelation, there is a pattern left unexplained because the current
    # value depends on the previous value. This may be resolved by adding a
    # lag of either the dependent variable or some of the predictors.
    # Durbin-Watson Test: values of 1.5 < d < 2.5 generally indicate no
    # autocorrelation; d below 2 leans positive, d above 2 leans negative.
    durbinWatson = durbin_watson(self.err)
    if durbinWatson < 1.5:
        display(HTML('<span style="color:red;font-weight:bold">Signs of positive autocorrelation</span>'))
    elif durbinWatson > 2.5:
        display(HTML('<span style="color:red;font-weight:bold">Signs of negative autocorrelation</span>'))
    else:
        display(HTML('No signs of autocorrelation'))

    display(HTML("<h3>Assumption of Random error vs predictors - Homoscedasticity</h3>"))
    # If heteroscedastic: variables with large ranges may be the cause.
    # For the dependent variable, consider rates or per-capita ratios instead
    # of raw values (that may change the project).
    fig = plt.figure(figsize=(4, 3))
    sns.scatterplot(x=self.y_pred, y=self.err)
    plt.title('Error vs. Predicted')
    plt.show()

    if all:
        # Random errors vs predictors
        num_pred = len(self.X.columns)
        fig_columns = 3
        fig, axes = plt.subplots(math.ceil(num_pred / fig_columns), fig_columns,
                                 figsize=(4 * fig_columns, math.ceil(num_pred / fig_columns) * 3))
        #plt.tight_layout(pad=1)
        row = 0
        col = 0
        for i in range(num_pred):
            axes[row, col].axhline(y=0, linewidth=4, color='r')
            axes[row, col].set_title(f"Err vs {self.X.columns[i]}")
            sns.scatterplot(x=self.X.iloc[:, i], y=self.err, ax=axes[row, col])
            if (i + 1) % fig_columns == 0:
                col = 0
            else:
                col += 1
            if col == 0:
                row += 1
        plt.show()

        # Summarize feature importance by absolute coefficient size
        df_imp_ = pd.DataFrame(self.lrmodel.coef_, index=self.X.columns, columns=["importance"])
        df_imp_["importance_abs"] = df_imp_["importance"].abs()
        df_imp_.sort_values(by="importance_abs", ascending=False, inplace=True)
        df_imp_.drop("importance_abs", axis=1, inplace=True)
        s_imp_ = df_imp_["importance"]

        # Plot feature importance
        fig = plt.figure(figsize=(15, 4))
        plt.xticks(rotation=90)
        sns.barplot(x=s_imp_.index, y=s_imp_.values)
        plt.show()
plt.show()

# T test
mvp = np.array([-0.038736455, -0.136159028, -0.186696644, -0.069105119,
                -0.225828899, -0.377797055, -0.151966429, 0.138888407,
                0.177621218, -0.149457563, -0.164333338, -0.069042195])
mE = np.array([-0.2, -0.274, -0.12, -0.22, -0.33, -0.57,
               -0.34, 0.13, 0.05, -0.26, -0.03, -0.12])

# Getting means and sample standard deviations
svp = np.std(mvp, ddof=1)
sE = np.std(mE, ddof=1)
meanvp = np.mean(mvp)
meanE = np.mean(mE)

# Test for normality
print('Anderson Darling Test for normality')
print(diagnostic.normal_ad(mvp))
print(diagnostic.normal_ad(mE))
print(stats.ttest_1samp(mvp, 0))
print(stats.ttest_1samp(mE, 0))
#print(stats.anderson(pvMO_Hres, dist='norm'))

# Test for different means: two-tailed Welch t-test (unequal variances)
print('Two-tailed T-test for different means')
two_sample_diff_var = stats.ttest_ind(mvp, mE, equal_var=False)
print(two_sample_diff_var)

Pairs = ['O/OH', 'O/OH$_{2}$', 'O/O$_{2}$', 'O/OOH', 'N/NH', 'N/NH$_{2}$',
         'NH/NH$_{2}$', 'C/CH', 'C/CH$_{2}$', 'C/CH$_{3}$', 'C/CO', 'O/CH$_{3}$']
me = [0.43, 0.096, 0.53, 0.74, 0.65, 0.29, 0.47, 0.83, 0.47, 0.24, 0.57, 0.66]
mv = [0.63, 0.37, 0.65, 0.96, 0.98, 0.86, 0.81, 0.7, 0.42, 0.5, 0.6, 0.78]
mvp = [0.59, 0.23, 0.46, 0.89, 0.75, 0.48, 0.66, 0.84, 0.60, 0.35, 0.44, 0.71]
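If normal_ad rejects normality for either sample, a rank-based alternative to the Welch t-test is the usual fallback. A minimal sketch, meant to follow the Welch test above (before mvp is reassigned for the plotting data); the fallback itself is an assumption, not part of the original analysis:

# Hypothetical fallback: use Mann-Whitney U when either sample fails the
# Anderson-Darling normality check at the 5% level.
if diagnostic.normal_ad(mvp)[1] < 0.05 or diagnostic.normal_ad(mE)[1] < 0.05:
    print(stats.mannwhitneyu(mvp, mE, alternative='two-sided'))
else:
    print(stats.ttest_ind(mvp, mE, equal_var=False))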
def Fig_OLS_Checks():
    #fs = 10  # font size used across figures
    #color = str()
    #OrC = 'open'
    SampSizes = [5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    Iterations = 100
    fig = plt.figure(figsize=(12, 8))

    # MODEL PARAMETERS: per-sample-size means of coefficient p-values and coefficients
    Rare_MacIntercept_pVals, Rare_MacIntercept_Coeffs = [], []
    Rich_MacIntercept_pVals, Rich_MacIntercept_Coeffs = [], []
    Dom_MacIntercept_pVals, Dom_MacIntercept_Coeffs = [], []
    Even_MacIntercept_pVals, Even_MacIntercept_Coeffs = [], []
    Rare_MicIntercept_pVals, Rare_MicIntercept_Coeffs = [], []
    Rich_MicIntercept_pVals, Rich_MicIntercept_Coeffs = [], []
    Dom_MicIntercept_pVals, Dom_MicIntercept_Coeffs = [], []
    Even_MicIntercept_pVals, Even_MicIntercept_Coeffs = [], []
    Rare_MacSlope_pVals, Rare_MacSlope_Coeffs = [], []
    Rich_MacSlope_pVals, Rich_MacSlope_Coeffs = [], []
    Dom_MacSlope_pVals, Dom_MacSlope_Coeffs = [], []
    Even_MacSlope_pVals, Even_MacSlope_Coeffs = [], []
    Rare_MicSlope_pVals, Rare_MicSlope_Coeffs = [], []
    Rich_MicSlope_pVals, Rich_MicSlope_Coeffs = [], []
    Dom_MicSlope_pVals, Dom_MicSlope_Coeffs = [], []
    Even_MicSlope_pVals, Even_MicSlope_Coeffs = [], []

    # Model R2 and the significance (p-value) of the model F-test
    RareR2List, RarepFList = [], []
    RichR2List, RichpFList = [], []
    DomR2List, DompFList = [], []
    EvenR2List, EvenpFList = [], []

    # ASSUMPTIONS OF LINEAR REGRESSION
    # 1. Error in predictor variables is negligible...presumably yes
    # 2. Variables are measured at the continuous level...yes
    # 3. The relationship is linear (Rainbow and Lagrange multiplier p-values;
    #    the Harvey-Collier lists, e.g. RarepLinListHC, stay commented out as
    #    in the original)
    RarepLinListRainB, RarepLinListLM = [], []
    RichpLinListRainB, RichpLinListLM = [], []
    DompLinListRainB, DompLinListLM = [], []
    EvenpLinListRainB, EvenpLinListLM = [], []
    # 4. There are no significant outliers...need to find tests or measures
    # 5. Independence of observations (no serial correlation in residuals)
    RarepCorrListBG, RarepCorrListF = [], []
    RichpCorrListBG, RichpCorrListF = [], []
    DompCorrListBG, DompCorrListF = [], []
    EvenpCorrListBG, EvenpCorrListF = [], []
    # 6. Homoscedasticity
    RarepHomoHW, RarepHomoHB = [], []
    RichpHomoHW, RichpHomoHB = [], []
    DompHomoHW, DompHomoHB = [], []
    EvenpHomoHW, EvenpHomoHB = [], []
    # 7. Normally distributed residuals: p-values from the omnibus test, the
    #    Jarque-Bera test, the Lilliefors test (Kolmogorov-Smirnov with
    #    estimated mean and variance), and the Anderson-Darling test for a
    #    normal distribution with unknown mean and variance
    RarepNormListOmni, RarepNormListJB, RarepNormListKS, RarepNormListAD = [], [], [], []
    RichpNormListOmni, RichpNormListJB, RichpNormListKS, RichpNormListAD = [], [], [], []
    DompNormListOmni, DompNormListJB, DompNormListKS, DompNormListAD = [], [], [], []
    EvenpNormListOmni, EvenpNormListJB, EvenpNormListKS, EvenpNormListAD = [], [], [], []

    NLIST = []
    for SampSize in SampSizes:
        # Per-iteration accumulators: s-prefixed mirrors of the lists above
        sRare_MacIntercept_pVals, sRare_MacIntercept_Coeffs = [], []
        sRich_MacIntercept_pVals, sRich_MacIntercept_Coeffs = [], []
        sDom_MacIntercept_pVals, sDom_MacIntercept_Coeffs = [], []
        sEven_MacIntercept_pVals, sEven_MacIntercept_Coeffs = [], []
        sRare_MicIntercept_pVals, sRare_MicIntercept_Coeffs = [], []
        sRich_MicIntercept_pVals, sRich_MicIntercept_Coeffs = [], []
        sDom_MicIntercept_pVals, sDom_MicIntercept_Coeffs = [], []
        sEven_MicIntercept_pVals, sEven_MicIntercept_Coeffs = [], []
        sRare_MacSlope_pVals, sRare_MacSlope_Coeffs = [], []
        sRich_MacSlope_pVals, sRich_MacSlope_Coeffs = [], []
        sDom_MacSlope_pVals, sDom_MacSlope_Coeffs = [], []
        sEven_MacSlope_pVals, sEven_MacSlope_Coeffs = [], []
        sRare_MicSlope_pVals, sRare_MicSlope_Coeffs = [], []
        sRich_MicSlope_pVals, sRich_MicSlope_Coeffs = [], []
        sDom_MicSlope_pVals, sDom_MicSlope_Coeffs = [], []
        sEven_MicSlope_pVals, sEven_MicSlope_Coeffs = [], []
        sRareR2List, sRarepFList = [], []
        sRichR2List, sRichpFList = [], []
        sDomR2List, sDompFList = [], []
        sEvenR2List, sEvenpFList = [], []
        # 3. The relationship is linear
        sRarepLinListRainB, sRarepLinListLM = [], []
        sRichpLinListRainB, sRichpLinListLM = [], []
        sDompLinListRainB, sDompLinListLM = [], []
        sEvenpLinListRainB, sEvenpLinListLM = [], []
        # 4. There are no significant outliers...need to find tests or measures
        # 5. Independence of observations (no serial correlation in residuals)
        sRarepCorrListBG, sRarepCorrListF = [], []
        sRichpCorrListBG, sRichpCorrListF = [], []
        sDompCorrListBG, sDompCorrListF = [], []
        sEvenpCorrListBG, sEvenpCorrListF = [], []
        # 6. Homoscedasticity
        sRarepHomoHW, sRarepHomoHB = [], []
        sRichpHomoHW, sRichpHomoHB = [], []
        sDompHomoHW, sDompHomoHB = [], []
        sEvenpHomoHW, sEvenpHomoHB = [], []
        # 7. Normally distributed residuals (omnibus, Jarque-Bera, Lilliefors,
        #    and Anderson-Darling p-values, as above)
        sRarepNormListOmni, sRarepNormListJB, sRarepNormListKS, sRarepNormListAD = [], [], [], []
        sRichpNormListOmni, sRichpNormListJB, sRichpNormListKS, sRichpNormListAD = [], [], [], []
        sDompNormListOmni, sDompNormListJB, sDompNormListKS, sDompNormListAD = [], [], [], []
        sEvenpNormListOmni, sEvenpNormListJB, sEvenpNormListKS, sEvenpNormListAD = [], [], [], []

        for iteration in range(Iterations):
            Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [], [], [], [], [], [], []
            klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [], [], [], [], [], [], []
            NmaxList, rareSkews, KindList = [], [], []
            NSlist = []
            ct = 0
            datasets = []
            # all microbe data is MGRAST
            GoodNames = ['EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN',
                         'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI',
                         'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA']

            mlist = ['micro', 'macro']
            for m in mlist:
                for name in os.listdir(mydir + 'data/' + m):
                    if name not in GoodNames:
                        continue
                    path = mydir + 'data/' + m + '/' + name + '/' + name + '-SADMetricData.txt'
                    num_lines = sum(1 for line in open(path))
                    datasets.append([name, m, num_lines])

            numMac = 0
            numMic = 0
            radDATA = []
            for d in datasets:
                name, kind, numlines = d
                lines = np.random.choice(range(1, numlines + 1), SampSize, replace=True)
                path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt'
                for line in lines:
                    data = linecache.getline(path, line)
                    radDATA.append(data)
                #print(name, kind, numlines, len(radDATA))

            for data in radDATA:
                data = data.split()
                if len(data) == 0:
                    print('no data')
                    continue
                (name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou,
                 EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace,
                 jknife1, jknife2, margalef, menhinick, preston_a, preston_S) = data
                N = float(N)
                S = float(S)
                Nlist.append(float(np.log(N)))
                Slist.append(float(np.log(S)))
                NSlist.append(float(np.log(N / S)))
                Evarlist.append(float(np.log(float(Evar))))
                ESimplist.append(float(np.log(float(ESimp))))
                KindList.append(kind)
                BPlist.append(float(BP))
                NmaxList.append(float(np.log(float(BP) * float(N))))
                EHeiplist.append(float(EHeip))

                # Log-modulo transformation of skewness
                skew = float(skew)
                sign = 1
                if skew < 0:
                    sign = -1
                lms = np.log(np.abs(skew) + 1)
                lms = lms * sign
                #if lms > 3: print(name, N, S)
                rareSkews.append(float(lms))

                if kind == 'macro':
                    numMac += 1
                elif kind == 'micro':
                    numMic += 1
                ct += 1
            #print('Sample Size:', SampSize, ' Mic:', numMic, 'Mac:', numMac)

            # Multiple regression for Rarity: fit the dummy-variable regression model
            d = pd.DataFrame({'N': list(Nlist)})
            d['Rarity'] = list(rareSkews)
            d['Kind'] = list(KindList)
            RarityResults = smf.ols('Rarity ~ N * Kind', d).fit()
            #print(RarityResults.summary(), '\n')

            # Multiple regression for Richness (the original comment mislabeled this "Rarity")
            d = pd.DataFrame({'N': list(Nlist)})
            d['Richness'] = list(Slist)
            d['Kind'] = list(KindList)
            RichnessResults = smf.ols('Richness ~ N * Kind', d).fit()
            #print(RichnessResults.summary(), '\n')

            # Multiple regression for Dominance
            d = pd.DataFrame({'N': list(Nlist)})
            d['Dominance'] = list(NmaxList)
            d['Kind'] = list(KindList)
            DomResults = smf.ols('Dominance ~ N * Kind', d).fit()
            #print(DomResults.summary(), '\n')

            # Multiple regression for Evenness
            d = pd.DataFrame({'N': list(Nlist)})
            d['Evenness'] = list(ESimplist)
            d['Kind'] = list(KindList)
            EvenResults = smf.ols('Evenness ~ N * Kind', d).fit()
            #print(EvenResults.summary(), '\n')

            # Residuals of each model
            RareResids = RarityResults.resid
            RichResids = RichnessResults.resid
            DomResids = DomResults.resid
            EvenResids = EvenResults.resid

            # MODEL RESULTS/FIT: F-test p-value and coefficient of determination
            RareFpval = RarityResults.f_pvalue
            Rarer2 = RarityResults.rsquared
            #Adj_r2 = RarityResults.rsquared_adj  # adjusted
            RichFpval = RichnessResults.f_pvalue
            Richr2 = RichnessResults.rsquared
            DomFpval = DomResults.f_pvalue
            Domr2 = DomResults.rsquared
            EvenFpval = EvenResults.f_pvalue
            Evenr2 = EvenResults.rsquared

            # MODEL PARAMETERS and p-values
            Rareparams = RarityResults.params.tolist()
            Rarepvals = RarityResults.pvalues.tolist()
            Richparams = RichnessResults.params.tolist()
            Richpvals = RichnessResults.pvalues.tolist()
            Domparams = DomResults.params.tolist()
            Dompvals = DomResults.pvalues.tolist()
            Evenparams = EvenResults.params.tolist()
            Evenpvals = EvenResults.pvalues.tolist()

            sRare_MacIntercept_pVals.append(Rarepvals[0])
            sRare_MacIntercept_Coeffs.append(Rareparams[0])
            # The original appended the Rarity values here; the Richness
            # values are clearly intended
            sRich_MacIntercept_pVals.append(Richpvals[0])
            sRich_MacIntercept_Coeffs.append(Richparams[0])
            sDom_MacIntercept_pVals.append(Dompvals[0])
            sDom_MacIntercept_Coeffs.append(Domparams[0])
            sEven_MacIntercept_pVals.append(Evenpvals[0])
            sEven_MacIntercept_Coeffs.append(Evenparams[0])

            # The original guarded each Mic append with "if p > 0.05 ... else ..."
            # whose two branches were identical; collapsed to plain appends
            sRare_MicIntercept_pVals.append(Rarepvals[1])
            sRare_MicIntercept_Coeffs.append(Rareparams[1])
            sRich_MicIntercept_pVals.append(Richpvals[1])
            sRich_MicIntercept_Coeffs.append(Richparams[1])
            sDom_MicIntercept_pVals.append(Dompvals[1])
            sDom_MicIntercept_Coeffs.append(Domparams[1])
            sEven_MicIntercept_pVals.append(Evenpvals[1])
            sEven_MicIntercept_Coeffs.append(Evenparams[1])

            sRare_MacSlope_pVals.append(Rarepvals[2])
            sRare_MacSlope_Coeffs.append(Rareparams[2])
            sRich_MacSlope_pVals.append(Richpvals[2])
            sRich_MacSlope_Coeffs.append(Richparams[2])
            sDom_MacSlope_pVals.append(Dompvals[2])
            sDom_MacSlope_Coeffs.append(Domparams[2])
            sEven_MacSlope_pVals.append(Evenpvals[2])
            sEven_MacSlope_Coeffs.append(Evenparams[2])

            sRare_MicSlope_pVals.append(Rarepvals[3])
            sRare_MicSlope_Coeffs.append(Rareparams[3])
            sRich_MicSlope_pVals.append(Richpvals[3])
            sRich_MicSlope_Coeffs.append(Richparams[3])
            sDom_MicSlope_pVals.append(Dompvals[3])
            sDom_MicSlope_Coeffs.append(Domparams[3])
            sEven_MicSlope_pVals.append(Evenpvals[3])
            sEven_MicSlope_Coeffs.append(Evenparams[3])

            sRareR2List.append(Rarer2)
            sRarepFList.append(RareFpval)
            sRichR2List.append(Richr2)
            sRichpFList.append(RichFpval)
            sDomR2List.append(Domr2)
            sDompFList.append(DomFpval)
            sEvenR2List.append(Evenr2)
            sEvenpFList.append(EvenFpval)

            # TESTS OF LINEAR REGRESSION ASSUMPTIONS
            # Error in predictor variables is negligible...presumably yes
            # Variables are measured at the continuous level...definitely yes

            # TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY MODELED AS LINEAR
            # Harvey-Collier test (null: the regression is correctly modeled
            # as linear), kept commented out as in the original:
            #HC = smd.linear_harvey_collier(RarityResults)
            #sRarepLinListHC.append(HC)
            #HC = smd.linear_harvey_collier(DomResults)
            #sDompLinListHC.append(HC)
            #HC = smd.linear_harvey_collier(EvenResults)
            #sEvenpLinListHC.append(HC)

            # Rainbow test for linearity; the null hypothesis is that the
            # regression is correctly modeled as linear
            RB = smd.linear_rainbow(RarityResults)
            sRarepLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(RichnessResults)
            sRichpLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(DomResults)
            sDompLinListRainB.append(RB[1])
            RB = smd.linear_rainbow(EvenResults)
            sEvenpLinListRainB.append(RB[1])

            # Lagrange multiplier test for linearity
            LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog)
            sRarepLinListLM.append(LM[1])
            LM = smd.linear_lm(RichnessResults.resid, RichnessResults.model.exog)
            sRichpLinListLM.append(LM[1])
            LM = smd.linear_lm(DomResults.resid, DomResults.model.exog)
            sDompLinListLM.append(LM[1])
            LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog)
            sEvenpLinListLM.append(LM[1])

            # INDEPENDENCE OF OBSERVATIONS (no serial correlation in residuals)
            # Breusch-Godfrey Lagrange multiplier test for residual autocorrelation;
            # returns (LM statistic, LM p-value, F statistic, F p-value).
            # (acorr_ljungbox on the residuals was a commented-out alternative;
            # older statsmodels spelled this function acorr_breush_godfrey.)
            BGtest = smd.acorr_breusch_godfrey(RarityResults, nlags=None, store=False)
            sRarepCorrListBG.append(BGtest[1])
            sRarepCorrListF.append(BGtest[3])
            BGtest = smd.acorr_breusch_godfrey(RichnessResults, nlags=None, store=False)
            sRichpCorrListBG.append(BGtest[1])
            sRichpCorrListF.append(BGtest[3])
            BGtest = smd.acorr_breusch_godfrey(DomResults, nlags=None, store=False)
            sDompCorrListBG.append(BGtest[1])
            sDompCorrListF.append(BGtest[3])
            BGtest = smd.acorr_breusch_godfrey(EvenResults, nlags=None, store=False)
            sEvenpCorrListBG.append(BGtest[1])
            sEvenpCorrListF.append(BGtest[3])

            # There are no significant outliers...need tests or measures/metrics

            # HOMOSCEDASTICITY
            # These tests return:
            # 1. Lagrange multiplier statistic,
            # 2. p-value of the Lagrange multiplier test,
            # 3. F-statistic of the hypothesis that the error variance does not depend on x,
            # 4. p-value for the F-statistic
            HW = sms.het_white(RareResids, RarityResults.model.exog)
            sRarepHomoHW.append(HW[3])
            HW = sms.het_white(RichResids, RichnessResults.model.exog)
            sRichpHomoHW.append(HW[3])
            HW = sms.het_white(DomResids, DomResults.model.exog)
            sDompHomoHW.append(HW[3])
            HW = sms.het_white(EvenResids, EvenResults.model.exog)
            sEvenpHomoHW.append(HW[3])
            # (older statsmodels spelled this function het_breushpagan)
            HB = sms.het_breuschpagan(RareResids, RarityResults.model.exog)
            sRarepHomoHB.append(HB[3])
            HB = sms.het_breuschpagan(RichResids, RichnessResults.model.exog)
            sRichpHomoHB.append(HB[3])
            HB = sms.het_breuschpagan(DomResids, DomResults.model.exog)
            sDompHomoHB.append(HB[3])
            HB = sms.het_breuschpagan(EvenResids, EvenResults.model.exog)
            sEvenpHomoHB.append(HB[3])
            # NORMALITY OF ERROR TERMS
            # Omnibus test for normality
            O = sms.omni_normtest(RareResids)
            sRarepNormListOmni.append(O[1])
            O = sms.omni_normtest(RichResids)
            sRichpNormListOmni.append(O[1])
            O = sms.omni_normtest(DomResids)
            sDompNormListOmni.append(O[1])
            O = sms.omni_normtest(EvenResids)
            sEvenpNormListOmni.append(O[1])

            # Jarque-Bera test for normality (also yields residual skewness and kurtosis)
            JB = sms.jarque_bera(RareResids)
            sRarepNormListJB.append(JB[1])
            JB = sms.jarque_bera(RichResids)
            sRichpNormListJB.append(JB[1])
            JB = sms.jarque_bera(DomResids)
            sDompNormListJB.append(JB[1])
            JB = sms.jarque_bera(EvenResids)
            sEvenpNormListJB.append(JB[1])

            # Lilliefors test for normality: Kolmogorov-Smirnov with estimated mean and variance
            KS = smd.kstest_normal(RareResids)
            sRarepNormListKS.append(KS[1])
            KS = smd.kstest_normal(RichResids)
            sRichpNormListKS.append(KS[1])
            KS = smd.kstest_normal(DomResids)
            sDompNormListKS.append(KS[1])
            KS = smd.kstest_normal(EvenResids)
            sEvenpNormListKS.append(KS[1])

            # Anderson-Darling test for a normal distribution with unknown mean and variance
            AD = smd.normal_ad(RareResids)
            sRarepNormListAD.append(AD[1])
            AD = smd.normal_ad(RichResids)
            sRichpNormListAD.append(AD[1])
            AD = smd.normal_ad(DomResids)
            sDompNormListAD.append(AD[1])
            AD = smd.normal_ad(EvenResids)
            sEvenpNormListAD.append(AD[1])

            print('Sample size:', SampSize, 'iteration:', iteration)

        NLIST.append(SampSize)

        # Per-sample-size means of the iteration-level coefficient p-values and coefficients
        Rare_MacIntercept_pVals.append(np.mean(sRare_MacIntercept_pVals))
        Rare_MacIntercept_Coeffs.append(np.mean(sRare_MacIntercept_Coeffs))
        Rich_MacIntercept_pVals.append(np.mean(sRich_MacIntercept_pVals))
        Rich_MacIntercept_Coeffs.append(np.mean(sRich_MacIntercept_Coeffs))
        Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals))
        Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs))
        Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals))
        Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs))
        Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals))
        Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs))
        Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals))
        Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs))
        Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals))
        Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs))
        Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals))
        Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs))
        Rare_MacSlope_pVals.append(np.mean(sRare_MacSlope_pVals))
        Rare_MacSlope_Coeffs.append(np.mean(sRare_MacSlope_Coeffs))
        Rich_MacSlope_pVals.append(np.mean(sRich_MacSlope_pVals))
        Rich_MacSlope_Coeffs.append(np.mean(sRich_MacSlope_Coeffs))
        Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals))
        Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs))
        Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals))
        Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs))
        Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals))
        Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs))
        Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals))
        Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs))
        Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals))
        Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs))
        Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals))
        Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs))

        RareR2List.append(np.mean(sRareR2List))
        RarepFList.append(np.mean(sRarepFList))
        RichR2List.append(np.mean(sRichR2List))
        RichpFList.append(np.mean(sRichpFList))
        DomR2List.append(np.mean(sDomR2List))
        DompFList.append(np.mean(sDompFList))
        EvenR2List.append(np.mean(sEvenR2List))
        EvenpFList.append(np.mean(sEvenpFList))

        # ASSUMPTIONS OF LINEAR REGRESSION
        # 1. Error in predictor variables is negligible...presumably yes
        # 2. Variables are measured at the continuous level...yes
        # 3. The relationship is linear
        RarepLinListRainB.append(np.mean(sRarepLinListRainB))
        RarepLinListLM.append(np.mean(sRarepLinListLM))
        RichpLinListRainB.append(np.mean(sRichpLinListRainB))
        RichpLinListLM.append(np.mean(sRichpLinListLM))
        DompLinListRainB.append(np.mean(sDompLinListRainB))
        DompLinListLM.append(np.mean(sDompLinListLM))
        EvenpLinListRainB.append(np.mean(sEvenpLinListRainB))
        EvenpLinListLM.append(np.mean(sEvenpLinListLM))
        # 4. There are no significant outliers...need to find tests or measures
        # 5. Independence of observations (no serial correlation in residuals)
        RarepCorrListBG.append(np.mean(sRarepCorrListBG))
        RarepCorrListF.append(np.mean(sRarepCorrListF))
        RichpCorrListBG.append(np.mean(sRichpCorrListBG))
        RichpCorrListF.append(np.mean(sRichpCorrListF))
        DompCorrListBG.append(np.mean(sDompCorrListBG))
        DompCorrListF.append(np.mean(sDompCorrListF))
        EvenpCorrListBG.append(np.mean(sEvenpCorrListBG))
        EvenpCorrListF.append(np.mean(sEvenpCorrListF))
        # 6. Homoscedasticity
        RarepHomoHW.append(np.mean(sRarepHomoHW))
        RarepHomoHB.append(np.mean(sRarepHomoHB))
        RichpHomoHW.append(np.mean(sRichpHomoHW))
        RichpHomoHB.append(np.mean(sRichpHomoHB))
        DompHomoHW.append(np.mean(sDompHomoHW))
        DompHomoHB.append(np.mean(sDompHomoHB))
        EvenpHomoHW.append(np.mean(sEvenpHomoHW))
        EvenpHomoHB.append(np.mean(sEvenpHomoHB))
        # 7. Normally distributed residuals (errors)
        RarepNormListOmni.append(np.mean(sRarepNormListOmni))
        RarepNormListJB.append(np.mean(sRarepNormListJB))
        RarepNormListKS.append(np.mean(sRarepNormListKS))
        RarepNormListAD.append(np.mean(sRarepNormListAD))
        RichpNormListOmni.append(np.mean(sRichpNormListOmni))
        RichpNormListJB.append(np.mean(sRichpNormListJB))
        RichpNormListKS.append(np.mean(sRichpNormListKS))
        RichpNormListAD.append(np.mean(sRichpNormListAD))
        DompNormListOmni.append(np.mean(sDompNormListOmni))
        DompNormListJB.append(np.mean(sDompNormListJB))
        DompNormListKS.append(np.mean(sDompNormListKS))
        DompNormListAD.append(np.mean(sDompNormListAD))
        EvenpNormListOmni.append(np.mean(sEvenpNormListOmni))
        EvenpNormListJB.append(np.mean(sEvenpNormListJB))
        EvenpNormListKS.append(np.mean(sEvenpNormListKS))
        EvenpNormListAD.append(np.mean(sEvenpNormListAD))

    # Rarity R2 vs. Sample Size
    fig.add_subplot(4, 3, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0, 1)
    plt.xscale('log')
    plt.plot(NLIST, RareR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    # Rarity Coeffs vs. Sample Size
    fig.add_subplot(4, 3, 2)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.16)
    plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rare_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    # Rarity p-vals vs. Sample Size
    fig.add_subplot(4, 3, 3)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.ylim(0.0, 0.6)
    plt.xscale('log')
    # 3. The relationship is linear
    #plt.plot(NLIST, RarepLinListRainB, c='m')
    plt.plot(NLIST, RarepLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, RarepCorrListBG, c='c')
    plt.plot(NLIST, RarepCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, RarepHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, RarepHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, RarepNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, RarepNormListJB, c='Lime', ls='-')
    #plt.plot(NLIST, RarepNormListKS, c='Lime', ls='--', lw=3)
    #plt.plot(NLIST, RarepNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    # Dominance R2 vs. Sample Size
    fig.add_subplot(4, 3, 4)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    # Dominance Coeffs vs. Sample Size
    fig.add_subplot(4, 3, 5)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Dom_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    # Dominance p-vals vs. Sample Size
    fig.add_subplot(4, 3, 6)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    #plt.yscale('log')
    plt.ylim(0, 0.6)
    # 3. The relationship is linear
    #plt.plot(NLIST, DompLinListRainB, c='m')
    plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, DompCorrListBG, c='c')
    plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, DompHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, DompNormListJB, c='Lime', ls='-')
    #plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3)
    #plt.plot(NLIST, DompNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    # Evenness R2 vs. Sample Size
    fig.add_subplot(4, 3, 7)
    plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    # Evenness Coeffs vs. Sample Size
    fig.add_subplot(4, 3, 8)
    plt.ylim(-0.25, 0.0)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Even_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    # Evenness p-vals vs. Sample Size
    fig.add_subplot(4, 3, 9)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    plt.ylim(0.0, 0.3)
    # 3. The relationship is linear
    #plt.plot(NLIST, EvenpLinListRainB, c='m')
    plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, EvenpCorrListBG, c='c')
    plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, EvenpHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-')
    #plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3)
    #plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    # Richness R2 vs. Sample Size (the original comment mislabeled this "Dominance")
    fig.add_subplot(4, 3, 10)
    plt.xscale('log')
    plt.ylim(0, 1)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$')
    plt.ylabel(r'$R^2$', fontsize=14)
    plt.xlabel('Sample size', fontsize=14)
    plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16)
    leg = plt.legend(loc=4, prop={'size': 14})
    leg.draw_frame(False)

    # Richness Coeffs vs. Sample Size
    fig.add_subplot(4, 3, 11)
    plt.ylim(-0.2, 1.2)
    plt.xscale('log')
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe')
    plt.plot(NLIST, Rich_MacSlope_Coeffs, c='b', lw=2, label='Macrobe')
    #plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction')
    plt.ylabel('Coefficient')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=10, prop={'size': 8})
    leg.draw_frame(False)

    # Richness p-vals vs. Sample Size
    fig.add_subplot(4, 3, 12)
    plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10)
    plt.xscale('log')
    # 3. The relationship is linear
    #plt.plot(NLIST, RichpLinListRainB, c='m')
    plt.plot(NLIST, RichpLinListLM, c='m', ls='-', label='linearity')
    # 5. Independence of observations (no serial correlation in residuals)
    #plt.plot(NLIST, RichpCorrListBG, c='c')
    # The original plotted EvenpCorrListF here; the Richness series is intended
    plt.plot(NLIST, RichpCorrListF, c='c', ls='-', label='autocorrelation')
    # 6. Homoscedasticity
    plt.plot(NLIST, RichpHomoHW, c='orange', ls='-', label='homoscedasticity')
    #plt.plot(NLIST, RichpHomoHB, c='r', ls='-')
    # 7. Normally distributed residuals (errors)
    plt.plot(NLIST, RichpNormListOmni, c='Lime', ls='-', label='normality')
    #plt.plot(NLIST, RichpNormListJB, c='Lime', ls='-')
    #plt.plot(NLIST, RichpNormListKS, c='Lime', ls='--', lw=3)
    #plt.plot(NLIST, RichpNormListAD, c='Lime', ls='--')
    plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--')
    plt.ylabel('p-value')
    plt.xlabel('Sample size', fontsize=14)
    leg = plt.legend(loc=1, prop={'size': 8})
    leg.draw_frame(False)

    #plt.tick_params(axis='both', which='major', labelsize=fs-3)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.savefig(mydir + 'figs/appendix/SampleSize/SampleSizeEffects.png',
                dpi=600, bbox_inches="tight")
    #plt.close()
    #plt.show()
    return
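The assumption-testing block above repeats the same handful of statsmodels calls for each of the four responses; a compact helper makes the pattern explicit. A minimal sketch, with the function name and return layout chosen for illustration:

import statsmodels.stats.api as sms
import statsmodels.stats.diagnostic as smd

def assumption_pvalues(results):
    """Collect assumption-test p-values for a fitted OLS results object."""
    resid, exog = results.resid, results.model.exog
    return {
        'linearity_rainbow': smd.linear_rainbow(results)[1],
        'autocorr_breusch_godfrey': smd.acorr_breusch_godfrey(results)[1],
        'het_white': sms.het_white(resid, exog)[3],
        'norm_omnibus': sms.omni_normtest(resid)[1],
        'norm_anderson_darling': smd.normal_ad(resid)[1],
    }

# Usage: pvals = assumption_pvalues(RarityResults)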