def test_normality(self):
    """Check Lilliefors and Anderson-Darling results against R's nortest.

    The expected values were generated in R; the R commands that produced
    each reference dictionary are kept in the comments next to it.
    """
    fit = self.res

    # Lilliefors (Kolmogorov-Smirnov) normality test
    # > library(nortest)
    # > lt = lillie.test(residuals(fm))
    # > mkhtest(lt, "lillifors", "-")
    r_lillie_resid = {'statistic': 0.0723390908786589,
                      'pvalue': 0.01204113540102896,
                      'parameters': (), 'distr': '-'}

    # > lt = lillie.test(residuals(fm)**2)
    # > mkhtest(lt, "lillifors", "-")
    r_lillie_squared = {'statistic': 0.301311621898024,
                        'pvalue': 1.004305736618051e-51,
                        'parameters': (), 'distr': '-'}

    # > lt = lillie.test(residuals(fm)[1:20])
    # > mkhtest(lt, "lillifors", "-")
    r_lillie_head = {'statistic': 0.1333956004203103,
                     'pvalue': 0.4618672180799566,
                     'parameters': (), 'distr': '-'}

    py_lillie_resid = smsdia.lillifors(fit.resid)
    py_lillie_squared = smsdia.lillifors(fit.resid**2)
    py_lillie_head = smsdia.lillifors(fit.resid[:20])

    compare_t_est(py_lillie_resid, r_lillie_resid, decimal=(15, 14))
    # The p-value is very small here, so it is also checked on a relative
    # scale.
    compare_t_est(py_lillie_squared, r_lillie_squared, decimal=(15, 15))
    assert_approx_equal(py_lillie_squared[1], r_lillie_squared['pvalue'],
                        significant=10)
    # R uses a different approximation for the p-value in this last case.
    compare_t_est(py_lillie_head, r_lillie_head, decimal=(15, 1))

    # Anderson-Darling normality test
    # > ad = ad.test(residuals(fm))
    # > mkhtest(ad, "ad3", "-")
    r_ad_resid = {'statistic': 1.602209621518313,
                  'pvalue': 0.0003937979149362316,
                  'parameters': (), 'distr': '-'}

    # > ad = ad.test(residuals(fm)**2)
    # > mkhtest(ad, "ad3", "-")
    # Kept for reference only; the squared case is checked via isinf below.
    r_ad_squared = {'statistic': np.inf, 'pvalue': np.nan,
                    'parameters': (), 'distr': '-'}

    # > ad = ad.test(residuals(fm)[1:20])
    # > mkhtest(ad, "ad3", "-")
    r_ad_head = {'statistic': 0.3017073732210775,
                 'pvalue': 0.5443499281265933,
                 'parameters': (), 'distr': '-'}

    py_ad_resid = smsdia.normal_ad(fit.resid)
    compare_t_est(py_ad_resid, r_ad_resid, decimal=(11, 13))

    py_ad_squared = smsdia.normal_ad(fit.resid**2)
    assert_(np.isinf(py_ad_squared[0]))

    py_ad_head = smsdia.normal_ad(fit.resid[:20])
    compare_t_est(py_ad_head, r_ad_head, decimal=(11, 12))
def check_normality():
    """Run several normality tests on seeded random normal data.

    Draws 1000 values from a normal distribution (mean 0, sd 3), shows a
    histogram and a probability plot, and prints the p-values of four
    normality tests for the full sample and for its first 100 values.

    Returns
    -------
    The Kolmogorov-Smirnov p-value obtained for the full sample.
    """
    # Parameters of the simulated data
    sample_size = 1000
    mu = 0
    sigma = 3

    # Fixed seed so that the generated values are reproducible
    np.random.seed(1234)

    # Generate and show the random data
    full_sample = stats.norm.rvs(mu, sigma, size=sample_size)
    head_sample = full_sample[:100]
    plt.hist(full_sample)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    stats.probplot(full_sample, plot=plt)
    plt.show()

    def standardize(values):
        # z-scores using the sample standard deviation (ddof=1)
        return (values - np.mean(values)) / np.std(values, ddof=1)

    # Label -> callable returning (statistic, p-value), in reporting order:
    #  * Omnibus: D'Agostino-Pearson test combining skew and kurtosis
    #  * Shapiro-Wilk
    #  * Lilliefors
    #  * original Kolmogorov-Smirnov test, applied to standardized data
    named_tests = (
        ('Omnibus', stats.normaltest),
        ('Shapiro-Wilk', stats.shapiro),
        ('Lilliefors', lillifors),
        ('Kolmogorov-Smirnov',
         lambda values: stats.kstest(standardize(values), 'norm')),
    )

    p_all = pd.Series()
    p_head = pd.Series()
    for label, run_test in named_tests:
        _, p_all[label] = run_test(full_sample)
        _, p_head[label] = run_test(head_sample)

    print('p-values for all {0} data points: ----------------'.format(
        len(full_sample)))
    print(p_all)
    print('p-values for the first 100 data points: ----------------')
    print(p_head)

    if p_all['Omnibus'] > 0.05:
        print('Data are normally distributed')
    # --- >>> STOP stats <<< ---

    return p_all['Kolmogorov-Smirnov']
def check_normality():
    """Demonstrate graphical and numerical normality checks.

    Seeded normal data (n=1000, mean 0, sd 3) are plotted as a histogram and
    a probability plot; then the p-values of the omnibus, Shapiro-Wilk,
    Lilliefors and Kolmogorov-Smirnov tests are printed for all data and for
    the first 100 points.

    Returns
    -------
    The Kolmogorov-Smirnov p-value for the complete data set.
    """
    # Simulation settings; the seed makes the values reproducible
    n_points, centre, spread = 1000, 0, 3
    np.random.seed(1234)

    # Generate and show random data
    values = stats.norm.rvs(centre, spread, size=n_points)
    first_hundred = values[:100]
    plt.hist(values)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: points lying on a line indicate approximately
    # normal data
    stats.probplot(values, plot=plt)
    plt.show()

    def p_of(test_result):
        # Every test used here returns (statistic, p-value)
        _, p_value = test_result
        return p_value

    def ks_against_standard_normal(sample):
        # Standardize with the sample sd (ddof=1) before the plain KS test
        z_scores = (sample - np.mean(sample)) / np.std(sample, ddof=1)
        return stats.kstest(z_scores, 'norm')

    p_full = pd.Series()
    p_few = pd.Series()

    # D'Agostino-Pearson omnibus test (combines skew and kurtosis)
    p_full['Omnibus'] = p_of(stats.normaltest(values))
    p_few['Omnibus'] = p_of(stats.normaltest(first_hundred))

    # Shapiro-Wilk test
    p_full['Shapiro-Wilk'] = p_of(stats.shapiro(values))
    p_few['Shapiro-Wilk'] = p_of(stats.shapiro(first_hundred))

    # Lilliefors test
    p_full['Lilliefors'] = p_of(lillifors(values))
    p_few['Lilliefors'] = p_of(lillifors(first_hundred))

    # Original Kolmogorov-Smirnov test
    p_full['Kolmogorov-Smirnov'] = p_of(ks_against_standard_normal(values))
    p_few['Kolmogorov-Smirnov'] = p_of(
        ks_against_standard_normal(first_hundred))

    header_all = 'p-values for all {0} data points: ----------------'
    print(header_all.format(len(values)))
    print(p_full)
    print('p-values for the first 100 data points: ----------------')
    print(p_few)

    omnibus_passes = p_full['Omnibus'] > 0.05
    if omnibus_passes:
        print('Data are normally distributed')
    # --- >>> STOP stats <<< ---

    return p_full['Kolmogorov-Smirnov']
def test_normality(self):
    """Verify normality diagnostics against reference output from R.

    Lilliefors and Anderson-Darling statistics and p-values are compared
    with values produced by R's ``nortest`` package; the generating R
    commands are reproduced in the comments.
    """
    results = self.res

    # Lilliefors (Kolmogorov-Smirnov) normality test
    # > library(nortest)
    # > lt = lillie.test(residuals(fm))
    # > mkhtest(lt, "lillifors", "-")
    expected_lf_all = {'statistic': 0.0723390908786589,
                       'pvalue': 0.01204113540102896,
                       'parameters': (), 'distr': '-'}

    # > lt = lillie.test(residuals(fm)**2)
    # > mkhtest(lt, "lillifors", "-")
    expected_lf_sq = {'statistic': 0.301311621898024,
                      'pvalue': 1.004305736618051e-51,
                      'parameters': (), 'distr': '-'}

    # > lt = lillie.test(residuals(fm)[1:20])
    # > mkhtest(lt, "lillifors", "-")
    expected_lf_first20 = {'statistic': 0.1333956004203103,
                           'pvalue': 0.4618672180799566,
                           'parameters': (), 'distr': '-'}

    actual_lf_all = smsdia.lillifors(results.resid)
    actual_lf_sq = smsdia.lillifors(results.resid**2)
    actual_lf_first20 = smsdia.lillifors(results.resid[:20])

    compare_t_est(actual_lf_all, expected_lf_all, decimal=(15, 14))
    compare_t_est(actual_lf_sq, expected_lf_sq, decimal=(15, 15))
    # Very small p-value: compare significant digits as well.
    assert_approx_equal(actual_lf_sq[1], expected_lf_sq['pvalue'],
                        significant=10)
    # R uses a different p-value approximation in the last case, hence the
    # loose tolerance on the p-value.
    compare_t_est(actual_lf_first20, expected_lf_first20, decimal=(15, 1))

    # Anderson-Darling normality test
    # > ad = ad.test(residuals(fm))
    # > mkhtest(ad, "ad3", "-")
    expected_ad_all = {'statistic': 1.602209621518313,
                       'pvalue': 0.0003937979149362316,
                       'parameters': (), 'distr': '-'}

    # > ad = ad.test(residuals(fm)**2)
    # > mkhtest(ad, "ad3", "-")
    # Reference only; this case is asserted through np.isinf below.
    expected_ad_sq = {'statistic': np.inf, 'pvalue': np.nan,
                      'parameters': (), 'distr': '-'}

    # > ad = ad.test(residuals(fm)[1:20])
    # > mkhtest(ad, "ad3", "-")
    expected_ad_first20 = {'statistic': 0.3017073732210775,
                           'pvalue': 0.5443499281265933,
                           'parameters': (), 'distr': '-'}

    actual_ad_all = smsdia.normal_ad(results.resid)
    compare_t_est(actual_ad_all, expected_ad_all, decimal=(12, 15))

    actual_ad_sq = smsdia.normal_ad(results.resid**2)
    assert_(np.isinf(actual_ad_sq[0]))

    actual_ad_first20 = smsdia.normal_ad(results.resid[:20])
    compare_t_est(actual_ad_first20, expected_ad_first20, decimal=(13, 13))
def check_normality(testData, printflag=False):
    """Run a sample-size dependent normality test.

    The test is selected from ``len(testData)``:

    * 20 < n < 50   -> ``stats.normaltest``
    * n <= 20       -> ``stats.shapiro``
    * 50 <= n <= 300 -> ``lillifors``
    * n > 300       -> ``stats.kstest`` against a normal distribution whose
      location and scale are fitted to the sample

    Parameters
    ----------
    testData : sequence of numbers
        The sample to test.
    printflag : bool, optional
        When true, print which test is used.

    Returns
    -------
    list
        ``[statistic, p_value]`` of the selected test.
    """
    sample_size = len(testData)

    # 20 < sample size < 50: D'Agostino-Pearson normaltest
    if 20 < sample_size < 50:
        if printflag:
            print('use normal test')
        outcome = stats.normaltest(testData)
        return [outcome[0], outcome[1]]

    # Remaining samples below 50 (i.e. n <= 20): Shapiro-Wilk
    if sample_size < 50:
        if printflag:
            print('use shapiro test')
        outcome = stats.shapiro(testData)
        return [outcome[0], outcome[1]]

    if sample_size <= 300:
        if printflag:
            print('use lilliefors test')
        outcome = lillifors(testData)
        return [outcome[0], outcome[1]]

    if printflag:
        print('use kstest test')
    # kstest(..., 'norm') alone compares against N(0, 1) and would reject any
    # normal sample with a different mean or spread, so the fitted location
    # and scale are passed explicitly.  Because the parameters are estimated
    # from the same data, the resulting p-value is conservative.
    loc, scale = stats.norm.fit(testData)
    outcome = stats.kstest(testData, 'norm', args=(loc, scale))
    return [outcome[0], outcome[1]]
def normalityTest(data, pval_cutoff):
    """Test a series of numbers for normality.

    The test is chosen from the sample size: Shapiro-Wilk for fewer than 50
    values, Lilliefors for 50 to 300 values, and Kolmogorov-Smirnov (against
    a normal distribution fitted to the sample) for more than 300 values.

    Parameters
    ----------
    data : sequence of numbers
        The sample to test.
    pval_cutoff : float
        Significance level; a p-value below it means "not normal".

    Returns
    -------
    tuple
        ``(is_normal, p_value, test_name)`` where ``is_normal`` is ``True``
        when the p-value is not below ``pval_cutoff``.
    """
    size = len(data)

    if size < 50:
        # sample size < 50: Shapiro-Wilk
        test_name = "scipy.stats.shapiro"
        p_value = stats.shapiro(data)[1]
    elif size <= 300:
        # 50 <= sample size <= 300: Lilliefors
        test_name = "statsmodels.stats.diagnostic.lillifors"
        p_value = lillifors(data)[1]
    else:
        # sample size > 300: Kolmogorov-Smirnov.  The reference distribution
        # must carry the sample's own location and scale; plain 'norm' is
        # N(0, 1) and rejects normal data that are not standardized.
        test_name = "Kolmogorov-Smirnov"
        loc, scale = stats.norm.fit(data)
        p_value = stats.kstest(data, 'norm', args=(loc, scale))[1]

    # Honour the caller's cutoff (previously a hard-coded 0.05 was used).
    is_normal = not (p_value < pval_cutoff)
    return (is_normal, p_value, test_name)
def check_normality(testData):
    """Return whether ``testData`` passes a normality test at the 5% level.

    The test depends on the sample size: ``stats.normaltest`` for
    20 < n < 50, ``stats.shapiro`` for n <= 20, ``lillifors`` for
    50 <= n <= 300 and ``stats.kstest`` (against a normal distribution
    fitted to the sample) for n > 300.

    Parameters
    ----------
    testData : sequence of numbers
        The sample to test.

    Returns
    -------
    bool
        ``False`` when the p-value is below 0.05, ``True`` otherwise.
    """
    sample_size = len(testData)

    if 20 < sample_size < 50:
        # 20 < sample size < 50: normaltest
        p_value = stats.normaltest(testData)[1]
    elif sample_size < 50:
        # remaining samples below 50 (n <= 20): Shapiro-Wilk
        p_value = stats.shapiro(testData)[1]
    elif sample_size <= 300:
        p_value = lillifors(testData)[1]
    else:
        # Compare against a normal with the sample's own location and scale;
        # bare 'norm' means N(0, 1) and rejects unstandardized normal data.
        loc, scale = stats.norm.fit(testData)
        p_value = stats.kstest(testData, 'norm', args=(loc, scale))[1]

    return not (p_value < 0.05)
def check_normality(testData):
    """Test one group for normality at the 5% level and report the outcome.

    The test depends on the sample size: ``stats.normaltest`` for
    20 < n < 50, ``stats.shapiro`` for n <= 20, ``lillifors`` for
    50 <= n <= 300 and ``stats.kstest`` (against a normal distribution
    fitted to the sample) for n > 300.  The chosen test, the p-value, the
    verdict and a closing separator line are printed.

    Parameters
    ----------
    testData : sequence of numbers
        The sample to test.

    Returns
    -------
    bool
        ``False`` when the p-value is below 0.05, ``True`` otherwise.
    """
    print("one group normality check begin:")
    sample_size = len(testData)

    if 20 < sample_size < 50:
        # 20 < sample size < 50: normaltest
        header = "use normaltest"
        p_value = stats.normaltest(testData)[1]
    elif sample_size < 50:
        # remaining samples below 50 (n <= 20): Shapiro-Wilk
        header = "use shapiro:"
        p_value = stats.shapiro(testData)[1]
    elif sample_size <= 300:
        header = "use lillifors:"
        p_value = lillifors(testData)[1]
    else:
        header = "use kstest:"
        # Bare 'norm' means N(0, 1); pass the fitted location and scale so
        # that unstandardized normal data are not rejected.
        loc, scale = stats.norm.fit(testData)
        p_value = stats.kstest(testData, 'norm', args=(loc, scale))[1]

    is_normal = not (p_value < 0.05)
    print(header)
    print("p value:", p_value)
    if is_normal:
        print("data are normal distributed")
    else:
        print("data are not normal distributed")

    # End of test.  The separator used to sit after the final return and
    # was never printed.
    print("-" * 100)
    return is_normal
def check_normality(testData):
    """Run a sample-size dependent normality test at the 5% level.

    * 3 <= n <= 20   -> Shapiro-Wilk (``'shapiro'``)
    * 20 < n < 50    -> ``stats.normaltest`` (``'normaltest'``)
    * 50 <= n <= 300 -> Lilliefors (``'lillifors'``)
    * n > 300        -> Kolmogorov-Smirnov against a normal distribution
      fitted to the sample (``'kstest'``)

    Parameters
    ----------
    testData : sequence of numbers
        The sample to test.

    Returns
    -------
    tuple or None
        ``(statistic, p_value, flag, test_name)`` where ``flag`` is ``0``
        when the p-value is below 0.05 and ``1`` otherwise.  ``None`` is
        returned for samples with fewer than 3 values, for which no test is
        run.
    """
    sample_size = len(testData)

    if sample_size < 3:
        # No test is defined here for fewer than three observations.
        return None

    if sample_size <= 20:
        test_name = 'shapiro'
        d, p_value = stats.shapiro(testData)
    elif sample_size < 50:
        test_name = 'normaltest'
        d, p_value = stats.normaltest(testData)
    elif sample_size <= 300:
        test_name = 'lillifors'
        d, p_value = lillifors(testData)
    else:
        test_name = 'kstest'
        # Bare 'norm' means N(0, 1); pass the fitted location and scale so
        # that unstandardized normal data are not rejected.
        loc, scale = stats.norm.fit(testData)
        d, p_value = stats.kstest(testData, 'norm', args=(loc, scale))

    flag = 0 if p_value < 0.05 else 1
    return (d, p_value, flag, test_name)
def fun_test_norm(data, p_value=0.05):
    """Test ``data`` for normality and print which test was used.

    The test depends on the sample size: ``stats.normaltest`` for
    20 < n < 50, ``stats.shapiro`` for n <= 20, ``lillifors`` for
    50 <= n <= 300 and ``stats.kstest`` (against a normal distribution
    fitted to the sample) for n > 300.

    Parameters
    ----------
    data : sequence of numbers
        The sample to test.
    p_value : float, optional
        Significance level; a test p-value below it means "not normal".

    Returns
    -------
    bool
        ``False`` when the test p-value is below ``p_value``, else ``True``.
    """
    sample_size = len(data)

    # 20 < sample size < 50: normaltest
    if 20 < sample_size < 50:
        p_value0 = stats.normaltest(data)[1]
        print("use normaltest")
        if p_value0 < p_value:
            print("data are not normal distributed: p={}<{}".format(
                p_value0, p_value))
            return False
        print("data are normal distributed")
        return True

    # remaining samples below 50 (n <= 20): Shapiro-Wilk
    if sample_size < 50:
        p_value0 = stats.shapiro(data)[1]
        print("use shapiro: p_value={}".format(p_value0))
        if p_value0 < p_value:
            print("data are not normal distributed: p={} < {}".format(
                p_value0, p_value))
            return False
        print("data are normal distributed: p={} > {}".format(
            p_value0, p_value))
        return True

    if sample_size <= 300:
        p_value0 = lillifors(data)[1]
        print("use lilifors: p_value={}".format(p_value0))
        if p_value0 < p_value:
            print("data are not normal distributed: p={} < {}".format(
                p_value0, p_value))
            return False
        print("data are normal distributed:: p={} > {}".format(
            p_value0, p_value))
        return True

    # Bare 'norm' means N(0, 1); pass the fitted location and scale so that
    # unstandardized normal data are not rejected.
    loc, scale = stats.norm.fit(data)
    p_value0 = stats.kstest(data, 'norm', args=(loc, scale))[1]
    # Report the computed p-value (the threshold used to be printed here).
    print("use kstest: p_value={}".format(p_value0))
    if p_value0 < p_value:
        print("data are not normal distributed: p={} < {}".format(
            p_value0, p_value))
        return False
    print("data are normal distributed: p={} > {}".format(
        p_value0, p_value))
    return True
def check_normality(testData):
    """Test ``testData`` for normality at the 5% level and print the verdict.

    The test depends on the sample size: ``stats.normaltest`` for
    20 < n < 50, ``stats.shapiro`` for n <= 20, ``lillifors`` for
    50 <= n <= 300 and ``stats.kstest`` (against a normal distribution
    fitted to the sample) for n > 300.

    Parameters
    ----------
    testData : sequence of numbers
        The sample to test.

    Returns
    -------
    bool
        ``False`` when the p-value is below 0.05, ``True`` otherwise.
    """
    sample_size = len(testData)

    if 20 < sample_size < 50:
        # 20 < sample number < 50: normal test
        header = "use normaltest"
        p_value = stats.normaltest(testData)[1]
    elif sample_size < 50:
        # remaining samples below 50 (n <= 20): Shapiro-Wilk
        header = "use shapiro:"
        p_value = stats.shapiro(testData)[1]
    elif sample_size <= 300:
        header = "use lillifors:"
        p_value = lillifors(testData)[1]
    else:
        header = "use kstest:"
        # Bare 'norm' means N(0, 1); pass the fitted location and scale so
        # that unstandardized normal data are not rejected.
        loc, scale = stats.norm.fit(testData)
        p_value = stats.kstest(testData, 'norm', args=(loc, scale))[1]

    is_normal = not (p_value < 0.05)
    # Single-argument print() calls behave the same on Python 2 and 3.
    print(header)
    if is_normal:
        print("data are normal distributed")
    else:
        print("data are not normal distributed")
    return is_normal
def check_normality(testData, plot=False, sampling=False, is_freq=False, add_e=False, verbose=False):
    """Return the p-value of a sample-size dependent normality test.

    Shapiro-Wilk is used for up to 50 values, Lilliefors for 51 to 300
    values, and a Kolmogorov-Smirnov test against a fitted normal
    distribution for larger samples.

    Parameters
    ----------
    testData : sequence of numbers
        The sample; passed through ``unstack_freq`` first when ``is_freq``
        is true.
    plot : bool, optional
        When true, show a histogram with a scaled fitted normal density and
        a probability plot.
    sampling : int or False, optional
        When truthy, test a random subsample of this many values.
    is_freq : bool, optional
        When true, ``testData`` is first converted with ``unstack_freq``.
    add_e : bool, optional
        Forwarded to ``unstack_freq``.
    verbose : bool, optional
        When true, print which test is used.

    Returns
    -------
    The p-value of the selected test.
    """
    if is_freq:
        testData = unstack_freq(testData, add_e=add_e)
    if sampling:
        testData = random.sample(testData, sampling)

    sample_size = len(testData)
    if sample_size <= 50:
        p_value = scs.shapiro(testData)[1]
        if verbose:
            print("use shapiro:")
    elif sample_size <= 300:
        p_value = lillifors(testData)[1]
        if verbose:
            print("use lillifors:")
    else:
        # KS test against a normal distribution fitted to the sample
        fitted_loc, fitted_scale = scs.norm.fit(testData)
        fitted_normal = scs.norm(loc=fitted_loc, scale=fitted_scale)
        p_value = scs.kstest(testData, fitted_normal.cdf)[1]
        if verbose:
            print("use kstest:")

    if not plot:
        return p_value

    density = scs.norm(loc=np.mean(testData), scale=np.std(testData))
    distinct = np.unique(testData)
    bin_count = min(len(distinct), 200)
    distinct_sorted = np.sort(distinct)
    # Most common gap between neighbouring distinct values; used as the
    # step of the x grid for the fitted curve.
    step = scs.mode(np.diff(distinct_sorted)).mode
    curve_x = np.arange(distinct_sorted[0], distinct_sorted[-1] + step, step)

    _, axes = plt.subplots(2, 1)
    axes[0].hist(testData, bin_count)
    axes[1] = scs.probplot(testData, plot=plt)
    axes[0].plot(curve_x, 350 * density.pdf(curve_x))
    plt.show()
    return p_value
def _test_normality(self, f, level = 0.01, burn_in = 200): if len(f) <= burn_in + 1: return False return lillifors(f)[1] >= level
la_away_gr_mean = np.mean(la_away_gr)
la_away_gr_var = np.var(la_away_gr)

# Histogram to visually check whether the growth fits a normal distribution.
plt.hist(la_away_gr)
plt.title(
    "Histogram of Year on Year Differences in Spending Away from Home in LA")
plt.show()

# Single-argument print() calls behave the same on Python 2 and Python 3.
print("Annual Growth Rate Average = " + str(la_away_gr_mean))
print("Annual Growth Rate Variance = " + str(la_away_gr_var))
print("Annual Growth Rate Standard Deviation = " + str(np.sqrt(la_away_gr_var)))

# Generate the KS test statistic for estimated parameters using statsmodels.
la_gr_ksstat = sm.lillifors(la_away_gr)[0]

# 1.035 is the 0.01 significance level critical value for the K-S test on a
# normal population with estimated mean and variance.  That value applies to
# Stephens' modified statistic D * (sqrt(n) - 0.01 + 0.85 / sqrt(n)), not to
# the raw D (which never exceeds 1), so D is rescaled before the comparison.
la_gr_n = len(la_away_gr)
la_gr_ksstat_modified = la_gr_ksstat * (
    np.sqrt(la_gr_n) - 0.01 + 0.85 / np.sqrt(la_gr_n))
if la_gr_ksstat_modified <= 1.035:
    print("LA Growth fits Normal Dist.")

print("End LA")
# <<<<End LA>>>>

print("Begin SF")
# <<<<SF Analysis>>>>

# Generate the linear regression parameters for SF based on total spending
# and away from home spending.
sfreg_param_total = np.polyfit(years, sf_total, 1)
sfreg_param_away = np.polyfit(years, sf_away, 1)
def HTS_Ttest(dataDict, type, printToFile, alpha=0.05):
    """Compare the dependent variable between all pairs of groups.

    For every data set a bar chart of the group means is drawn, and each
    pair of groups is compared: with a two-sample t-test when both groups
    pass the Lilliefors normality test at level ``alpha``, otherwise with
    the Wilcoxon rank-sum test.

    Parameters
    ----------
    dataDict : object
        Must provide ``dataSelector`` (with ``nGroups``,
        ``getGroupDescriptions()``, ``getPlotColors()`` and
        ``dependentVariable``) and ``dict``, a mapping from file key (or
        ``'combinedData'``) to ``{group_key: {'dependentData': ...}}``.
        NOTE(review): inferred from the attribute accesses in this function;
        confirm against the class definition.
    type : str
        ``'crossplate'`` to analyse ``dataDict.dict['combinedData']`` as a
        single data set, or ``'inplate'`` to analyse every file separately.
    printToFile
        Unused; kept for interface compatibility.
    alpha : float, optional
        Significance level for both the normality check and the group
        comparison.

    Returns
    -------
    dict
        ``{'results': [...], 'testType': '2 sample t-test'}``.  ``results``
        holds one dict per data set with the key ``'dataSource'`` and, for
        each pair ``'<grp1> vs <grp2>'``, a dict with ``'p'`` (p-value),
        ``'h'`` (``True`` when ``p < alpha``) and ``'test'`` (name of the
        test actually used).

    Raises
    ------
    ValueError
        If ``type`` is neither ``'crossplate'`` nor ``'inplate'``.
    """
    import numpy as np
    import matplotlib.pyplot as pyplot
    import math
    import scipy.stats
    import statsmodels.stats.diagnostic as smStats

    dataSel = dataDict.dataSelector
    nGroups = dataSel.nGroups
    groupKeys = dataSel.getGroupDescriptions()
    print(groupKeys)
    # list() so the keys can be indexed on Python 3 as well.
    fileKeys = list(dataDict.dict.keys())
    nFiles = len(fileKeys)
    plotColors = dataSel.getPlotColors()

    statsStruct = {}
    statsStruct['results'] = []
    statsStruct['testType'] = '2 sample t-test'

    # elif chain: with two separate ifs the else branch also fired for
    # 'crossplate' and reported it as unknown.
    if type == 'crossplate':
        nDataSets = 1
        combinedData = dataDict.dict['combinedData']
    elif type == 'inplate':
        nDataSets = nFiles
    else:
        raise ValueError("HTS_ttest: unknown type '%s'" % (type,))

    print('------------------------------------\n')
    print('Dependent Variable: %s\n' % (dataSel.dependentVariable,))

    fig = pyplot.figure()
    plots = [None] * nDataSets
    # Two rows of subplots; the column count must be an integer.
    cols = int(math.ceil(nDataSets / 2.0))

    for iData in range(nDataSets):
        if type == 'crossplate':
            theDataSet = combinedData
            if nFiles == 1:
                dataSource = fileKeys[iData]
                titleStr = str(dataSource)
            else:
                dataSource = 'Multiple Files'
                titleStr = 'CrossPlate Analysis, Multiple Files'
        else:
            theDataSet = dataDict.dict[fileKeys[iData]]
            dataSource = fileKeys[iData]
            titleStr = str(dataSource)

        # Matplotlib subplot indices are 1-based.
        axes = fig.add_subplot(2, cols, iData + 1)
        plots[iData] = axes

        barHandles = []
        legStrings = []
        for iGrp in range(nGroups):
            grpKey = groupKeys[iGrp]
            depData = theDataSet[grpKey]['dependentData']
            if len(depData) == 0:
                print('HTS_tTest: %s\nEmpty data for group %s\n'
                      % (dataSource, grpKey))
            mu = np.mean(depData)
            barHandles.append(
                axes.bar(iGrp, mu, 0.75, color=plotColors[iGrp]))
            legStrings.append(
                str(grpKey) + '(n=' + str(len(depData)) + ')')

        statValues = {}
        statValues['dataSource'] = dataSource
        for iGrp1 in range(nGroups):
            for iGrp2 in range(iGrp1 + 1, nGroups):
                grp1Key = groupKeys[iGrp1]
                depData1 = theDataSet[grp1Key]['dependentData']
                grp2Key = groupKeys[iGrp2]
                depData2 = theDataSet[grp2Key]['dependentData']

                # lillifors() returns (statistic, p-value); the tuple itself
                # is always truthy, so the p-value must be compared to alpha.
                isNormal1 = smStats.lillifors(depData1)[1] >= alpha
                isNormal2 = smStats.lillifors(depData2)[1] >= alpha

                if isNormal1 and isNormal2:
                    testName = 't-test'
                    _, pValue = scipy.stats.ttest_ind(depData1, depData2)
                else:
                    testName = 'rank-sum'
                    _, pValue = scipy.stats.ranksums(depData1, depData2)

                tmp = {}
                tmp['p'] = pValue
                tmp['h'] = bool(pValue < alpha)
                tmp['test'] = testName
                statsKey = str(grp1Key) + ' vs ' + str(grp2Key)
                statValues[statsKey] = tmp
        print(statValues)
        statsStruct['results'].append(statValues)

        # ---------- Analysis Complete ------------------------------------
        # Format the figure
        titleStr = titleStr.split('\\')[-1]
        axes.set_title(titleStr)
        axes.set_ylabel(dataSel.dependentVariable)
        axes.set_xticklabels('')
        axes.set_xticks([])
        # One legend entry per group bar.
        axes.legend(barHandles, legStrings)

    pyplot.show()
    return statsStruct