def test_basic(self):
    x1 = [0.11, 7.87, 4.61, 10.14, 7.95, 3.14, 0.46,
          4.43, 0.21, 4.75, 0.71, 1.52, 3.24,
          0.93, 0.42, 4.97, 9.53, 4.55, 0.47, 6.66]
    w, pw = stats.shapiro(x1)
    assert_almost_equal(w, 0.90047299861907959, 6)
    assert_almost_equal(pw, 0.042089745402336121, 6)

    x2 = [1.36, 1.14, 2.92, 2.55, 1.46, 1.06, 5.27, -1.11,
          3.48, 1.10, 0.88, -0.51, 1.46, 0.52, 6.20, 1.69,
          0.08, 3.67, 2.81, 3.49]
    w, pw = stats.shapiro(x2)
    assert_almost_equal(w, 0.9590270, 6)
    assert_almost_equal(pw, 0.52460, 3)

    # Verified against R
    np.random.seed(12345678)
    x3 = stats.norm.rvs(loc=5, scale=3, size=100)
    w, pw = stats.shapiro(x3)
    assert_almost_equal(w, 0.9772805571556091, decimal=6)
    assert_almost_equal(pw, 0.08144091814756393, decimal=3)

    # Extracted from original paper
    x4 = [0.139, 0.157, 0.175, 0.256, 0.344, 0.413, 0.503, 0.577, 0.614,
          0.655, 0.954, 1.392, 1.557, 1.648, 1.690, 1.994, 2.174, 2.206,
          3.245, 3.510, 3.571, 4.354, 4.980, 6.084, 8.351]
    W_expected = 0.83467
    p_expected = 0.000914
    w, pw = stats.shapiro(x4)
    assert_almost_equal(w, W_expected, decimal=4)
    assert_almost_equal(pw, p_expected, decimal=5)
def mleWithSgd(self, x_array, y_array):
    a, b, theta, loss = random.random(), random.random(), random.random(), 2**31
    optimal_a, optimal_b, optimal_theta = 0, 0, 0
    for i in range(len(x_array)):
        x, y = x_array[i], y_array[i]
        a = a - self.learning_rate * (1 / (theta * x) * (a * x + b - y))
        b = b - self.learning_rate * (1 / (theta * (x**2)) * (a * x + b - y))
        theta = theta - self.learning_rate * (-((y - a * x - b)**2) / ((x**2) * (theta**3)) - theta)
        curr_loss = self.mleLossFunc(x_array, y_array, a, b, theta)
        if curr_loss <= loss:
            # loss improved: accept the step and grow the learning rate
            self.learning_rate *= 1.05
            optimal_a, optimal_b, optimal_theta = a, b, theta
        else:
            # loss worsened: shrink the learning rate and revert to the best parameters
            self.learning_rate *= 0.5
            a, b, theta = optimal_a, optimal_b, optimal_theta
        loss = curr_loss
        print(curr_loss)
    print("Output:")
    # print(a, b, theta, self.learning_rate)
    print(optimal_a, optimal_b, optimal_theta, len(x_array))
    # Evaluation: perform the Shapiro-Wilk test, which tests the null hypothesis
    # that the data were drawn from a normal distribution.
    normalized_array = np.array([(y_array[i] - optimal_a * x_array[i] - optimal_b) / (optimal_theta * x_array[i])
                                 for i in range(len(x_array))])
    print(stats.shapiro(normalized_array))
    # plt.plot(list(x_array), list(y_array), 'ro')
    # plt.show()
    return optimal_a, optimal_b, optimal_theta
def test(self, arr1, arr2):
    p_value = 0
    if self.statistics == "auto":
        # Levene's test for equality of variances. If the variances are equal:
        if stats.levene(arr1, arr2)[1] > 0.05:
            # Shapiro-Wilk test for normality of both samples. If both look normal:
            if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                # p = Student
                p_value = stats.ttest_ind(arr1, arr2)[1]
            else:
                # p = Mann
                if equal(arr1, arr2):
                    p_value = 1
                else:
                    p_value = stats.mannwhitneyu(arr1, arr2)[1]
        else:
            # equal_var=False selects Welch's t-test
            p_value = stats.ttest_ind(arr1, arr2, equal_var=False)[1]
    elif self.statistics == "student":
        p_value = stats.ttest_ind(arr1, arr2)[1]
    elif self.statistics == "welch":
        p_value = stats.ttest_ind(arr1, arr2, equal_var=False)[1]
    elif self.statistics == "mann":
        if equal(arr1, arr2):
            p_value = 1
        else:
            p_value = stats.mannwhitneyu(arr1, arr2)[1]
    return p_value
def mleWithSgdNonlinear(self, x_array, y_array):
    a, b, theta0, theta1, theta2, loss = (random.random(), random.random(), random.random(),
                                          random.random(), random.random(), 2**31)
    optimal_a, optimal_b, optimal_t0, optimal_t1, optimal_t2 = 0, 0, 0, 0, 0
    for i in range(len(x_array)):
        x, y = x_array[i], y_array[i]
        a = a - self.learning_rate * (x * (a * x + b - y) / (theta0 * x**2 + theta1 * x + theta2) + self.reg_cof * a)
        b = b - self.learning_rate * ((a * x + b - y) / (theta0 * x**2 + theta1 * x + theta2) + self.reg_cof * b)
        theta0 = theta0 - self.learning_rate * ((-(x**2) * ((y - a * x - b)**2) / ((theta0 * (x**2) + theta1 * x + theta2)**3)
                                                 + x**2 / (theta0 * (x**2) + theta1 * x + theta2)) + self.reg_cof * theta0)
        theta1 = theta1 - self.learning_rate * ((-x * ((y - a * x - b)**2) / ((theta0 * (x**2) + theta1 * x + theta2)**3)
                                                 + x / (theta0 * (x**2) + theta1 * x + theta2)) + self.reg_cof * theta1)
        theta2 = theta2 - self.learning_rate * (-((y - a * x - b)**2) / ((theta0 * (x**2) + theta1 * x + theta2)**3)
                                                + 1 / (theta0 * (x**2) + theta1 * x + theta2) + self.reg_cof * theta2)
        curr_loss = self.mleLossNonlinear(x_array, y_array, a, b, theta0, theta1, theta2)
        if curr_loss < loss:
            self.learning_rate *= 1.05
            optimal_a, optimal_b, optimal_t0, optimal_t1, optimal_t2 = a, b, theta0, theta1, theta2
        else:
            self.learning_rate *= 0.5
            a, b, theta0, theta1, theta2 = optimal_a, optimal_b, optimal_t0, optimal_t1, optimal_t2
        loss = curr_loss
        print(curr_loss)
    print("Output:")
    # print(a, b, theta, self.learning_rate)
    print(optimal_a, optimal_b, optimal_t0, optimal_t1, optimal_t2)
    # Evaluation: perform the Shapiro-Wilk test, which tests the null hypothesis
    # that the data were drawn from a normal distribution.
    normalized_array = np.array([(y_array[i] - optimal_a * x_array[i] - optimal_b) /
                                 (optimal_t0 * (x_array[i]**2) + optimal_t1 * x_array[i] + optimal_t2)
                                 for i in range(len(x_array))])
    print(stats.shapiro(normalized_array))
    plt.plot(normalized_array, [1] * len(normalized_array), 'ro')
    plt.show()
def main():
    if len(sys.argv) < 4:
        return 1

    _, list_a, list_b, significance = sys.argv[:4]
    list_a = json.loads(list_a)
    list_b = json.loads(list_b)
    significance = float(significance)

    shapiro_p_value = stats.shapiro(list_a)[1], stats.shapiro(list_b)[1]
    mann_whitney_p_value = stats.mannwhitneyu(list_a, list_b).pvalue
    anderson_p_value = stats.anderson_ksamp([list_a, list_b]).significance_level
    welch_p_value = stats.ttest_ind(list_a, list_b, equal_var=False)[1]

    results = {
        'first_sample': list_a,
        'second_sample': list_b,
        'shapiro_p_value': shapiro_p_value,
        'mann_p_value': mann_whitney_p_value,
        'anderson_p_value': anderson_p_value,
        'welch_p_value': welch_p_value,
    }

    # flags the case where the Shapiro-Wilk test rejects normality for both samples
    if (results['shapiro_p_value'][0] < significance and
            results['shapiro_p_value'][1] < significance):
        results['normal-y'] = True
    else:
        results['normal-y'] = False

    results['significantly_different'] = bool(
        float(results['mann_p_value']) < float(significance))
    print(json.dumps(results))
    return 0
def boxcoxtrans(str, list):
    s = list
    w = pd.read_csv(str, usecols=s)
    f = DataFrame(w)
    c = f.astype(float)
    x = c.values
    e = []
    # grid-search lambda in [-2, 2]; collect the Shapiro-Wilk p-value of each transform
    for j in np.linspace(-2, 2, num=21):
        if j != 0:
            b = (x**j)
            d = []
            c = []
            for i in range(0, len(b)):
                c = b[i]
                d.append(c[0])
            t = stats.shapiro(d)
            e.append(t[1])
    # pick the lambda whose transform gives the largest Shapiro-Wilk p-value
    for i in range(0, len(e)):
        if e[i] == max(e):
            break
    t = (-2 + 0.2 * i)
    if t >= 0:
        # lambda = 0 was skipped in the grid, so shift the index for non-negative lambdas
        t = (-2 + 0.2 * (i + 1))
    print('optimal lambda =', t)
    h = ((x**t) - 1) / t
    l = []
    m = []
    for i in range(0, len(h)):
        l = h[i]
        m.append(l[0])
    print(pd.DataFrame(m))
    k = stats.shapiro(m)
    print('shapiro test of trans column', k)
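# A minimal alternative sketch, not part of the snippet above: scipy can pick the
# Box-Cox lambda directly by maximum likelihood, which avoids the manual grid search
# over Shapiro-Wilk p-values. The csv_path and column name are hypothetical placeholders.
import numpy as np
import pandas as pd
from scipy import stats

def boxcox_mle(csv_path, column):
    """Box-Cox transform one strictly positive column and report its Shapiro-Wilk result."""
    x = pd.read_csv(csv_path)[column].astype(float).to_numpy()
    x = x[x > 0]                            # Box-Cox requires strictly positive values
    transformed, lmbda = stats.boxcox(x)    # lambda chosen by maximum likelihood
    w, p = stats.shapiro(transformed)
    print('optimal lambda =', lmbda)
    print('shapiro test of transformed column', (w, p))
    return transformed, lmbda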
def return_test_results(self, arr1, arr2):
    test_name = ""
    p_value = 0
    t_value = 0
    levene = stats.levene(arr1, arr2)[1]
    if self.statistics == "auto":
        # Levene's test for equality of variances. If the variances are equal:
        if levene > 0.05:
            # Shapiro-Wilk test for normality of both samples. If both look normal:
            if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                # p = Student
                test_name = "Student"
                result = stats.ttest_ind(arr1, arr2)
                t_value = result[0]
                p_value = result[1]
            else:
                # p = Mann
                test_name = "Mann"
                if equal(arr1, arr2):
                    t_value = None
                    p_value = 1
                else:
                    result = stats.mannwhitneyu(arr1, arr2)
                    t_value = result[0]
                    p_value = result[1]
        else:
            test_name = "Welch"
            # equal_var=False selects Welch's t-test
            result = stats.ttest_ind(arr1, arr2, equal_var=False)
            t_value = result[0]
            p_value = result[1]
    elif self.statistics == "student":
        test_name = "Student"
        result = stats.ttest_ind(arr1, arr2)
        t_value = result[0]
        p_value = result[1]
    elif self.statistics == "welch":
        test_name = "Welch"
        result = stats.ttest_ind(arr1, arr2, equal_var=False)
        t_value = result[0]
        p_value = result[1]
    elif self.statistics == "mann":
        test_name = "Mann"
        if equal(arr1, arr2):
            t_value = None
            p_value = 1
        else:
            result = stats.mannwhitneyu(arr1, arr2)
            t_value = result[0]
            p_value = result[1]
    df = len(arr1) + len(arr2) - 2
    return [test_name, t_value, p_value, df, levene]
def test_sample_means_and_var_distribution(N, Pis, sample_size, multi, n_test):
    x_pvalues = []
    y_pvalues = []
    passed = []
    for i in range(n_test):
        x, y = multinomial_mean_and_var_errors(N, Pis, sample_size, multi)
        x_pvalue = spstats.shapiro(x)[1]
        y_pvalue = spstats.shapiro(y)[1]
        x_pvalues.append(x_pvalue)
        y_pvalues.append(y_pvalue)
        passed.append(min(x_pvalue, y_pvalue) >= .05)
    assert np.sum(np.array(passed)) >= .6 * n_test
def check_normality(): '''Check if the distribution is normal.''' # Set the parameters numData = 1000 myMean = 0 mySD = 3 # To get reproducable values, I provide a seed value np.random.seed(1234) # Generate and show random data data = stats.norm.rvs(myMean, mySD, size=numData) fewData = data[:100] plt.hist(data) plt.show() # --- >>> START stats <<< --- # Graphical test: if the data lie on a line, they are pretty much # normally distributed _ = stats.probplot(data, plot=plt) plt.show() pVals = pd.Series() pFewVals = pd.Series() # The scipy normaltest is based on D-Agostino and Pearsons test that # combines skew and kurtosis to produce an omnibus test of normality. _, pVals['Omnibus'] = stats.normaltest(data) _, pFewVals['Omnibus'] = stats.normaltest(fewData) # Shapiro-Wilk test _, pVals['Shapiro-Wilk'] = stats.shapiro(data) _, pFewVals['Shapiro-Wilk'] = stats.shapiro(fewData) # Or you can check for normality with Lilliefors-test _, pVals['Lilliefors'] = lillifors(data) _, pFewVals['Lilliefors'] = lillifors(fewData) # Alternatively with original Kolmogorov-Smirnov test _, pVals['Kolmogorov-Smirnov'] = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm') _, pFewVals['Kolmogorov-Smirnov'] = stats.kstest((fewData-np.mean(fewData))/np.std(fewData,ddof=1), 'norm') print('p-values for all {0} data points: ----------------'.format(len(data))) print(pVals) print('p-values for the first 100 data points: ----------------') print(pFewVals) if pVals['Omnibus'] > 0.05: print('Data are normally distributed') # --- >>> STOP stats <<< --- return pVals['Kolmogorov-Smirnov']
def main_plot_histogram(): sigma = 0.10 # initial setup W = dist_W(sigma) WI = dist_WI() #h, hist_edges = compute_histogram(W, WI, params) S, PS = compute_histogram(W, WI, params) S = S.flatten() PS = PS.flatten() #kindofvector(h) #kindofvector(hist_edges) #print(h) #print(hist_edges) #plt.plot(hist_edges, h) BINCNT = 100 plt.hist(S, bins=BINCNT, normed=True, histtype='step', alpha=1, label="act after tanh", color="b") plt.hist(PS, bins=BINCNT, normed=True, histtype='step', alpha=1, label="act before tanh", color="g") #W = shapiro(S) print("S size = ", S.size) print("shapiro S = ",shapiro(S)) print("shapiro PS = ",shapiro(PS)) stdS = std(S) print("stdS=",stdS) stdPS = std(PS) print("stdPS=", stdPS) x = linspace(-1, 1, 100) y = norm.pdf(x, loc=0, scale=stdS) plt.plot(x,y, color="b", alpha=0.2) y = norm.pdf(x, loc=0, scale=stdPS) plt.plot(x,y, color="g", alpha=0.2) #blue_line = mlines.Line2D([], [], color='blue', marker='.', markersize=15, label='Blue stars') plt.grid(True) plt.ylabel('density') plt.xlabel('activation value') plt.xlim([-1, 1]) plt.title('activation distibution in reservoir ($\sigma_{blue}$=%.2f, $\sigma_{green}$=%.2f)' % (stdS, stdPS)) plt.legend() plt.show()
def test_routehop_normality(rows, attributes, key):
    print("Splitting...")
    instances = split_on_attributes(attributes, rows)
    print("Processing...")
    toofew = 0
    nonnormal = 0
    normal = 0
    for skey in instances.keys():
        times = array([s[key] for s in instances[skey]])
        n = len(times)
        mean_time = times.mean()
        std_time = times.std()
        if n >= 30:
            pval = stats.shapiro(times)[1]
            if pval < 0.05:
                nonnormal += 1
                # figure()
                # hist(times)
                # title("%s (p-val=%f, %d pts)" % (str(skey), pval, n))
            else:
                normal += 1
        else:
            toofew += 1
    print("Non,toofew,normal:", nonnormal, toofew, normal)
def check_normality():
    '''Check if the distribution is normal.'''
    # Generate and show a distribution
    numData = 100

    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)

    data = stats.norm.rvs(myMean, mySD, size=numData)
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    pVals = pd.Series()

    # The scipy normaltest is based on D'Agostino and Pearson's test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['omnibus'] = stats.normaltest(data)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk'] = stats.shapiro(data)

    # Or you can check for normality with the Lilliefors test
    ksStats, pVals['Lilliefors'] = kstest_normal(data)

    # Alternatively with the original Kolmogorov-Smirnov test
    _, pVals['KS'] = stats.kstest((data - np.mean(data)) / np.std(data, ddof=1), 'norm')

    print(pVals)
    if pVals['omnibus'] > 0.05:
        print('Data are normally distributed')
def test(self, alpha, x):
    """
    Tests whether alpha and x are significantly correlated.
    The test assumes that x is normally distributed. The test
    function uses a Shapiro-Wilk test to test this assumption.

    :param alpha: independent variable, angles in radians
    :param x: dependent variable
    :return: test results of Shapiro-Wilk and Liddell-Ord test
    :rtype: pandas.DataFrame

    References: [Jammalamadaka2001]_
    """
    w, psw = stats.shapiro(x)
    if psw < 0.05:
        warnings.warn("This test requires Gaussian distributed x")

    rxc, rxs, rcs = (np.corrcoef(x, np.cos(alpha))[0, 1],
                     np.corrcoef(x, np.sin(alpha))[0, 1],
                     np.corrcoef(np.cos(alpha), np.sin(alpha))[0, 1])
    n = len(alpha)
    r2 = (rxc**2 + rxs**2 - 2 * rxc * rxs * rcs) / (1 - rcs**2)
    f = (n - 3) * r2 / (1 - r2)
    p = stats.f.sf(f, 2, n - 3)

    df = pd.DataFrame(dict(
        test=['Shapiro-Wilk', 'Liddell-Ord'],
        statistics=[w, f],
        p=[psw, p],
        dof=[None, (2, n - 3)]
    )).set_index('test')
    return df
def robust_parameter(clusters, stats, elems):
    '''
    Parameter to measure robustness of a G-mode test.

    The parameter is given by the weighted average plus a normality estimator:

    P1 = SUM( N * var ) / SUM( N )
    P2 = SUM( N^-1 * var ) / SUM( N^-1 )
    P3 = SUM( kstest(cluster, gaussian) )

    P = (P1/w1 + P2/w2 + P3/w3) / (w1^-1 + w2^-1 + w3^-1)
    '''
    from scipy.stats import shapiro
    from math import sqrt

    shap, N, var = deque(), deque(), deque()
    for members, cl in zip(clusters, stats):
        # cluster size array
        N.append(len(members))
        # cluster variance array
        var.append(asum(cl[1]**2))
        # Shapiro-Wilk test:
        W_vec = array([shapiro(elems[members][n])[0]**2 for n in range(len(elems[0]))])
        # inverse Shapiro-Wilk W statistic
        shap.append(sqrt(asum(1e0 / W_vec)))

    shap, N, var = array(shap), array(N), array(var)

    w1 = sqrt(asum(mad(var, median(var))**2))
    w3 = mad(shap, median(shap))

    p1 = asum(N * var) / asum(N)
    p2 = asum(var / N) / asum(1e0 / N)
    p3 = median(shap)

    return (p1/w1 + p2/w1 + p3/w3) / (2e0/w1 + 1e0/w3)
def sw(errors):
    """
    Shapiro-Wilk Test

    The null hypothesis for the SW test is that the data come from a normal
    distribution.

    Parameters
    -------------
    errors: error of voxels through time (shape of it is 221783*1)

    Returns
    ---------
    pval: P-values for the hypothesis test, one per column of `errors`.
    """
    pval = []
    for i in range(errors.shape[-1]):
        pval.append(shapiro(errors[:, i])[1])
    pval = np.array(pval)
    shap = pval.shape[0]
    pval = np.reshape(pval, (shap, 1))
    return pval
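# A minimal usage sketch for the sw() helper above; the array shape here is a small,
# invented stand-in for the real voxel errors described in the docstring.
import numpy as np
from scipy.stats import shapiro

errors = np.random.randn(500, 4)   # 500 time points x 4 voxels, synthetic
pvals = sw(errors)                 # one Shapiro-Wilk p-value per column
print(pvals.shape)                 # (4, 1)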
def shapiro_test(self, param):
    from scipy.stats import shapiro
    all_values = self._get_single_param_values(param)
    results = []
    for key, values in all_values:
        results.append((key, shapiro(sorted(values))))
    return results
def _box_cox_transform(self, verbose=False, method='standard'): """ Performs the Box-Cox transformation, over different ranges, picking the optimal one w. respect to normality. """ from scipy import stats a = sp.array(self.values) if method == 'standard': vals = (a - min(a)) + 0.1 * sp.var(a) else: vals = a sw_pvals = [] lambdas = sp.arange(-2.0, 2.1, 0.1) for l in lambdas: if l == 0: vs = sp.log(vals) else: vs = ((vals ** l) - 1) / l r = stats.shapiro(vs) if sp.isfinite(r[0]): pval = r[1] else: pval = 0.0 sw_pvals.append(pval) i = sp.argmax(sw_pvals) l = lambdas[i] if l == 0: vs = sp.log(vals) else: vs = ((vals ** l) - 1) / l self._perform_transform(vs,"box_cox") log.debug('optimal lambda was %0.1f' % l) return True
def statFile(key, values, pruneX, pruneX2): oFilename = values["file"]+".csv" data = [] prune = False minX = 0 maxX = 0 if len(pruneX) > 0 and len(pruneX2) > 0: minX = float(pruneX) maxX = float(pruneX2) prune = True with open(oFilename) as f: for l in f.readlines(): arrLine = l.strip().split() if len(arrLine) == 2: t = float(arrLine[0]) if prune: if t >= minX and t <= maxX: data.append(float(arrLine[1])) elif t > maxX: break else: data.append(float(arrLine[1])) x = np.array(data) with open("stats.txt", "a") as f: f.write(key + ":\n") f.write(" mean: "+str(x.mean())+"\n") f.write(" std: "+str(x.std())+"\n") f.write(" median: "+str(np.median(x))+"\n") f.write(" min: "+str(x.min())+"\n") f.write(" max: "+str(x.max())+"\n") f.write(" normality: "+str(stats.shapiro(x)[1])+"\n")
def IsNormallyDistributed(sample, significance_level=0.05, return_p_value=False):
    """Calculates Shapiro-Wilk test for normality.

    Note that normality is a requirement for Welch's t-test.

    Args:
      sample: List of values of benchmark result for a measure.
      significance_level: The significance level the p-value is compared against.
      return_p_value: Whether or not to return the calculated p-value.

    Returns:
      is_normally_distributed: Returns True or False.
      p_value: The calculated p-value.
    """
    if not stats:
        raise ImportError('This function requires Scipy.')

    # pylint: disable=unbalanced-tuple-unpacking
    _, p_value = stats.shapiro(sample)

    is_normally_distributed = p_value >= significance_level
    if return_p_value:
        return is_normally_distributed, p_value
    return is_normally_distributed
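# A small usage sketch (not from the original module); the sample values are invented:
sample = [9.8, 10.1, 10.3, 9.7, 10.0, 10.2, 9.9, 10.1, 9.6, 10.4]
if IsNormallyDistributed(sample):
    print('Normality not rejected; Welch t-test assumption looks reasonable')
else:
    print('Consider a non-parametric test instead')
is_normal, p = IsNormallyDistributed(sample, significance_level=0.01, return_p_value=True)
print(is_normal, p)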
def gStats(self, missingValue=0.0):
    """dict of {geneID: (min, max, mean, median, std, stderr,
    Shapiro-Wilk (w, p), normaltest_chisq (D'Agostino and Pearson), ...}
    """
    import scipy as S
    import scipy.stats as SS

    rv = {}
    for k, v in self.items():
        # print(k, v)
        va = S.array(self.gValues(k, missingValue))
        try:
            normaltest = SS.normaltest(va)
        except:
            normaltest = None
        try:
            shapiro = SS.shapiro(va)
        except:
            shapiro = None
        try:
            rv[k] = (va.min(), va.max(), va.mean(), SS.median(va),
                     SS.std(va), SS.stderr(va), normaltest, shapiro)
        except:
            print(k, va)
            raise
    return rv
def multiple_comp(residuals):
    """
    input: residuals, 2d array (voxels, timecourse)
    output: a list of the number of voxels tested as not normally distributed,
    based on the raw alpha test, the Bonferroni procedure, the Hochberg procedure
    and the Benjamini-Hochberg procedure, respectively
    """
    ## Alpha test
    p_nor = []
    for i in range(0, residuals.shape[0]):
        p_nor.append(stats.shapiro(residuals[i, :])[1])
    # for p < 0.05, the voxel is not normally distributed
    p_nor_005 = [i for i in p_nor if i < 0.05]

    ## Bonferroni procedure
    p_bonf = [i for i in p_nor if i < (0.05 / residuals.shape[0])]

    ## Hochberg procedure
    p_nors = np.sort(p_nor)
    alpha = 0.05
    n = len(p_nors)
    tf = []
    for i in range(0, n):
        thres = alpha / (n + 1 - (i + 1))
        tf.append(p_nors[i] <= thres)

    ## Benjamini-Hochberg procedure
    tf_bh = []
    for i in range(0, len(p_nors)):
        # BH compares the k-th smallest p-value to (k/n)*alpha, with k starting at 1
        thres = ((i + 1) / n) * alpha
        tf_bh.append(p_nors[i] <= thres)

    return [len(p_nor_005), len(p_bonf), sum(tf), sum(tf_bh)]
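# A hedged cross-check (not part of the original function): statsmodels implements the
# same corrections, so the hand-rolled Bonferroni / Hochberg / Benjamini-Hochberg loops
# above can be verified against statsmodels.stats.multitest.multipletests. The residuals
# array here is synthetic.
import numpy as np
from scipy import stats
from statsmodels.stats.multitest import multipletests

residuals = np.random.randn(50, 100)   # synthetic (voxels x timecourse)
pvals = [stats.shapiro(residuals[i, :])[1] for i in range(residuals.shape[0])]

for method in ('bonferroni', 'simes-hochberg', 'fdr_bh'):
    reject, p_adj, _, _ = multipletests(pvals, alpha=0.05, method=method)
    print(method, int(reject.sum()), 'voxels flagged as non-normal')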
def run(args): report = ResultReportWriter() team_names, results = rcss.run_matches(args.team_a, args.team_b, args.match_count) report.write_json('match_results.json', { 'binaries': [args.team_a, args.team_b], 'teams': team_names, 'results': results, }) errors = [] score = [x - y for x, y in results] # alpha, 1 - alpha # alpha = probability of rejecting a true null hypothesis significance, confidence = (args.significance, 1 - args.significance) _, normality_p = stats.shapiro(score) if normality_p <= significance: errors.append('Shapiro test rejected normality') mean = numpy.mean(score) std_error = stats.sem(score) confidence_interval = stats.t.interval(confidence, len(score) - 1, loc=mean, scale=std_error) report.write_json('statistics.json', { 'binaries': [args.team_a, args.team_b], 'teams': team_names, 'normality_p': normality_p, 'score': [confidence_interval[0], mean, confidence_interval[1]], 'score_std': std_error, 'params': { 'significance': args.significance, }, 'errors': errors, })
def pearson_or_shapiro(data):
    """pearson_or_shapiro

    Use D'Agostino/Pearson if possible (n >= 20), else Shapiro

    :param data:
    """
    return stats.normaltest(data) if len(data) >= 20 else stats.shapiro(data)
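# A tiny usage sketch (sample data invented): small samples fall back to Shapiro-Wilk,
# larger ones use the D'Agostino/Pearson omnibus test.
import numpy as np
from scipy import stats

small = np.random.randn(10)
large = np.random.randn(200)
print(pearson_or_shapiro(small))   # shapiro() result (n < 20)
print(pearson_or_shapiro(large))   # normaltest() result (n >= 20)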
def test_nan_input(self):
    x = np.arange(10.)
    x[9] = np.nan

    w, pw = stats.shapiro(x)
    assert_equal(w, np.nan)
    assert_almost_equal(pw, 1.0)
def test_MultivariateNormalQMCEngineDegenerate(self, cuda=False): device = torch.device("cuda") if cuda else torch.device("cpu") for dtype in (torch.float, torch.double): # X, Y iid standard Normal and Z = X + Y, random vector (X, Y, Z) mean = torch.zeros(3, device=device, dtype=dtype) cov = torch.tensor( [[1, 0, 1], [0, 1, 1], [1, 1, 2]], device=device, dtype=dtype ) engine = MultivariateNormalQMCEngine(mean=mean, cov=cov, seed=12345) samples = engine.draw(n=2000) self.assertEqual(samples.dtype, dtype) self.assertEqual(samples.device.type, device.type) self.assertTrue(torch.all(torch.abs(samples.mean(dim=0)) < 1e-2)) self.assertTrue(torch.abs(torch.std(samples[:, 0]) - 1) < 1e-2) self.assertTrue(torch.abs(torch.std(samples[:, 1]) - 1) < 1e-2) self.assertTrue(torch.abs(torch.std(samples[:, 2]) - math.sqrt(2)) < 1e-2) for i in (0, 1, 2): _, pval = shapiro(samples[:, i].cpu().numpy()) self.assertGreater(pval, 0.9) cov = np.cov(samples.cpu().numpy().transpose()) self.assertLess(np.abs(cov[0, 1]), 1e-2) self.assertLess(np.abs(cov[0, 2] - 1), 1e-2) # check to see if X + Y = Z almost exactly self.assertTrue( torch.all( torch.abs(samples[:, 0] + samples[:, 1] - samples[:, 2]) < 1e-5 ) )
def most_normal_transformation(self, pid, trans_types=['none', 'sqrt', 'log', 'sqr', 'exp', 'arcsin_sqrt'], perform_trans=True, verbose=False): """ Performs the transformation which results in most normal looking data, according to Shapiro-Wilk's test """ #raw_values = self.phen_dict[pid]['values'] from scipy import stats shapiro_pvals = [] for trans_type in trans_types: if trans_type != 'none': if not self.transform(pid, trans_type=trans_type): continue phen_vals = self.get_values(pid) #print 'sp.inf in phen_vals:', sp.inf in phen_vals if sp.inf in phen_vals: pval = 0.0 else: r = stats.shapiro(phen_vals) if sp.isfinite(r[0]): pval = r[1] else: pval = 0.0 shapiro_pvals.append(pval) #self.phen_dict[pid]['values'] = raw_values if trans_type != 'none': self.revert_to_raw_values(pid) argmin_i = sp.argmax(shapiro_pvals) trans_type = trans_types[argmin_i] shapiro_pval = shapiro_pvals[argmin_i] if perform_trans: self.transform(pid, trans_type=trans_type) if verbose: print "The most normal-looking transformation was %s, with a Shapiro-Wilk's p-value of %0.6f" % \ (trans_type, shapiro_pval) return trans_type, shapiro_pval
def most_normal_transformation(self,trans_types=SUPPORTED_TRANSFORMATIONS, perform_trans=True, verbose=False): """ Performs the transformation which results in most normal looking data, according to Shapiro-Wilk's test """ from scipy import stats shapiro_pvals = [] for trans_type in trans_types: if trans_type == 'most_normal': continue if trans_type != 'none': if not self.transform(trans_type=trans_type): continue phen_vals = self.values #print 'sp.inf in phen_vals:', sp.inf in phen_vals if sp.inf in phen_vals: pval = 0.0 else: r = stats.shapiro(phen_vals) if sp.isfinite(r[0]): pval = r[1] else: pval = 0.0 shapiro_pvals.append(pval) if trans_type != 'none': self.revert_to_raw_values() argmin_i = sp.argmax(shapiro_pvals) trans_type = trans_types[argmin_i] shapiro_pval = shapiro_pvals[argmin_i] if perform_trans: self.transform(trans_type=trans_type) log.info("The most normal-looking transformation was %s, with a Shapiro-Wilk's p-value of %.2E" % \ (trans_type, shapiro_pval)) return trans_type, shapiro_pval
def distribution(self, gene, thresholdNorm):
    self.z, self.pval = stats.shapiro(gene[1:])
    if self.pval < thresholdNorm:
        # print('not normal distribution')
        return self.pval
    else:
        # print('normal')
        return self.pval
def test_normality_increase_lambert():
    # Generate random data and check that it is more normal after inference
    for i, y in enumerate([np.random.standard_cauchy(size=ns), experimental_data]):
        print("Distribution %d" % i)
        print("Before")
        print(("anderson: %0.3f\tshapiro: %0.3f" % (anderson(y)[0], shapiro(y)[0])).expandtabs(30))
        stats.probplot(y, dist="norm", plot=pylab)
        pylab.savefig("%d_before.png" % i)
        pylab.clf()

        tau = g.igmm(y)
        x = g.w_t(y, tau)

        print("After")
        print(("anderson: %0.3f\tshapiro: %0.3f" % (anderson(x)[0], shapiro(x)[0])).expandtabs(30))
        stats.probplot(x, dist="norm", plot=pylab)
        pylab.savefig("%d_after.png" % i)
        pylab.clf()
def plot_boxplots(df): # %% boxplot chemotherapy fig, ax = plt.subplots(figsize=(12, 10)) df_chemo = df.copy() df_chemo['Ablation Volume [ml] / Energy [kJ]'] = df_chemo[ 'Ablation Volume [ml]'] / df_chemo['Energy [kj]'] df_chemo.dropna(subset=['Ablation Volume [ml] / Energy [kJ]'], inplace=True) df_chemo.dropna(subset=['chemo_before_ablation'], inplace=True) df_chemo['chemo_before_ablation'].replace('No', False, inplace=True) df_chemo['chemo_before_ablation'].replace('Yes', True, inplace=True) df.dropna(subset=['Ablation Volume [ml]'], inplace=True) df.dropna(subset=['chemo_before_ablation'], inplace=True) df['chemo_before_ablation'].replace('No', False, inplace=True) df['chemo_before_ablation'].replace('Yes', True, inplace=True) # ttest no_chemo_df = df_chemo[df_chemo['chemo_before_ablation'] == False] no_chemo = no_chemo_df['Ablation Volume [ml]'].tolist() chemo_df = df_chemo[df_chemo['chemo_before_ablation'] == True] chemo = chemo_df['Ablation Volume [ml]'].tolist() fig, ax = plt.subplots(figsize=(12, 10)) plt.hist(no_chemo) plt.title('No Chemotherapy') plt.ylabel('Ablation Volume [ml]') figpathHist = os.path.join("figures", "histogram ablation volumes no chemo") gh.save(figpathHist, ext=['png'], close=True) fig1, ax = plt.subplots(figsize=(12, 10)) plt.hist(chemo) plt.title('Chemotherapy') plt.ylabel('Ablation Volume [ml] ') figpathHist = os.path.join("figures", "histogram ablation volumes chemo") gh.save(figpathHist, ext=['png'], close=True) print('no of tumors with chemo:', str(len(chemo))) print('no of tumors with no chemo:', str(len(no_chemo))) # stat, p_chemo = shapiro(chemo) # interpret alpha_chemo = 0.05 if p_chemo > alpha_chemo: msg = 'Sample Chemo looks Gaussian (fail to reject H0)' else: msg = 'Sample Chemo does not look Gaussian (reject H0)' print(msg) stat, p_no_chemo = shapiro(no_chemo) # interpret alpha_no_chemo = 0.05 if p_no_chemo > alpha_no_chemo: msg = 'Sample No Chemo looks Gaussian (fail to reject H0)' else: msg = 'Sample No Chemo does not look Gaussian (reject H0)' print(msg) if p_no_chemo < alpha_no_chemo and p_chemo < alpha_chemo: t, p = stats.mannwhitneyu(chemo, no_chemo) print( 'mann withney u test applied for samples coming from a non Gaussian distribution:' ) print("t = " + str(t)) print("p = " + str(p)) else: t, p = stats.ttest_ind(chemo, no_chemo) print('ttest applied for samples coming from a Gaussian distribution:') print("t = " + str(t)) print("p = " + str(p)) fig, ax = plt.subplots(figsize=(12, 10)) bp_dict = df.boxplot(column=['Ablation Volume [ml]'], ax=ax, notch=True, by='chemo_before_ablation', patch_artist=True, return_type='both') ax.set_xlabel('') plt.show() for row_key, (ax, row) in bp_dict.iteritems(): for i, box in enumerate(row['fliers']): box.set_marker('o') for i, box in enumerate(row['boxes']): if i == 0: box.set_facecolor('Purple') box.set_edgecolor('DarkMagenta') else: box.set_facecolor('LightPink') box.set_edgecolor('HotPink') for i, box in enumerate(row['medians']): box.set_color(color='Black') box.set_linewidth(2) for i, box in enumerate(row['whiskers']): box.set_color(color='Black') box.set_linewidth(2) xticklabels = [ 'No Chemotherapy before Ablation', 'Chemotherapy Administered before Ablation' ] xtickNames = plt.setp(ax, xticklabels=xticklabels) plt.setp(xtickNames, fontsize=10, color='black') plt.ylim([-2, 120]) plt.ylabel('Ablation Volume [ml]', fontsize=12, color='k') plt.tick_params(labelsize=10, color='black') ax.tick_params(colors='black', labelsize=10, color='k') ax.set_ylim([-2, 120]) plt.xlabel('') fig.suptitle('') 
plt.title('') # plt.title('Comparison of Ratio (Ablation Volumes [ml] : Energy [kJ]) from MAVERRIC Dataset by Chemotherapy', fontsize=12) plt.title( 'Comparison of Ablation Volumes [ml] from MAVERRIC Dataset by Chemotherapy', fontsize=12) figpathHist = os.path.join( "figures", "boxplot ablation volumes by chemo before ablation") gh.save(figpathHist, ext=['png'], close=True) # %% BOXPLOTS ABLATION VOLUMES # ttest df_volumes = df.copy() df_volumes.dropna(subset=['Ablation Volume [ml]'], inplace=True) df_volumes.dropna(subset=['Ablation Volume [ml] (manufacturers)'], inplace=True) ablation_vol = df_volumes['Ablation Volume [ml]'].tolist() ablation_vol_brochure = df_volumes[ 'Ablation Volume [ml] (manufacturers)'].tolist() stat, p_brochure = shapiro(ablation_vol_brochure) # interpret alpha_brochure = 0.05 if p_brochure > alpha_brochure: msg = 'Sample Ablation Volume Brochure looks Gaussian (fail to reject H0)' else: msg = 'Sample Ablation Volume Brochure does not look Gaussian (reject H0)' print(msg) stat, p_voxel = shapiro(ablation_vol) # interpret alpha_voxel = 0.05 if p_voxel > alpha_voxel: msg = 'Sample Ablation Volume looks Gaussian (fail to reject H0)' else: msg = 'Sample Ablation Volume does not look Gaussian (reject H0)' print(msg) if p_voxel < alpha_voxel and p_brochure < alpha_brochure: t, p = stats.mannwhitneyu(ablation_vol, ablation_vol_brochure) print( 'mann withney u test applied for samples coming from a non Gaussian distribution:' ) print("t = " + str(t)) print("p = " + str(p)) else: t, p = stats.ttest_ind(ablation_vol, ablation_vol_brochure) print('ttest applied for samples coming from a Gaussian distribution:') print("t = " + str(t)) print("p = " + str(p)) fig, ax = plt.subplots(figsize=(12, 10)) bp_dict = df.boxplot(column=[ 'Ablation Volume [ml]', 'Ablation Volume [ml] (parametrized_formula)', 'Ablation Volume [ml] (manufacturers)' ], ax=ax, notch=True, patch_artist=True, return_type='both') ax.set_xlabel('') row = bp_dict.lines # for idx,row in enumerate(lines): for i, box in enumerate(row['fliers']): box.set_marker('o') # box.set_edgecolor('RoyalBlue') for i, box in enumerate(row['boxes']): if i == 0: box.set_facecolor('Blue') box.set_edgecolor('MediumBlue') elif i == 1: box.set_facecolor('BlueViolet') box.set_edgecolor('BlueViolet') elif i == 2: box.set_facecolor('DeepSkyBlue') box.set_edgecolor('DodgerBlue') for i, box in enumerate(row['medians']): box.set_color(color='Black') box.set_linewidth(2) for i, box in enumerate(row['whiskers']): box.set_color(color='Black') box.set_linewidth(2) xticklabels = [ 'Ablation Volume [ml] (Voxel-Based)', 'Ablation Volume [ml] (Ellipsoid Formula)', 'Ablation Volume [ml] (Manufacturers Brochure)' ] xtickNames = plt.setp(ax, xticklabels=xticklabels) plt.setp(xtickNames, fontsize=10, color='black') plt.ylim([-2, 150]) plt.ylabel('Ablation Volume [ml]', fontsize=14, color='k') plt.tick_params(labelsize=10, color='black') ax.tick_params(colors='black', labelsize=10, color='k') ax.set_ylim([-2, 150]) plt.title('Comparison of Ablation Volumes [ml] from MAVERRIC Dataset', fontsize=16) figpathHist = os.path.join("figures", "boxplot volumes") gh.save(figpathHist, ext=['png'], close=True)
def nortest(df, a):
    _, sw = shapiro(df[a])
    _, ap = normaltest(df[a])
    index = ['Shapiro-Wilk', 'D\'Agostino-Pearson']
    columns = ['p-value']
    # p-values listed in the same order as the index labels
    return pd.DataFrame([sw, ap], index=index, columns=columns)
df = pd.read_csv('datasets/cats-data.csv', sep=",", index_col=0)
print(df)

df_female = df[df["Sex"] == "F"]
df_male = df[df["Sex"] == "M"]


def test_normal_distribution(p_group, alpha):
    return p_group > alpha


def test_hipothesis(p, alpha):
    return p > alpha


W, p_female = st.shapiro(df_female["Hwt"])
print('For female cats the normal distribution test result is:',
      test_normal_distribution(p_female, alpha))

W, p_male = st.shapiro(df_male["Hwt"])
print('For male cats the normal distribution test result is:',
      test_normal_distribution(p_male, alpha))

t, p = st.ttest_ind(df_female["Hwt"], df_male["Hwt"])
hypothesis_result = test_hipothesis(p, alpha)
print("Hypothesis that heart weight is equal for male and female cats is:",
      hypothesis_result)


def display_hist(data_female, data_male):
    # plot both groups on one histogram
    data_female.plot.hist(bins=40)
    data_male.plot.hist(bins=40)
    plt.legend(loc="upper right")
# H0 : M1 = M2 ("There is no statistically significant difference between the Purchase averages of the two groups.")
# H1 : M1 != M2 ("There is a statistically significant difference between the Purchase averages of the two groups.")
"""
# 2. Assumption Control

# 2.1. Normality Assumption (shapiro)
# The Shapiro-Wilk test is used for the assumption of normality.
"""
# Defining the hypotheses for the assumption of normality.
# H0 : The normality assumption holds for this sample.
# H1 : The normality assumption does not hold for this sample.
"""
hf.hypothesis_test(shapiro(A))  # P-value = 0.5891, so H0 can NOT be REJECTED!
hf.hypothesis_test(shapiro(B))  # P-value = 0.1541, so H0 can NOT be REJECTED!
# The normality assumption holds for both samples.

# 2.2 Variance Homogeneity Assumption (levene)
"""
# Defining the hypotheses for the variance homogeneity assumption.
# H0 : The variance homogeneity assumption holds.
# H1 : The variance homogeneity assumption does NOT hold.
"""
hf.hypothesis_test(stats.levene(A, B))  # P-value = 0.1083, so H0 can NOT be REJECTED!
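# A self-contained sketch of the same assumption-check flow using scipy only.
# The helper hf.hypothesis_test and the real A/B samples are not shown above, so the
# data here are synthetic and the helper is replaced by plain p-value comparisons.
import numpy as np
from scipy import stats
from scipy.stats import shapiro

rng = np.random.default_rng(0)
A = rng.normal(loc=550, scale=40, size=40)   # synthetic "control" purchases
B = rng.normal(loc=570, scale=40, size=40)   # synthetic "test" purchases

normal = shapiro(A)[1] > 0.05 and shapiro(B)[1] > 0.05   # normality assumption
equal_var = stats.levene(A, B)[1] > 0.05                 # variance homogeneity

if normal:
    # Student's t-test if variances are equal, Welch's t-test otherwise
    stat, p = stats.ttest_ind(A, B, equal_var=equal_var)
else:
    # fall back to the non-parametric Mann-Whitney U test
    stat, p = stats.mannwhitneyu(A, B)
print('p-value:', p)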
def q1():
    # Return the answer to question 1 here.
    p_value = sct.shapiro(sample_height)[1]
    return bool(p_value > 0.05)
def q1():
    _, pvalue = sct.shapiro(get_sample(athletes, 'height', n=3000))
    return pvalue > 0.05
interaction_len.append(max(times) if len(times) == 2 else sum(times[1:])) interaction_seq.append(interaction_count) appear_seq.append(appear_count) return [interaction_seq[i] / appear_seq[i] for i in range (len(interaction_seq))], \ interaction_len inter_per_class = {0:[], 1:[]} propor_b, len_b = interact_length(inter_before, labels[:197]) propor_a, len_a = interact_length(inter_after, labels[197:]) # %% # Normality | Non-parametric tests import scipy.stats as stats statistics_b, pvals = stats.shapiro(propor_b) print (f'p-value (Shapiro Before): {pvals}, S: {statistics_b}') print (f'df: {len(propor_b)}') statistics_a, pvals = stats.shapiro(propor_a) print (f'p-value (Shapiro After): {pvals}, S: {statistics_a}') print (f'df: {len(propor_a)}') statistics_u, pvals = stats.mannwhitneyu(propor_b, propor_a, alternative = 'less') print ('p-value (Mann-Whitney U test): \t', pvals) print (f'Before: {np.median(propor_b)}, After: {np.median(propor_a)}') print (f'U: {statistics_u}') # %% df = pd.DataFrame(columns = ['Proportion', 'Treatment', 'Color']) df['Treatment'] = ['Dataset 1'] * len(propor_b) + ['Dataset 2'] * len(propor_a)
y_pred_L = gauss_to_pi(y_pred_gauss_mid_all, y_pred_gauss_dev_all, n_std_devs) # work out metrics y_U_cap = y_pred_U > y_val.reshape(-1) y_L_cap = y_pred_L < y_val.reshape(-1) y_all_cap = y_U_cap * y_L_cap PICP = np.sum(y_all_cap) / y_L_cap.shape[0] MPIW = np.mean(y_pred_U - y_pred_L) y_pred_mid = np.mean((y_pred_U, y_pred_L), axis=0) MSE = np.mean(np.square(Gen.scale_c * (y_pred_mid - y_val[:, 0]))) RMSE = np.sqrt(MSE) CWC = np_QD_loss(y_val, y_pred_L, y_pred_U, alpha, soften, lambda_in) neg_log_like = gauss_neg_log_like(y_val, y_pred_gauss_mid, y_pred_gauss_dev, Gen.scale_c) residuals = residuals = y_pred_mid - y_val[:, 0] shapiro_W, shapiro_p = stats.shapiro(residuals[:]) results_runs.append( (PICP, MPIW, CWC, RMSE, neg_log_like, shapiro_W, shapiro_p)) # concatenate for graphs title = 'PICP=' + str(round(PICP,3))\ + ', MPIW=' + str(round(MPIW,3)) \ + ', qd_loss=' + str(round(CWC,3)) \ + ', NLL=' + str(round(neg_log_like,3)) \ + ', alpha=' + str(alpha) \ + ', loss=' + NN.loss_type \ + ', data=' + type_in + ',' \ + '\nh_size=' + str(NN.h_size) \ + ', bstraps=' + str(n_bootstraps) \ + ', ensemb=' + str(n_ensemble) \ + ', RMSE=' + str(round(RMSE,3)) \
# (orphaned excerpt of the scipy.stats.shapiro docstring: shapiro returns W, the test
#  statistic, and the p-value for the hypothesis test; with `reta=True` it also returns
#  the internally computed "a" values.)

from scipy import stats
# requires the old matplotlib.finance module (removed from modern matplotlib)
from matplotlib.finance import quotes_historical_yahoo
import numpy as np

ticker = 'IBM'
begdate = (2009, 1, 1)
enddate = (2013, 12, 31)

p = quotes_historical_yahoo(ticker, begdate, enddate, asobject=True, adjusted=True)
ret = (p.aclose[1:] - p.aclose[:-1]) / p.aclose[1:]
print('ticker=', ticker, 'W-test, and P-value')
print(stats.shapiro(ret))
### Normality Tests

##### Histogram
# In repository - Python-DataScience-CookBook/Exploratory Data Analysis.py
import seaborn as sns
sns.distplot(Df.Var.dropna())

##### Q-Q Plot
import numpy as np
import pylab
import scipy.stats as stats
stats.probplot(Df.Var, dist="norm", plot=pylab)
pylab.show()

##### Normal Test (D'Agostino's K^2)
k2, p = stats.normaltest(Energy.x)  # k2 is the statistic value; p-value > 0.05 implies the data are normally distributed

##### Shapiro-Wilk Test - https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html
from scipy import stats
w, p = stats.shapiro(Df.Var)

##### Kolmogorov-Smirnov test for goodness of fit
stats.kstest(Df.Var, 'norm')

##### Anderson-Darling Test
stats.anderson(Df.Var, 'norm')

### Correlation Tests
# H0: Two samples are independent
# H1: There is a dependency between the samples
from scipy.stats import pearsonr, spearmanr, kendalltau

##### Pearson's Correlation Coefficient
corr, p = pearsonr(Df.Var1, Df.Var2)

##### Spearman's Rank Correlation
corr, p = spearmanr(Df.Var1, Df.Var2)

##### Kendall's Rank Correlation
corr, p = kendalltau(Df.Var1, Df.Var2)
df = df.dropna()
df = df.iloc[:, 2:].apply(
    lambda x: x.astype(str).str.replace(',', '.').astype(float))

# Test the correlation between GDP per capita and phone usage rate
sns.regplot(x='GDP ($ per capita)', y='Phones (per 1000)', data=df)
plt.show()
'''
The scatter looks roughly linear, but the spread changes with the value of the
variable, so the relationship does not look homoscedastic. We also need to check
whether the two variables are approximately normally distributed.
scipy.stats offers several normality tests, e.g. normaltest(), shapiro() and
kstest(rvs='norm'); here we use shapiro() to test, separately, whether GDP per
capita and phone usage rate are normally distributed.
Null hypothesis: the sample comes from a normally distributed population.
Alternative hypothesis: the sample does not come from a normally distributed population.
'''
print(stats.shapiro(df['GDP ($ per capita)']))
# (0.8052586317062378, 3.5005310282387736e-14)
print(stats.shapiro(df['Phones (per 1000)']))
# (0.8678628206253052, 2.0484371143769664e-11)
# The result is a tuple with the statistic W and the p-value. Both p-values are tiny
# (close to 0), so we reject the null hypothesis: neither GDP per capita nor phone
# usage rate is normally distributed.

# Compute the correlation coefficient with pandas
'''
Low correlation:      0 <= |r| <= 0.3
Moderate correlation: 0.3 <= |r| <= 0.8
High correlation:     0.8 <= |r| <= 1
'''
# Since neither GDP per capita nor phone usage rate is normally distributed,
# the Pearson coefficient is not appropriate
print(df['GDP ($ per capita)'].corr(df['Phones (per 1000)'], method='pearson'))
# 0.88352010541116632
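# A short follow-up sketch (not in the original snippet, and it reuses the df defined
# above): since the comment says the Pearson coefficient is not appropriate for these
# non-normal variables, a rank-based coefficient is the consistent choice, and pandas
# supports it directly.
print(df['GDP ($ per capita)'].corr(df['Phones (per 1000)'], method='spearman'))
# Kendall's tau is another rank-based option:
print(df['GDP ($ per capita)'].corr(df['Phones (per 1000)'], method='kendall'))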
###########################1###########################
import pandas as pd
import scipy
from scipy import stats

cutlets = pd.read_csv("C:\\Users\\jzsim\\Downloads\\Cutlets.csv")

# As there are 2 populations here compared with each other,
# check whether both follow a normal distribution.
# Do so with the Shapiro-Wilk test.
# H0 : Following a normal distribution
# Ha : Not following a normal distribution

print(stats.shapiro(cutlets['Unit A']))
# p-value: 0.3199819028377533
# As the p-value is greater than 0.05 -> P high, Null fly

print(stats.shapiro(cutlets['Unit B']))
# p-value: 0.3199819028377533
# As the p-value is greater than 0.05 -> P high, Null fly

# AS BOTH P-VALUES ARE GREATER THAN 0.05 -> P HIGH, NULL FLY
# DATA IS FOLLOWING A NORMAL DISTRIBUTION
# Are external conditions the same --> No

# Checking whether variances are equal or not
# H0 : Variances are equal
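# A hedged sketch of the next step the comments above set up (not from the original
# snippet, and it reuses the cutlets DataFrame loaded above): Levene's test for equal
# variances, followed by the two-sample t-test on the same 'Unit A' / 'Unit B' columns.
levene_stat, levene_p = stats.levene(cutlets['Unit A'], cutlets['Unit B'])
print('Levene p-value:', levene_p)   # if p > 0.05, variances can be treated as equal

t_stat, t_p = stats.ttest_ind(cutlets['Unit A'], cutlets['Unit B'],
                              equal_var=levene_p > 0.05)
print('two-sample t-test p-value:', t_p)
# if p > 0.05, fail to reject H0 (no significant difference between Unit A and Unit B)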
def resid_proc(reis, remove_zero_wt, grpfiles, pareto, groups_rei): print "aggregating statistics and plotting by observation group..." print "PEST iteration:" for cf in reis: print '{0} '.format(cf), infile = reis[cf] # open a pointer to the output file rei_summary_folder='residuals_summaries' if not os.path.exists(rei_summary_folder): os.makedirs(rei_summary_folder) ofp = open(os.path.join(rei_summary_folder,infile + '_residuals_summary.dat'),'w') ofp.write('Residuals Summary information for -> ' + infile + '\n') # read in the data alldat = np.genfromtxt(infile,names=True,skip_header=4,dtype=None) # if processing PEST pareto results, read in groups from another REI if pareto: try: rei_groups_df = pd.read_csv(groups_rei, delim_whitespace=True, skiprows=6, index_col='Name') #if np.isnan(np.max(rei_groups_df.ix[:,0])): #rei_groups_df = rei_groups_df[rei_groups_df.columns[1:]] # for observations that were read in, reassign the entry in 'Group' column to group from other REI for observation in alldat: observation['Group'] = rei_groups_df.ix[observation['Name'], 'Group'] except IOError: print "Cannot open {0}. Please provide an non-pareto REI file so that observations can be analyzed by group." quit() # find the unique list of groups by which plots and stats will be managed allgrps = np.unique(alldat['Group']) allgrps = [g for g in allgrps if 'regul' not in g] # loop over the groups for cg in allgrps: # identify indices of the current group tmpinds = np.nonzero(alldat['Group']==cg)[0] if remove_zero_wt: inds = tmpinds[np.nonzero(alldat['Weight'][tmpinds] != 0)] # not sure what the "remove_zero_weight" option is for, but for groups # that are zero weighted, it results in an empty "inds" array, causing python to crash if len(inds)==0: inds = tmpinds else: inds = tmpinds # pull out the measured values for the group cmeas = alldat['Measured'][inds] # pull out the modeled values for the group cmod = alldat['Modelled'][inds] #get some values to limit plotting areas try: cmin = np.min([cmeas,cmod]) cmax = np.max([cmeas,cmod]) # if the last rei is from an iteration where PEST failed, will have unreasonable values (i.e. -1e300) # that will cause a TypeError here except TypeError: continue # now calculate statistics on the residuals # first grab the residuals cres = alldat['Residual'][inds] # next calculate the relevant statistics and write to the output file cmean = np.mean(cres) cstd = np.std(cres) cvar = np.var(cres) cmed = np.median(cres) cmin = np.min(cres) camin = np.min(np.abs(cres)) cmax = np.max(cres) camax = np.max(np.abs(cres)) if len(grpfiles) > 1: # make a plot of modeled vs. 
                # measured
                plt.figure()
                plt.hold = True
                plt.plot(cmeas, cmod, 'bx')
                plt.plot([cmin, cmax], [cmin, cmax], 'r')
                plt.title('Observation Group "%s", PEST iteration %s' % (cg, cf))
                plt.xlabel('Measured')
                plt.ylabel('Modeled')
                # append the plot into the proper PDF file
                grpfiles[cg][0].savefig()
                # plt.close()

            # finally plot the histogram and save it
            fig = plt.figure()
            ax = fig.add_subplot(111)
            n, bins, patches = ax.hist(cres, 50, facecolor='blue', alpha=0.75)
            ax.set_xlabel('Residual Value')
            ax.set_ylabel('Count')
            ax.set_title(cg + ' iteration ' + str(cf))
            ax.set_xlim([cmin, cmax])
            # append the histograms into the proper PDF file
            grpfiles[cg][-1].savefig()
            # plt.close()

            # perform the Shapiro-Wilk test for normality of the residuals
            if len(cres) > 2:
                W, p = shapiro(cres)
            else:
                p = -99999

            # write to the summary output file
            ofp.write(25 * '#' + '\n')
            ofp.write('Summary Statistics for Residuals: -> group ' + cg + '\n')
            ofp.write('%14s : %f\n' % ('mean', cmean))
            ofp.write('%14s : %f\n' % ('median', cmed))
            ofp.write('%14s : %f\n' % ('std deviation', cstd))
            ofp.write('%14s : %f\n' % ('variance', cvar))
            ofp.write('%14s : %f\n' % ('min', cmin))
            ofp.write('%14s : %f\n' % ('max', cmax))
            ofp.write('%14s : %f\n' % ('min (absolute)', camin))
            ofp.write('%14s : %f\n' % ('max (absolute)', camax))
            # p > 0.05: fail to reject normality of the residuals
            if p > 0.05:
                ofp.write('Residuals are normally distributed\n')
                ofp.write('p-value = %f' % (p))
            elif p < -99:
                ofp.write('Residuals normality not calculable: Too few residuals in group\n')
            else:
                ofp.write('Residuals are not normally distributed\n')
                ofp.write('p-value = %f' % (p))
            ofp.write(3 * '\n')
        ofp.close()

    # close the PDF files
    for cg in grpfiles:
        for i in range(len(grpfiles[cg])):
            grpfiles[cg][i].close()
# Overlay of the regression lines for Oecanthus exclamationis (red) and Oecanthus niveus (blue)
ax1 = sns.regplot(x="TempEx", y="ImpulsionEx", data=Crickets, color='r')
ax2 = sns.regplot(x="TempNiv", y="ImpulsionNiv", data=Crickets, color='b')

# The regression line for Oecanthus exclamationis is higher than the line for Oecanthus niveus;
# this means Oecanthus exclamationis would have a higher pulse rate at any temperature.

# The first null hypothesis of the ANCOVA is that the slopes of the regression lines are all
# equal; in other words, that the regression lines are parallel to each other. We will accept
# the null hypothesis that the regression lines are parallel and test the second null
# hypothesis: that the intercepts of the regression lines are all the same.

# The slopes are not significantly different (P=0.25); the common slope is 3.60, which lies
# between the slopes of the separate lines (3.52 and 3.75). I did not manage to test this
# hypothesis in this part.

# ANCOVA makes the same assumptions as linear regression: normality and homoscedasticity of Y
# for each value of X, and independence. Let's at least check the normality assumption.

# In[202]:

# Shapiro test for Oecanthus exclamationis
stats.shapiro(model1.resid)

# W = 0.9727, p = 0.9105, so the residuals follow a normal distribution for Oecanthus exclamationis

# In[203]:

# Shapiro test for Oecanthus niveus
stats.shapiro(model2.resid)

# W = 0.9159, p = 0.1259, so the residuals follow a normal distribution for Oecanthus niveus

# Now let's run a Tukey test under the assumption that their slopes are all the same

# In[188]:

from statsmodels.stats.multicomp import pairwise_tukeyhsd
def meta_process(tau): ''' Main processing kernel ''' print('Analyzing tau (ms): ', tau) import warnings warnings.filterwarnings('ignore') # Folder where you store the PLT positions (center of mass - COM) per DNS time steps data_location = which_bodies + '_tau_' + str(tau) + '/' numBodies = 0 Bodies = [] for log in sorted(glob.glob(data_location + '*_ComPos.log'), key=numericalSort): Bodies.append(numBodies) numBodies += 1 # Build the data frames and fill them absolute_pos = pd.DataFrame(columns=Bodies, dtype=np.float64) distFromWalls = pd.DataFrame(columns=Bodies, dtype=np.float64) MSD = pd.DataFrame(columns=Bodies, dtype=np.float64) # Perform distributions checking in zones zones_vels = [] zones_distros = [] zones_MSD = [] zones_distFromWalls = [] for z in range(zones_): zones_distros.append(np.array([], dtype=np.float64)) zones_vels.append(np.array([], dtype=np.float64)) zones_MSD.append(np.array([], dtype=np.float64)) zones_distFromWalls.append(np.array([], dtype=np.float64)) # Var that help us find the mean free path/ time (MFP/T) in comparison with the ground truth (gT, path from DNS) integrals_tau = [] numBodies = 0 for log in sorted(glob.glob(data_location + '*_ComPos.log'), key=numericalSort): df = pd.read_csv(log, delimiter=',', header=None, names=names_, usecols=usecols_, dtype={'t': np.float64, 'y': np.float64, 'z': np.float64}) # Time in the original files interprets to how many DNS fluid time steps, # this is why we multiply here with DNS fluid time step to convert it into physical time in ms df = df.loc[df['t']*dt_f >= From_] df = df.loc[df['t']*dt_f <= To_] df = df.reset_index(drop=True) absolute_pos[numBodies] = df['y'].copy() if (do_what == 'MFP'): integrals_tau.append(np.trapz(df['y'], df['t']*dt_f)) if ( (do_what == 'distros') or (do_what == 'MSD') or (do_what == 'distFromWalls') ): MSD[numBodies] = pd.Series((df['y'] - df['y'].iloc[0]) * (df['y'] - df['y'].iloc[0])) distFromWalls[numBodies] = df['y'].apply(lambda y: (y - bottom_wall) if ( (y - bottom_wall) < (top_wall - y) ) else (top_wall - y)) pos = absolute_pos[numBodies].to_numpy() pos_rolled = np.roll(pos, 1) # velocity in um/ms vel = (pos - pos_rolled) / tau vel[0] = np.nan # Exclude erroneous jumps dp = np.absolute(pos-pos_rolled) inds = np.where(dp < ((top_wall - bottom_wall) - 5.0)) pos = pos[inds] vel = vel[inds] inds = np.where((~np.isnan(vel)) & (~np.isinf(vel))) pos = pos[inds] vel = vel[inds] zones_tmp = np.linspace(bottom_CFL, top_CFL, zones_+1) for z in range(zones_): inds = np.where((pos >= zones_tmp[z]) & (pos < zones_tmp[z+1])) zones_distros[z] = np.append(zones_distros[z], vel[inds]) numBodies += 1 ####################################################################### ####################################################################### df_t = np.arange(From_, To_+tau, tau) ####################################################################### ####################################################################### # Compute MSD & distFromWalls per Zone if ( (do_what == 'distros') or (do_what == 'MSD') or (do_what == 'distFromWalls') ): zones_tmp = np.linspace(bottom_CFL, top_CFL, zones_+1) MSD_t_avg = [] distFromWalls_t_avg = [] for z in range(zones_): MSD_t_avg.append([]) distFromWalls_t_avg.append([]) for i, t_ in enumerate(df_t): for b_ in range(len(Bodies)): try: pos = absolute_pos[b_].iloc[i] except: continue for z in range(zones_): if ( (pos >= zones_tmp[z]) and (pos < zones_tmp[z+1]) ): MSD_t_avg[z].append(MSD[b_].iloc[i]) distFromWalls_t_avg[z].append(distFromWalls[b_].iloc[i]) for z in 
range(zones_): # If no particles in the zone, then np.mean returns nan zones_MSD[z] = np.append(zones_MSD[z], np.mean(MSD_t_avg[z])) MSD_t_avg[z] = [] zones_distFromWalls[z] = np.append(zones_distFromWalls[z], np.mean(distFromWalls_t_avg[z])) distFromWalls_t_avg[z] = [] # Cleaning for z in range(zones_): if (np.where(np.isnan(zones_MSD[z]))[0].shape[0] != 0): zones_MSD[z] = zones_MSD[z][:np.where(np.isnan(zones_MSD[z]))[0][0]] if (np.where(np.isnan(zones_distFromWalls[z]))[0].shape[0] != 0): zones_distFromWalls[z] = zones_distFromWalls[z][:np.where(np.isnan(zones_distFromWalls[z]))[0][0]] for z in range(zones_): from scipy import optimize # non_linear fitting def non_linear_(x, a, b): return a*np.power(x, b) # linear fitting def linear_(x, a, b): return a*x + b # MSD Y = zones_MSD[z] X = np.copy(df_t)[:Y.shape[0]] X -= From_ best_vals_non_linear, _ = optimize.curve_fit(non_linear_, X, Y) #best_vals_linear , _ = optimize.curve_fit(linear_ , X, Y) zones_MSD[z] = tuple(best_vals_non_linear) if (do_what == 'MSD'): # Dump data #np.savetxt(fName_ + '.csv', np.array(list(zip(X,Y))), delimiter=',') plt.plot(X,Y) plt.plot(X, non_linear_(X, *best_vals_non_linear), linestyle='--', label='non-linear (a*x^b), params as [a,b] : '+str(best_vals_non_linear)) #plt.plot(X, linear_(X, *best_vals_linear) , linestyle='--', label='linear (a*x + b), params as [a,b] : '+str(best_vals_linear)) plt.legend() plt.show() # distFromWalls Y = zones_distFromWalls[z] X = np.copy(df_t)[:Y.shape[0]] X -= From_ #best_vals_non_linear, _ = optimize.curve_fit(non_linear_, X, Y) best_vals_linear , _ = optimize.curve_fit(linear_ , X, Y) zones_distFromWalls[z] = tuple(best_vals_linear) if (do_what == 'distFromWalls'): # Dump data #np.savetxt(fName_ + '.csv', np.array(list(zip(X,Y))), delimiter=',') plt.plot(X,Y) #plt.plot(X, non_linear_(X, *best_vals_non_linear), linestyle='--', label='non-linear (a*x^b), params as [a,b] : '+str(best_vals_non_linear)) plt.plot(X, linear_(X, *best_vals_linear) , linestyle='--', label='linear (a*x + b), params as [a,b] : '+str(best_vals_linear)) plt.legend() plt.show() ####################################################################### ####################################################################### if (do_what == 'distros'): # significance level for p-values sign_lvl = 0.1 # For the PLT random walk simulations distros_invECDF = [] distros_tail = [] xmins = [] zones_tmp = np.linspace(bottom_CFL, top_CFL, zones_+1) for z in range(zones_): print("#######################################################################") print("Zone ", z) print("Limits: (", zones_tmp[z], ",", zones_tmp[z+1], ")") print('------------------------------------------------------------') data = np.absolute(zones_distros[z]) print("Mean absolute velocity (current zone) [um/ms] : ", np.mean(data)) print("Diffusion Coefficient (v^2*dt*0.5) [um^2/ms] : ", (np.mean(data)**2.)*tau*0.5) print("MSD non-linear fitting (a*x^b), params as (a,b) [um^2,ms] : ", zones_MSD[z]) print("Avg Distance from Walls linear fitting (a*x + b), params as (a,b) [um,ms] : ", zones_distFromWalls[z]) print('------------------------------------------------------------') print("Checking for sign.") data = zones_distros[z] sign_ = np.sign(data) positive_ = sign_[sign_ > 0.] negative_ = sign_[sign_ < 0.] print('Positive velocities (%) : ' , round(positive_.shape[0]/sign_.shape[0], 2) * 100.) print('Negative velocities (%) : ' , round(negative_.shape[0]/sign_.shape[0], 2) * 100.) 
print('------------------------------------------------------------') print("Checking for normality.") not_normal = 0 normal = 0 # Shapiro-Wilk Test stat, p = stats.shapiro(data) if (p > sign_lvl): normal += 1 else: not_normal += 1 # D’Agostino’s K^2 Test stat, p = stats.normaltest(data) if (p > sign_lvl): normal += 1 else: not_normal += 1 # Anderson-Darling Test result = stats.anderson(data) for i in range(len(result.critical_values)): if result.statistic < result.critical_values[i]: normal += 1 else: not_normal += 1 kurt = stats.kurtosis(data) print('kurtosis of dataset (whole range, i.e., body & tail) : ', kurt) print('Number of successful normality tests : ', normal) print('Number of failed normality tests : ', not_normal) print("End of Checking for normality.") print('------------------------------------------------------------') print("Analyze the tail of the distribution.") data = np.absolute(zones_distros[z]) from statsmodels.distributions.empirical_distribution import ECDF, monotone_fn_inverter data.sort() # in-place sorting ecdf = ECDF(data) inv_ecdf = monotone_fn_inverter(ecdf, data) distros_invECDF.append({'inv_ecdf':inv_ecdf, 'lb':ecdf(np.min(data)), 'ub':ecdf(np.max(data))}) ####################################################################### tail_P = 0.90 # no need to search the whole domain for the lower bound (x_min). Search from the 90th percentile and above. print("Number of samples to do statistics (whole range, i.e., body & tail) : ", data.shape[0]) print("Number of samples to do statistics (tail-only) : ", data[data >= inv_ecdf(tail_P)].shape[0]) ####################################################################### print('------------------------------------------------------------') # https://en.wikipedia.org/wiki/Heavy-tailed_distribution#Common_heavy-tailed_distributions # We focus on fat-tails and more specifically on power laws (see paper for more) # heavy-tails term: kept it for legacy reasons wikipedia_heavy_tailed_distros = [ 'halfcauchy', 'burr12', 'burr', 'pareto', 'lognorm', 'weibull_min', 'fisk', 'invweibull', 'levy', 'invgauss' # see Klaus_2011 (Statistical Analyses Support Power Law Distributions Found in Neuronal Avalanches) ] handpicked_distros = wikipedia_heavy_tailed_distros + ['expon', 'halfnorm'] for dist_name in handpicked_distros: print(dist_name) distro = eval('stats.' + dist_name) ''' if (distro.numargs >= 2): print('Skip distro.') print('Avoid overfitting from distros with multiple parameters (numargs >= 2).') print('------------------------------------------------------------') continue ''' if ( (distro.a < 0.) or (distro.b != np.inf) ): print('Skip distro.') print('Bounds not appropriate.') print('------------------------------------------------------------') continue ####################################################################### # Optimal fitting # Computationally expensive part! if (dist_name != 'halfnorm'): xmin_optimal = find_xminOpt_distro(data[data >= inv_ecdf(tail_P)], dist_name) else: xmin_optimal = 0. ####################################################################### ####################################################################### # Relaxed fitting based on optimal one # When ecdf(xmin_opt) > 95%, it's a good idea to try a relaxed version # at ecdf(xmin_opt) ~ 90% if (dist_name != 'halfnorm'): # round down tail_i = 0.04 xmin_relaxed_lb = inv_ecdf( (int(ecdf(xmin_optimal)*10.) / 10.) - tail_i/2. ) xmin_relaxed_ub = inv_ecdf( (int(ecdf(xmin_optimal)*10.) / 10.) + tail_i/2. 
) # More educated choice of xmin_relaxed xmin_relaxed = find_xminOpt_distro(data[data >= xmin_relaxed_lb], dist_name, xmin_relaxed_ub) else: xmin_relaxed = 0. ####################################################################### data_optimal = data[data >= xmin_optimal] params_optimal = distro.fit(data_optimal) data_relaxed = data[data >= xmin_relaxed] params_relaxed = distro.fit(data_relaxed) #*** KS-test p_val_optimal = stats.kstest(data_optimal, dist_name, params_optimal)[1] p_val_relaxed = stats.kstest(data_relaxed, dist_name, params_relaxed)[1] #*** strongly_rejected_opt = False negative_d = 'None' negative_p = 1. for dist_name_ in handpicked_distros: if (dist_name_ == dist_name): continue # Check dist_name vs dist_name_ # Which model is better fit LLR, p = LLR_test(data_optimal, dist_name, dist_name_) if ( (LLR < 0.) and (p < negative_p) ): negative_d = dist_name_ negative_p = p # significance lvl as in Klaus_2011 (Statistical Analyses Support Power Law Distributions Found in Neuronal Avalanches) if ( negative_p < 0.01 ): strongly_rejected_opt = True print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.') print('Optimal fitting ') print('Number of samples xmin_optimal : ', data_optimal.shape[0]) print('params_optimal : ', params_optimal) print('xmin_optimal : ', xmin_optimal) print('ecdf(xmin_optimal) : ', round(ecdf(xmin_optimal)*100, 2), ' (%)') print('(p-val) kstest - tail only - xmin_optimal : ', round(p_val_optimal, 2)) print(' . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .') print('strongly_rejected : ', 'True' if (strongly_rejected_opt) else 'False') print('As good as possible alternative (dist,p) : ', (negative_d, round(negative_p,5))) print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.') print('Relaxed fitting ') print('Number of samples xmin_relaxed : ', data_relaxed.shape[0]) print('params_relaxed : ', params_relaxed) print('xmin_relaxed : ', xmin_relaxed) print('ecdf(xmin_relaxed) : ', round(ecdf(xmin_relaxed)*100, 2), ' (%)') print('(p-val) kstest - tail only - xmin_relaxed : ', round(p_val_relaxed, 2)) print(' . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .') relaxed_accept = 0 repeat_ = 2500 # See Clauset_2009 (Power-Law Distributions in Empirical Data) for _ in range(repeat_): synthetic_data = inv_ecdf(np.random.uniform(ecdf(np.min(data)), ecdf(np.max(data)), size=data.shape[0])) toCompare_with = inv_ecdf(np.random.uniform(ecdf(np.min(data)), ecdf(np.max(data)), size=data.shape[0])) # 1. optimal model: simulates the observed data with the ecdf up to xmin_optimal and then with the selected distro # 2. relaxed model: simulates the observed data with the ecdf up to xmin_relaxed and then with the selected distro # The reference model is the optimal one. 
        optimal_model = np.copy(synthetic_data)
        inds = np.where(optimal_model >= xmin_optimal)
        optimal_model[inds] = distro.rvs(*params_optimal, size=inds[0].shape[0])
        optimal_model = optimal_model[(~np.isnan(optimal_model)) & (~np.isinf(optimal_model))]
        optimal_model = optimal_model[optimal_model < (((top_wall - bottom_wall) - 5.0) / tau)]
        D_opt = astats.kuiper_two(toCompare_with, optimal_model)[0]

        relaxed_model = np.copy(synthetic_data)
        inds = np.where(relaxed_model >= xmin_relaxed)
        relaxed_model[inds] = distro.rvs(*params_relaxed, size=inds[0].shape[0])
        relaxed_model = relaxed_model[(~np.isnan(relaxed_model)) & (~np.isinf(relaxed_model))]
        relaxed_model = relaxed_model[relaxed_model < (((top_wall - bottom_wall) - 5.0) / tau)]
        D_rel = astats.kuiper_two(toCompare_with, relaxed_model)[0]

        if (D_rel <= D_opt):
            relaxed_accept += 1

    p_val_relaxed = round(relaxed_accept / repeat_, 2)
    print('p-val of relaxed model : ', p_val_relaxed)

    print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.')
    print('One-Zone Simulation for optimal model.')
    # int(4808*0.82): 4808 is the number of activated PLTs per ul (see Chopard_2017 - A physical description of the adhesion and aggregation of platelets). We deal with 0.82 ul -> 4808*0.82
    # tau is the time step of the random walks in ms
    # 820 um is the height of the Impact-R PLT function analyser (hence the *0.82)
    PLTs_ = PLTs(int(4808 * 0.82), tau, 820.0,
                 [{'inv_ecdf': inv_ecdf, 'lb': ecdf(np.min(data)), 'ub': ecdf(np.max(data))}],
                 [{'distro': distro, 'params': params_optimal}], [xmin_optimal])
    try:
        PLTs_.advance(int(20000 / tau))
        depositedPLTs_opt = int(PLTs_.depositedPLTs() / 0.82)
        MSD_fitting_prms, distFromWalls_prms = PLTs_.meta_quantities()
    except Exception:
        depositedPLTs_opt = 0
        MSD_fitting_prms, distFromWalls_prms = (), ()
    print('deposited PLTs (per uL) : ', depositedPLTs_opt)
    print('MSD non-linear fitting [um^2,ms] : ', MSD_fitting_prms)
    print("Avg Dist Walls linear fitting [um,ms] : ", distFromWalls_prms)

    print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.')
    print('One-Zone Simulation for relaxed model.')
    PLTs_ = PLTs(int(4808 * 0.82), tau, 820.0,
                 [{'inv_ecdf': inv_ecdf, 'lb': ecdf(np.min(data)), 'ub': ecdf(np.max(data))}],
                 [{'distro': distro, 'params': params_relaxed}], [xmin_relaxed])
    try:
        PLTs_.advance(int(20000 / tau))
        depositedPLTs_rel = int(PLTs_.depositedPLTs() / 0.82)
        MSD_fitting_prms, distFromWalls_prms = PLTs_.meta_quantities()
    except Exception:
        depositedPLTs_rel = 0
        MSD_fitting_prms, distFromWalls_prms = (), ()
    print('deposited PLTs (per uL) : ', depositedPLTs_rel)
    print('MSD non-linear fitting [um^2,ms] : ', MSD_fitting_prms)
    print("Avg Dist Walls linear fitting [um,ms] : ", distFromWalls_prms)

    print('------------------------------------------------------------')

print("#######################################################################")
#######################################################################
#######################################################################
if (do_what == 'MFP'):
    avg_ = 0.
    for PLT in Bodies:
        avg_ += (abs(integrals_gT[PLT] - integrals_tau[PLT]) / abs(integrals_gT[PLT])) * 100.
    avg_ /= numBodies
    ground_truth_diff.append(avg_)
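# ---------------------------------------------------------------------------
# Hedged aside (not part of the script above, which scores the relaxed model
# against the optimal one with the Kuiper statistic): for reference, the plain
# Clauset et al. (2009) goodness-of-fit recipe computes a p-value as the
# fraction of synthetic tail samples, drawn from the fitted model and re-fitted,
# whose KS distance exceeds that of the observed tail. A minimal, self-contained
# sketch of that recipe (function name and defaults are illustrative):
import numpy as np
from scipy import stats

def clauset_gof_pvalue(tail_data, distro, params, n_boot=1000, seed=0):
    rng = np.random.default_rng(seed)
    d_obs = stats.kstest(tail_data, distro.name, args=params).statistic
    exceed = 0
    for _ in range(n_boot):
        synth = distro.rvs(*params, size=tail_data.shape[0], random_state=rng)
        synth_params = distro.fit(synth)  # re-fit on every synthetic sample
        d_syn = stats.kstest(synth, distro.name, args=synth_params).statistic
        exceed += int(d_syn >= d_obs)
    return exceed / n_boot

# e.g. clauset_gof_pvalue(data_optimal, stats.pareto, params_optimal)
# ---------------------------------------------------------------------------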
st.probplot(mdf.resid, plot=ax)
plt.show()

fig = plt.figure(figsize=(16, 9))
ax = sns.distplot(mdf.resid, hist=False, kde_kws={"shade": True, "lw": 1}, fit=st.norm)
ax.set_xlabel("Residuals")
plt.show()

labels = ["Statistic", "p-value"]
norm_res = st.shapiro(mdf.resid)
for key, val in dict(zip(labels, norm_res)).items():
    print(key, val)

fig = plt.figure(figsize=(16, 9))
ax = sns.scatterplot(y=mdf.resid, x=mdf.fittedvalues)
ax.set_xlabel("Fitted Values")
ax.set_ylabel("Residuals")
plt.show()

het_white_res = het_white(mdf.resid, mdf.model.exog)
labels = ["LM Statistic", "LM-Test p-value", "F-Statistic", "F-Test p-value"]
for key, val in dict(zip(labels, het_white_res)).items():
    print(key, val)

fig = plt.figure(figsize=(16, 9))
def frequency_increment_test(time, values, clip=True):
    Y = frequency_increment_values(time, values, clip=clip)
    T, Tp = stats.ttest_1samp(Y, 0)
    W, Wp = stats.shapiro(Y)
    return {"T": T, "Tp": Tp, "W": W, "Wp": Wp}
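# Hedged usage sketch (illustrative only; it assumes frequency_increment_values
# is available from the same module as the function above, and the data below
# are made up). "Tp" is the p-value of the one-sample t-test that the increments
# have zero mean, and "Wp" is the Shapiro-Wilk p-value checking their normality.
import numpy as np

time = np.arange(0, 20)                                                  # sampling times
values = 0.5 + np.cumsum(np.random.normal(0.0, 0.01, size=time.size))   # a frequency trajectory
result = frequency_increment_test(time, values)
print(result["Tp"], result["Wp"])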
for i, test_region in enumerate(test_regions):
    print(test_region)
    DATA_region = DATA_TS[(DATA_TS['Region'] == test_region) &
                          (DATA_TS['Year'] < 2011) & (DATA_TS['Year'] > 1970)]
    climateData = np.array(DATA_region['Norm_ImpFix_2y_offset'])
    auto_corr = test_autocorrelation(climateData)
    if normClim is True:
        climateData = climateData / np.nanmax(climateData)
    t, shap_log = shapiro(np.log(climateData))
    t, shap_norm = shapiro(climateData)
    best_model, y, x, maxi, pearson_corr, best_loo, loos, combs = find_best_model(
        climateData, telecon)
    comb_df = pd.DataFrame(combs)
    comb_df = comb_df.T
    loo_df = pd.DataFrame(loos)
    loo_df = loo_df.T
    loo_df.columns = ['log', 'identity', 'inverse-power']
    loo_df['combination'] = comb_df.iloc[:, 0]
    # store LooCV out-of-sample errors
    loo_df.to_csv(
# Analysis for Cluster 1:
for i in cluster1:
    # Maximum flow algorithm:
    flow_value, flow_dict = nx.maximum_flow(G, 0, i, capacity='weight')
    c1_values.append(flow_value)
    df = pd.DataFrame({'Cluster': [1], 'Flow_Value': flow_value})
    all_data = all_data.append(df)

mean = np.mean(c1_values)
std_dev = np.std(c1_values)
normality_test = stats.shapiro(c1_values)
print("Mean for Cluster 1:", mean)
print("Standard deviation for Cluster 1:", std_dev)
print("Normality test for Cluster 1:", normality_test, "\n")

# Histogram for cluster 1:
hist, bin_edges = np.histogram(c1_values, density=True)
first_edge, last_edge = np.min(c1_values), np.max(c1_values)
n_equal_bins = 15
bin_edges = np.linspace(start=first_edge, stop=last_edge, num=n_equal_bins + 1, endpoint=True)
plt.hist(c1_values, bins=bin_edges, rwidth=0.75)
plt.xlabel('Flow values')
plt.ylabel('Frequency')
# # The simplest transformation is Standard Scaling (or Z-score normalization):
#
# $$ \large z = \frac{x - \mu}{\sigma} $$
#
# Note that Standard Scaling does not make the distribution normal in the strict sense.

# In[ ]:

from sklearn.preprocessing import StandardScaler
from scipy.stats import beta
from scipy.stats import shapiro
import numpy as np

data = beta(1, 10).rvs(1000).reshape(-1, 1)
shapiro(data)

# In[ ]:

# Value of the statistic, p-value
shapiro(StandardScaler().fit_transform(data))

# With such a p-value we would have to reject the null hypothesis that the data are normal.
# But, to some extent, standard scaling protects against outliers:

# In[ ]:

data = np.array([1, 1, 0, -1, 2, 1, 2, 3, -2, 4, 100]).reshape(-1, 1).astype(np.float64)
StandardScaler().fit_transform(data)
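# In[ ]:

# Hedged aside (not in the original notebook): a quick check that StandardScaler
# reproduces the z-score formula above; it uses the population standard
# deviation (ddof=0), which is also NumPy's default.
z_manual = (data - data.mean(axis=0)) / data.std(axis=0)
z_scaler = StandardScaler().fit_transform(data)
print(np.allclose(z_manual, z_scaler))  # expected: True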
# print(data.head())
lm = ols(formula='percentual_k_unordered ~ algoritmo', data=data).fit()
anova = sm.stats.anova_lm(lm, typ=2)  # Type 2 ANOVA DataFrame
tit = ' ANOVA para Probabilidade = %s e Tamanho = %s' % (prob, tam)
hr = '=' * 60  # len(tit)
anov = anova.head(10)
s = '%s\n%s\n%s\n%s\n\n' % (hr, tit, hr, anov)
arq_destino.write(s)
print(s)

# write the normality test (Shapiro-Wilk) results
s = ' * TESTE DE NORMALIDADE (SHAPIRO-WILK):\n'
s += ' %s\n' % ('-' * (len(s) + 6))
for alg in udata.ALGORTIMOS:
    d = data[data['algoritmo'] == alg]['percentual_k_unordered']
    W, p_value = stats.shapiro(d)
    s += ' - %s: W = %0.6f / p_value = %.6f \n' % (alg.ljust(9), W, p_value)
s += '\n'
arq_destino.write(s)
print(s)

# close the output file
arq_destino.close()

# pr_f = anova['PR(>F)'].values[0]
# print('%s / %.55f' % (pr_f, pr_f))
def runShapiroTest(data, alpha):
    stats, pValue = shapiro(data)
    print('Statistics: {} | pValue: {} | Is Parametric: {}'.format(stats, pValue, pValue > alpha))
    return pValue > alpha
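# Hedged usage sketch (the sample below is illustrative, not from the source;
# it assumes shapiro is imported in the same module as the function above):
# runShapiroTest returns True when p > alpha, i.e. when the data look Gaussian
# enough to justify a parametric test.
import numpy as np

sample = np.random.normal(loc=10.0, scale=2.0, size=50)
is_parametric = runShapiroTest(sample, alpha=0.05)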
print(data.Age.describe())

# Calculate the z-score of each value
print(st.zscore([0.45, 23, 25, 28, 33, 60, 80]))

# Convert a z-score to a cumulative probability (cdf) and a probability to a z-score (ppf)
print(st.norm.cdf(3.46))
print(st.norm.ppf(.95))
print(st.norm.cdf(1.64))

# Normality test
sm.qqplot(data.Age, line='45')
pylab.show()

data_no_missing = data.dropna()
stat, p = st.shapiro(data_no_missing.Age)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')

# Embarked crosstab
print(pd.crosstab(index=data["Embarked"], columns="Count"))
print(data.Embarked.isnull().sum())

# Embarked barchart
sns.countplot(x="Embarked", data=data)
plt.show()
for A, B in zip(sampleA, sampleB):
    fdata.write('\n' + A + ' is sample A' + '\n' + B + ' is sample B' + '\n')
    fdata.write('RVL comparison' + '\n')
    fdata.write(str(np.mean(RVLdata_[A])) + '+-' + str(np.std(RVLdata_[A])) + '\n')
    fdata.write(str(np.mean(RVLdata_[B])) + '+-' + str(np.std(RVLdata_[B])) + '\n')
    print('means')

    # T-test
    pops = []
    pops.append(RVLdata_[A])
    pops.append(RVLdata_[B])

    # Shapiro's test for normality for sample A
    w, pnormA = stats.shapiro(np.array(RVLdata_[A]))
    if pnormA > 0.05:
        normA = True
        print('A sample IS normally distributed')
        fdata.write('A sample IS normally distributed' + '\n')
    else:
        normA = False
        print('A sample is NOT normally distributed')
        fdata.write('A sample is NOT normally distributed' + '\n')

    # Shapiro's test for normality for sample B
    w, pnormB = stats.shapiro(np.array(RVLdata_[B]))
    if pnormB > 0.05:
        normB = True
        print('B sample IS normally distributed')
        fdata.write('B sample IS normally distributed' + '\n')
    else:
import numpy as np
from scipy import stats
from scipy.stats import shapiro
import matplotlib.pyplot as plt
from pandas import DataFrame

data = np.genfromtxt("data.csv", delimiter=",")
listp = []
listw = []
listf = []
listr = [
    "Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys", "Leu",
    "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val", "Trp", "Tyr"
]
for i in range(0, 20):
    aa = data[1:, i]
    fig = plt.figure()
    res = stats.probplot(aa, plot=plt)
    plt.show()
    w, p = shapiro(aa)
    listw.append(w)
    listp.append(p)
    if p >= 0.05:
        print("normal distribution")
        listf.append("normal distribution")
    else:
        print("non-normal distribution")
        listf.append("non-normal distribution")
    print("w:%f" % w, "p.value:%f" % p)

dic = {"residue index": listr, "W": listw, "P.value": listp, "F": listf}
output = DataFrame(dic)
print(output)
# for col in train_num.columns:
#     train_num[col].plot.hist(title=col)
#     s = train_num.describe()[col].to_string() + \
#         "\nMissing Values: " + str(train_num.isnull().sum()[col]) + \
#         "\nMissing Values %: " + str(round(train_num.isnull().sum()[col]/len(train_num), 4))
#     plt.figtext(1, 0.5, s)
#     plt.show()

droped_ttest_cols = []

# * Assess normality / skewness
target = train_temp[label]
t_sel = [0] * len(train_num.columns)  # flags which variables may help predict the target
t_ctr = 0  # counter
for col in train_num.columns:
    # Shapiro-Wilk test
    stat, p = shapiro(train_num[col])
    # print('Statistics={:.3f}, p={:.3f}'.format(stat, p))
    if p > 0.05:
        # H0 (the data are approximately Gaussian) is not rejected, so run a t-test
        # print(col)
        # split the data by credit acceptance
        t0 = train_num[col][target == 0]
        t1 = train_num[col][target == 1]
        stat, p = ttest_ind(t0, t1, nan_policy="omit", equal_var=False)
        # print('T-statistic={:.3f}, p={:.3f}'.format(stat, p))
        if p < 0.05:
            # H0 (the means of t0 and t1 do not differ significantly) is rejected
            t_sel[t_ctr] = 1
        else:
            droped_ttest_cols.append(col)
plt.figure()
plt.plot(x, gNorm, 'r-', label='Norm PDF')
plt.plot(x, sNorm, 'g-', label='Skewed Norm PDF')
# plt.plot(x, chiSq, 'm-', label='Chi-Square PDF')
plt.bar(bin_edges[:-1], hist, width=(max(bin_edges) - min(bin_edges)) / iters)
plt.title('NRMSE distribution')
plt.xlim(np.min(x), np.max(x))
plt.legend()

# Print normality test results (statistic and p-value)
# Kolmogorov-Smirnov
print(stats.kstest(y, 'norm', args=(mean, var)))
print(stats.kstest(y, 'skewnorm', args=(smean, svar, sk)))
# Shapiro-Wilk
print(stats.shapiro(y))
# Chi-Square
# print(stats.chisquare(hist, gNorm))
# print(stats.chisquare(hist, sNorm))

# Q-Q plots
# f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
f, (ax1, ax2) = plt.subplots(1, 2)
plt.title("Q-Q Plots")
res = stats.probplot(y, dist=stats.norm(mean, var), plot=ax1)
ax1.set_title("Normality Test (Non-Skewed)")
resS = stats.probplot(y, dist=stats.skewnorm(smean, svar, sk), plot=ax2)
ax2.set_title("Normality Test (Skewed)")
# resX2 = stats.probplot(y, dist=stats.chi2(4), plot=ax3)
# ax3.set_title("Chi-Square Test (k=4)")
# resX2 = stats.probplot(y, dist=stats.chi2(10), plot=ax4)
# ax4.set_title("Chi-Square Test (k=10)")
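# Hedged aside (not part of the original script): SciPy's frozen distributions
# and kstest take skew-normal parameters in (shape a, loc, scale) order, so one
# common way to obtain them for the checks above is to fit them directly from
# the sample. A minimal sketch, assuming `y` holds the NRMSE sample used above:
a_hat, loc_hat, scale_hat = stats.skewnorm.fit(y)  # shape, location, scale
print(stats.kstest(y, 'skewnorm', args=(a_hat, loc_hat, scale_hat)))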
# bins = 10
print("Bins No (Sturge’s Rule): ", bins)
plt.hist(scores, bins=bins)
plt.ylabel('Probability')
plt.xlabel("Accuracy")
plt.title("Accuracy of " + model_label + " with CV=" + str(cv))
plt.show()

qqplot(scores, line='s')
plt.title("Accuracy of " + model_label + " with CV=" + str(cv))
plt.show()

alpha = 0.05

print("Shapiro-Wilk Test result:")
stat, p = shapiro(scores)
print(' Statistics=%.3f, p=%.3f, alpha=%.3f' % (stat, p, alpha))
if p > alpha:
    print(' Sample looks Gaussian (fail to reject H0)')
else:
    print(' Sample does not look Gaussian (reject H0)')

print("D’Agostino’s K^2 Test result:")
stat, p = normaltest(scores)
print(' Statistics=%.3f, p=%.3f, alpha=%.3f' % (stat, p, alpha))
if p > alpha:
    print(' Sample looks Gaussian (fail to reject H0)')
else:
    print(' Sample does not look Gaussian (reject H0)')

print("Anderson-Darling Test result:")
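# Hedged aside (an assumption about how `bins` was computed above, which the
# snippet does not show): Sturges' rule sets the bin count to 1 + ceil(log2(n)).
# A minimal sketch of that computation:
import numpy as np

bins_sturges = int(np.ceil(np.log2(len(scores)))) + 1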
# Example of the Shapiro-Wilk Normality Test
from scipy.stats import shapiro

data = [
    13.83, 14.47, 14.03, 15.46, 15.61, 13.6, 15.26, 14.13, 14.41, 13.7, 14.23,
    14.49, 14.0, 13.73, 13.92, 13.82, 13.81, 13.88, 13.71, 14.08, 14.1, 13.38,
    13.69, 13.56, 13.57, 13.63, 13.59, 13.64, 13.97, 13.29, 13.72
]
stat, p = shapiro(data)
print('stat=%.3f, p=%.3f' % (stat, p))

f = open("NormalTestShapiro-Wilk.txt", "a")
if p > 0.05:
    print('Probably Gaussian')
    f.write('Probably Gaussian\n')
else:
    print('Probably not Gaussian')
    f.write('Probably not Gaussian\n')
f.write("Stat: {0} and p: {1}\n".format(stat, p))
f.close()
def get_statistic_and_pvalue(self, y):
    return shapiro(y)