def _get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0):
    '''
    A function which takes two lists of values of equal length with corresponding entries and returns a dict containing a variety of metrics.
    :param x_values: A list of values for the X-axis (experimental values).
    :param y_values: A list of values for the Y-axis (predicted values).
    :param fcorrect_x_cutoff: See get_xy_dataset_statistics.
    :param fcorrect_y_cutoff: See get_xy_dataset_statistics.
    :param x_fuzzy_range: See get_xy_dataset_statistics.
    :param y_scalar: See get_xy_dataset_statistics.
    :return: A table of statistics.
    '''
    from scipy.stats import pearsonr, spearmanr, normaltest, ks_2samp, kstest, norm
    assert(len(x_values) == len(y_values))
    return dict(
        pearsonr = pearsonr(x_values, y_values),
        spearmanr = spearmanr(x_values, y_values),
        gamma_CC = gamma_CC(x_values, y_values),
        MAE = mae(x_values, y_values),
        normaltestx = normaltest(x_values),
        normaltesty = normaltest(y_values),
        kstestx = kstest(x_values, 'norm'),
        kstesty = kstest(y_values, 'norm'),
        ks_2samp = ks_2samp(x_values, y_values),
        fraction_correct = fraction_correct(x_values, y_values, x_cutoff = fcorrect_x_cutoff, y_cutoff = fcorrect_y_cutoff),
        fraction_correct_fuzzy_linear = fraction_correct_fuzzy_linear(x_values, y_values, x_cutoff = fcorrect_x_cutoff, x_fuzzy_range = x_fuzzy_range, y_scalar = y_scalar),
    )
def motifStats(data,motifSize,degree, usetotal=False): for corr in ('corr','lcorr','lacorr'): motifsNL = findMotifs(data,('NL',corr), motifSize, degree, usetotal) motifsMCI = findMotifs(data,('MCI',corr), motifSize, degree, usetotal) motifsAD = findMotifs(data,('AD',corr), motifSize, degree, usetotal) allMotifs = list(set(motifsNL.keys()) | set(motifsAD.keys()) | set(motifsMCI.keys())) datatype = "Total" if usetotal else "Percent" filename = "result2/{}_ks-stats_size-{}_deg-{}.txt".format(corr+datatype,motifSize,degree) with open(filename,'w') as f: f.write("{0:>10}{1:>15}{2:>15}{3:>15}{4:>15}{5:>15}\n".format('ID','MCI','AD','NORM NL','NORM MCI','NORM AD')) for key in allMotifs: NLdata = motifsNL.get(key,np.zeros(88)) MCIdata = motifsMCI.get(key,np.zeros(88)) ADdata = motifsAD.get(key,np.zeros(88)) KSstatistic, MCIpvalue = stats.ks_2samp(MCIdata,NLdata) KSstatistic, ADpvalue = stats.ks_2samp(ADdata,NLdata) k2,NLnorm = stats.normaltest(NLdata) k2,MCInorm = stats.normaltest(MCIdata) k2,ADnorm = stats.normaltest(ADdata) if MCIpvalue<0.01 or ADpvalue<0.01: line = "*{0:>9}{1:15.3}{2:15.3}{3:15.3}{4:15.3}{5:15.3}\n" else: line = "{0:>10}{1:15.3}{2:15.3}{3:15.3}{4:15.3}{5:15.3}\n" f.write(line.format(str(int(key)),MCIpvalue,ADpvalue,NLnorm,MCInorm,ADnorm))
def check_normality():
    '''Check if the distribution is normal.'''

    # Generate and show a distribution
    numData = 100

    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)

    data = stats.norm.rvs(myMean, mySD, size=numData)
    plt.hist(data)
    plt.show()

    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    # The scipy normaltest is based on D'Agostino and Pearson's test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    stats.normaltest(data)

    # Or you can check for normality with the Kolmogorov-Smirnov test
    _, pVal = stats.kstest((data - np.mean(data)) / np.std(data, ddof=1), 'norm')
    if pVal > 0.05:
        print('Data are normally distributed')

    return pVal
def check_normality():
    '''Check if the distribution is normal.'''
    # Are the data normally distributed?
    numData = 100
    data = stats.norm.rvs(myMean, mySD, size=numData)
    stats.normaltest(data)
    _ = stats.probplot(data, plot=plt)
    show()
def TestNormality(self, array):
    array = np.array(array)
    # normaltest returns (statistic, p-value); the decision must be based on
    # the test's p-value, not on an element of the data array.
    statistic, pvalue = stats.normaltest(array)
    print statistic, pvalue
    if pvalue < 0.2:
        print "Unlikely to be normally distributed"
        return False
    else:
        print "The dataset is likely to be normally distributed"
        return True
def test_convert_uniform_column_to_normal(miner_df):
    logcf = lambda row, x: norm.logpdf(x[0], 0, 1)
    miner = MInER(miner_df, logcf, ['x_2'], n_models=2, use_mp=False)
    miner.init_models()
    miner.fit(20, 10)

    assert(not np.any(np.isnan(miner._df['x_2'].values)))
    assert(not np.any(np.isnan(miner._df['x_3'].values)))

    assert(normaltest(miner._df['x_2'])[1] > .05)
    assert(normaltest(miner._df['x_3'])[1] < .05)
def determine_significance(mesa1, mesa2): """ Determines if two sets of values are statistically significant. In the best case, we can determine a normal distribution, and equal variance. Once determined we can use the independent t-test function if the values are of equal variance. If we have normal data, but the variance is unequal, the welch t-test is used. http://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test http://en.wikipedia.org/wiki/Student%27s_t-test#Equal_or_unequal_sample_sizes.2C_unequal_variances In the case where we cannot determine normality the mann-whitney u-test is desired to be used, but this test is only effective when there are greater than 20 samples. http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test """ # FIXME: Is it possible to determine these things with fewer samples? Distribution = Enum('Distribution', 'Normal, Non_normal Unknown') normality = Distribution.Normal try: k2, normal = stats.normaltest(mesa1) # FIXME: Unhardcode if (normal < NORMAL_CI): normality = Distribution.Non_normal k2, normal = stats.normaltest(mesa2) if (normal < NORMAL_CI): normality = Distribution.Non_normal except ValueError: normality = Distribution.Unknown equal_variance = is_equal_variance(mesa1, mesa2) if args.ttest: t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance) return (p, normality == Distribution.Normal, "t-test" if equal_variance else "Welch's") elif args.mannwhitney: u, p = stats.mannwhitneyu(mesa1, mesa2) p *= 2 # We want a 2-tailed p-value return (p, len(mesa1) < 20 or len(mesa2) < 20, "Mann-Whitney") if normality == Distribution.Normal: error_handler='raise' if np.var(mesa1) == 0 and equal_variance: error_handler='ignore' with np.errstate(divide=error_handler): t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance) return (p, False, "t-test" if equal_variance else "Welch's") else: u, p = stats.mannwhitneyu(mesa1, mesa2) p *= 2 # We want a 2-tailed p-value flawed = len(mesa1) < 20 or len(mesa2) < 20 return (p, flawed, "Mann-Whitney")
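# A minimal, self-contained sketch of the same test-selection flow using only
# scipy.stats. The 0.05 cutoff, the Levene variance check, and the synthetic
# samples below are illustrative assumptions, not values taken from the
# function above (which uses its own NORMAL_CI constant and argument flags).
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
a = rng.normal(10.0, 2.0, size=40)
b = rng.normal(11.0, 4.0, size=40)

_, p_norm_a = stats.normaltest(a)
_, p_norm_b = stats.normaltest(b)
_, p_var = stats.levene(a, b)

if p_norm_a > 0.05 and p_norm_b > 0.05:
    # Both samples look normal: Student's t if variances agree, else Welch's t.
    _, p = stats.ttest_ind(a, b, equal_var=(p_var > 0.05))
    test_name = "t-test" if p_var > 0.05 else "Welch's t-test"
else:
    # Fall back to the non-parametric Mann-Whitney U test (two-sided).
    _, p = stats.mannwhitneyu(a, b, alternative='two-sided')
    test_name = "Mann-Whitney U"

print(test_name, p)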
def check_normality(): '''Check if the distribution is normal.''' # Set the parameters numData = 1000 myMean = 0 mySD = 3 # To get reproducable values, I provide a seed value np.random.seed(1234) # Generate and show random data data = stats.norm.rvs(myMean, mySD, size=numData) fewData = data[:100] plt.hist(data) plt.show() # --- >>> START stats <<< --- # Graphical test: if the data lie on a line, they are pretty much # normally distributed _ = stats.probplot(data, plot=plt) plt.show() pVals = pd.Series() pFewVals = pd.Series() # The scipy normaltest is based on D-Agostino and Pearsons test that # combines skew and kurtosis to produce an omnibus test of normality. _, pVals['Omnibus'] = stats.normaltest(data) _, pFewVals['Omnibus'] = stats.normaltest(fewData) # Shapiro-Wilk test _, pVals['Shapiro-Wilk'] = stats.shapiro(data) _, pFewVals['Shapiro-Wilk'] = stats.shapiro(fewData) # Or you can check for normality with Lilliefors-test _, pVals['Lilliefors'] = lillifors(data) _, pFewVals['Lilliefors'] = lillifors(fewData) # Alternatively with original Kolmogorov-Smirnov test _, pVals['Kolmogorov-Smirnov'] = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm') _, pFewVals['Kolmogorov-Smirnov'] = stats.kstest((fewData-np.mean(fewData))/np.std(fewData,ddof=1), 'norm') print('p-values for all {0} data points: ----------------'.format(len(data))) print(pVals) print('p-values for the first 100 data points: ----------------') print(pFewVals) if pVals['Omnibus'] > 0.05: print('Data are normally distributed') # --- >>> STOP stats <<< --- return pVals['Kolmogorov-Smirnov']
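# Instead of standardizing the data by hand before the Kolmogorov-Smirnov test,
# scipy.stats.kstest accepts the distribution parameters via `args`; a small
# sketch with a synthetic sample (the seed and sample size are assumptions for
# illustration). Note that plugging in parameters estimated from the same
# sample makes the KS p-value optimistic, which is exactly what the Lilliefors
# test used above corrects for.
import numpy as np
from scipy import stats

np.random.seed(1234)
data = stats.norm.rvs(0, 3, size=1000)

# Equivalent to z-scoring the data and testing against the standard normal:
_, p = stats.kstest(data, 'norm', args=(np.mean(data), np.std(data, ddof=1)))
print(p)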
def normalityTests(data): arr = N.zeros((data.shape[1]+1,data.shape[0]+1),N.object) mergeCTOA = concatenateRT(data.copy(), axis=0) mergeCTD = concatenateRT(data.copy(), axis=1) for i in range(data.shape[0]): for j in range(data.shape[1]): arr[j,i] = normaltest(data[i,j]) for i, grp in enumerate(mergeCTOA): arr[-1,i] = normaltest(grp) for i, grp in enumerate(mergeCTD): arr[i,-1] = normaltest(grp) arr[-1,-1] = normaltest(N.hstack(data.flatten())) return arr
def arima_handler(dta, start, end): #dta, x = data.dataHandler('./tmpfile00431',0.5) dta = pd.TimeSeries(dta) #dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700','2060')) dta.index = pd.Index(sm.tsa.datetools.dates_from_range(start,end)) dta.plot(figsize=(12,8)) fig = plt.figure(figsize=(12,8)) ax1 = fig.add_subplot(211) fig = sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40, ax=ax1) ax2 = fig.add_subplot(212) fig = sm.graphics.tsa.plot_pacf(dta, lags=40, ax=ax2) arma_mod20 = sm.tsa.ARMA(dta, (2,0)).fit() #print(arma_mod20) arma_mod30 = sm.tsa.ARMA(dta, (3,0)).fit() #print(arma_mod30) print(arma_mod20.aic, arma_mod20.bic, arma_mod20.hqic) print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic) if arma_mod20.aic < arma_mod30.aic: sm.stats.durbin_watson(arma_mod20.resid.values) fig = plt.figure(figsize=(12,8)) ax = fig.add_subplot(111) ax = arma_mod20.resid.plot(ax=ax); resid = arma_mod20.resid stats.normaltest(resid) fig = plt.figure(figsize=(12,8)) ax = fig.add_subplot(111) fig = qqplot(resid, line='q', ax=ax, fit=True) fig = plt.figure(figsize=(12,8)) ax1 = fig.add_subplot(211) fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1) ax2 = fig.add_subplot(212) fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2) r,q,p = sm.tsa.acf(resid.values.squeeze(), qstat=True) data = np.c_[range(1,41), r[1:], q, p] #table = pandas.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"]) table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"]) #print(table.set_index('lag')) predict_sunspots = arma_mod20.predict(str(string.atoi(start)+360),str(string.atoi(end)+5), dynamic=True) #print(predict_sunspots) return predict_sunspots
def calc_correlation(fname): reader = csv.reader(open(fname,"rb"),delimiter='\t') next(reader) x = list(reader) data = np.array(x).astype('float') normal_a = stats.normaltest(data[:,0])[1] normal_b = stats.normaltest(data[:,1])[1] if (normal_a >= 0.05) & (normal_b >= 0.05): # both series are normally distributed return stats.pearsonr(data[:,0], data[:,1])[0] else: # not normally distributed return stats.spearmanr(data[:,0], data[:,1])[0]
def inspect_output_by_filter(self,rez,dat,doplot=False,test=False, sig_clips=[5, 3, 2], sig_test=[False,False,True]): p = rez.values()[0][1] myoutput = rez.values()[0][0] new = rez.values()[0][2] filt = rez.keys()[0] ret = {} ret.update({"all": self._extract_info(p,myoutput.sd_beta,myoutput)}) err = dat[2] tmp = (dat[1] - self.modelfunc_small_te(p,dat[0]))/err dof = tmp.shape[0] - myoutput.beta.shape[0] chisq = (tmp**2).sum() ret['all'].update({"ndata": dat[0].shape[0], \ "chisq": chisq, "dof": dof, "p_chi": chisqprob(chisq,dof), "normalcy_prob": normaltest(tmp)[1]}) for s in enumerate(sig_clips): if sig_test[s[0]] and not test: continue sig = s[1] # get the indices of those inside and out of the clip area tmpisig = (abs(tmp) < sig).nonzero()[0] tmpisige = (abs(tmp) > sig).nonzero()[0] frac_less_than_sig = float(tmpisig.shape[0])/dat[0].shape[0] # print frac_less_than_sig if frac_less_than_sig < 1.0: out = self._filt_run([dat[0][tmpisig],dat[1][tmpisig],err[tmpisig]],\ filt,do_sim=False,vplot=False) p = out[1] myoutput = out[0] t = "-test" if sig_test[s[0]] else "" ret.update({"sig" + str(sig) + t: self._extract_info(p,myoutput.sd_beta,myoutput)}) tmp = (dat[1][tmpisig] - self.modelfunc_small_te(p,dat[0][tmpisig]))/err[tmpisig] dof = tmp.shape[0] - myoutput.beta.shape[0] chisq = (tmp**2).sum() try: ntest = normaltest(tmp)[1] except: ntest = 0.0 ret["sig" + str(sig) + t].update({"ndata": dat[0][tmpisig].shape[0], \ "chisq": chisq, "dof": dof, "p_chi": chisqprob(chisq,dof), "normalcy_prob": ntest, "frac_data_remaining": frac_less_than_sig }) if doplot: plot(dat[0][tmpisige],dat[1][tmpisige],".") return ret
def test_maskedarray_input(self):
    # Add some masked values, test result doesn't change
    x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2
    xm = np.ma.array(np.r_[np.inf, x, 10],
                     mask=np.r_[True, [False] * x.size, True])
    assert_allclose(mstats.normaltest(xm), stats.normaltest(x))
    assert_allclose(mstats.skewtest(xm), stats.skewtest(x))
    assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
def replace_outs(df, numOuts, df_outs_ind): """ This has been replaced with "replace_outs2" """ df_out = df.copy() out_row_inds, out_col_inds = np.random.randint(0, len(df_outs_ind.index), numOuts), \ np.random.randint(0, len(df_outs_ind.columns), numOuts) for row, col in zip(out_row_inds, out_col_inds): array_col = df.iloc[:, col].dropna() z_score, p_val = stats.normaltest(array_col) if p_val > 0.05: # this means the distribution is normal eps = 0.002 * np.random.random_sample(1) - 0.001 # epsilon is a random float in [-0.001, 0.001] # *** this threshold should be set in experiments df_out.iloc[row, col] = 3 * df.iloc[:, col].std() + eps # print("for row {0} and column {1} we have {2} and real val is {3}".format(row, col, df_out.iloc[row, col], df_in.iloc[row, col])) df_outs_ind.iloc[row, col] = 1 else: q1, q3, iqr = tukey_vals(array_col) tukeyHL = [array_col.mean() + q3 + (3 * iqr), array_col.mean() - q1 - (3 * iqr)] df_out.iloc[row, col] = rnd.sample(tukeyHL, 1) df_outs_ind.iloc[row, col] = 1 return df_out, df_outs_ind
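# `tukey_vals` is not defined in the snippet above; a plausible minimal version
# returning the first quartile, third quartile, and interquartile range might
# look like this (an assumption for illustration, not the original helper):
import numpy as np

def tukey_vals(values):
    # Quartiles and IQR used to build Tukey's outer fences (Q1 - 3*IQR, Q3 + 3*IQR).
    q1, q3 = np.percentile(values, [25, 75])
    return q1, q3, q3 - q1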
def check_normality():
    '''Check if the distribution is normal.'''

    # Generate and show a distribution
    numData = 100

    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)

    data = stats.norm.rvs(myMean, mySD, size=numData)
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    pVals = pd.Series()
    # The scipy normaltest is based on D'Agostino and Pearson's test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['omnibus'] = stats.normaltest(data)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk'] = stats.shapiro(data)

    # Or you can check for normality with the Lilliefors test
    ksStats, pVals['Lilliefors'] = kstest_normal(data)

    # Alternatively with the original Kolmogorov-Smirnov test
    _, pVals['KS'] = stats.kstest((data - np.mean(data)) / np.std(data, ddof=1), 'norm')

    print(pVals)
    if pVals['omnibus'] > 0.05:
        print('Data are normally distributed')
def gStats(self, missingValue=0.0): """dict of {geneID: (min,max,mean,median,std,stderr, Shapiro-Wilk(w,p),normaltest_chisq (D'Agostino and Pearson),...} """ import scipy as S import scipy.stats as SS rv = {} for k, v in self.items(): # print k,v va = S.array(self.gValues(k, missingValue)) try: normaltest = SS.normaltest(va) except: normaltest = None try: shapiro = SS.shapiro(va) except: shapiro = None try: rv[k] = (va.min(), va.max(), va.mean(), SS.median(va), SS.std(va), SS.stderr(va), normaltest, shapiro) except: print k, va raise return rv
def get_data(column, np_values, alpha):

    mvs = bayes_mvs(np_values, alpha)

    # report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", np_values.min()),
        present("Max", np_values.max()),
        present("Mid-Range", (np_values.max() - np_values.min()) / 2),
        present("Range", np_values.max() - np_values.min()),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode", stats.mode(np_values)[0][0]),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        present("Midhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value", normaltest(np_values)[1])
    ]
    return output
def main(argv=sys.argv):
    route = Route()
    route.trace(argv[1])
    drtts = []

    # Normality test #
    for ttl, hop in route.hops.items():
        drtts.append(hop.deltaRTTi)

    normal = stats.normaltest(drtts)
    print("** NormalTest **")
    print("k2: ", normal[0], " p-value: ", normal[1])

    # Grubbs' test #
    zscores = calculateZScore(drtts)
    N = len(drtts)
    sampleMean = calculateAverage(drtts)
    standarDeviation = calculateStandardDeviation(drtts)

    # Test statistic
    G = (max(drtts) - sampleMean) / standarDeviation
    criticalValue = tDistribution[N]

    print("** GrubbsTest **")
    print("N: ", N)
    print("G: ", G)
    print("CriticalValue: ", criticalValue)

    if criticalValue != None and G > criticalValue:
        print("The DeltaRTT ", max(drtts), " is the transatlantic link")
def oneGroup(): '''Test of mean value of a single set of data''' print('Single group of data =========================================') # First get the data data = np.array([5260, 5470, 5640, 6180, 6390, 6515, 6805, 7515, 7515, 8230, 8770], dtype=np.float) checkValue = 7725 # value to compare the data to # 4.1.1. Normality test # We don't need the first parameter, so we just assign the output to the dummy variable "_" (_, p) = stats.normaltest(data) if p > 0.05: print('Data are distributed normally, p = {0}'.format(p)) # 4.1.2. Do the onesample t-test t, prob = stats.ttest_1samp(data, checkValue) if prob < 0.05: print('With the one-sample t-test, {0:4.2f} is significantly different from the mean (p={1:5.3f}).'.\ format(checkValue, prob)) else: print('No difference from reference value with onesample t-test.') # 4.1.3. This implementation of the Wilcoxon test checks for the "difference" of one vector of data from zero (_,p) = stats.wilcoxon(data-checkValue) if p < 0.05: print('With the Wilcoxon test, {0:4.2f} is significantly different from the mean (p={1:5.3f}).'.\ format(checkValue, p)) else: print('No difference from reference value with Wilcoxon rank sum test.')
def omni_normtest(resids, axis=0): """ Omnibus test for normality Parameters ---------- resid : array-like axis : int, optional Default is 0 Returns ------- Chi^2 score, two-tail probability """ # TODO: change to exception in summary branch and catch in summary() # behavior changed between scipy 0.9 and 0.10 resids = np.asarray(resids) n = resids.shape[axis] if n < 8: from warnings import warn warn("omni_normtest is not valid with less than 8 observations; %i " "samples were given." % int(n), ValueWarning) return np.nan, np.nan return stats.normaltest(resids, axis=axis)
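# Example use of omni_normtest on synthetic residuals; the values below are
# illustrative only and assume the function above is importable with numpy
# available as `np` and scipy.stats as `stats` in its module.
import numpy as np

rng = np.random.default_rng(42)
resid = rng.normal(0.0, 1.0, size=200)
k2, p = omni_normtest(resid)
print('Omnibus K^2 = {:.3f}, p = {:.3f}'.format(k2, p))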
def test_disk_distribution(diskclass, diskpar, n_expected):
    '''This is a separate test from test_disk_radius, because it's simpler to
    write if we don't have to worry about the inner hole.

    For the test itself: The results should be Poisson distributed (or, for
    large numbers, almost normal). That makes testing it a little awkward in a
    short run time, thus the limits are fairly loose.

    This test is run for several extended sources, incl. Gaussian. Strictly
    speaking it should fail for a Gaussian distribution, but if the sigma is
    large enough it will pass a loose test (and still fail if things go
    catastrophically wrong, e.g. some test circles are outside the source).
    '''
    s = diskclass(coords=SkyCoord(213., -10., unit=u.deg), **diskpar)
    photons = s.generate_photons(1e5)

    n = np.empty(20)
    for i in range(len(n)):
        circ = SkyCoord((213. + np.random.uniform(-0.1, .1)) * u.degree,
                        (-10. + np.random.uniform(-0.1, .1)) * u.degree)
        d = circ.separation(SkyCoord(photons['ra'], photons['dec'], unit='deg'))
        n[i] = (d < 5. * u.arcmin).sum()

    s, p = normaltest(n)
    # assert a p value here that is so small that it's never going to be hit
    # by chance.
    assert p > .05
    # better: Test number of expected photons matches
    # Allow large variation so that this is not triggered by chance
    assert np.isclose(n.mean(), n_expected, rtol=.2)
def gof(self, x, y, ye): ''' Computes GoF test statistics and other diagnostical tests Returns: -------- - GoF test: Chi^2, p-value, and ddof - Normality of residuals: K^2 and p-value ''' res = {} resid = y - self(x) chisq = np.sum(((resid) / ye) ** 2) ddof = len(x) - len(filter(None, self.errors())) # number of estimated parameters chisq_pvalue = chisqprob(chisq, ddof) gof = (chisq, chisq_pvalue, ddof) resid = normaltest(resid) ym = y.mean() SStot = np.sum((y - ym) ** 2) SSerr = np.sum((y - self(x)) ** 2) Rsquared = 1.0 - SSerr / SStot # Besides being buggy, this test for homoscedasticity is supposed to work only # for linear regressions, hence is not suited for our case, but I'll keep it # here until I figure out an alternative. Remember to uncomment the import for # OLS ontop. # regresults = OLS(resid ** 2, np.c_[x, x**2]).fit() # LM =regresults.rsquared # LM_pvalue = chisqprob(LM, len(x) - ddof) # white = (LM, LM_pvalue) # return gof, resid, white return gof, resid, Rsquared
def pairedt(pairs, numSamples): results = dict() t,v = pairs.items() diffs = [t[1][x] - v[1][x] for x in range(len(t[1]))] plotDiffs(diffs) sampleSize = int(len(diffs)/numSamples) indices = range(len(diffs)) random.shuffle(indices) mean_diffs = [] i = 0 for sample in range(numSamples): total_diff = 0 for x in range(sampleSize): index = indices[i] total_diff += diffs[index] i+=1 sample_avg = total_diff/float(sampleSize) mean_diffs.append(sample_avg) #normality check nt = stats.normaltest(mean_diffs) results['normal_p'] = format(round(nt[1],4)) #ttest t_prob = stats.ttest_1samp(mean_diffs, 0) results['ttest_t'] = format(round(t_prob[0],4)) results['ttest_p'] = format(round(t_prob[1],4)) #other stats results['avg_diff'] = format(round(np.mean(diffs),4)) results['numSamples'] = numSamples results['sampleSize'] = sampleSize results['num_pairs'] = len(pairs['tor']) return results
def pearson_or_shapiro(data):
    """pearson_or_shapiro
    Use D'Agostino/Pearson if possible (n >= 20), else Shapiro

    :param data:
    """
    return stats.normaltest(data) if len(data) >= 20 else stats.shapiro(data)
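# Example usage on synthetic data; assumes scipy.stats is imported as `stats`
# in the module defining pearson_or_shapiro. Both branches return a
# (statistic, p-value) tuple.
import numpy as np

small = np.random.randn(10)   # n < 20  -> Shapiro-Wilk
large = np.random.randn(100)  # n >= 20 -> D'Agostino-Pearson
print(pearson_or_shapiro(small))
print(pearson_or_shapiro(large))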
def normality_check(feature_group,output_path): if feature_group.isEmpty(): return False normal_flag = True sk_test = stats.skewtest(feature_group.get_scores()) kr_test = stats.kurtosistest(feature_group.get_scores()) normaltest = stats.normaltest(feature_group.get_scores()) temp = ''' Normality Test P-Values ------------------------------------ Kurtosis | {0} Skewness | {1} NormalTest | {2} ''' result = temp.format(kr_test[1],sk_test[1],normaltest[1]) print result tests = (sk_test[1] > 0.05 ,kr_test[1] > 0.05 ,normaltest[1] > 0.05) return tests
def testNormalByWord( word, version ): # Let's find the data - first the word for i in range( len(data) ): if data[i][0][1][0][1] == word: thisWord = i elif data[i][1][1][0][1] == word: thisWord = i # Now the version # Get the distribution if data[thisWord][0][0][1] == version: numbers = data[thisWord][0][1][1] elif data[thisWord][1][0][1] == version: numbers = data[thisWord][1][1][1] # Use scipy to check normality ( chi, p ) = stats.normaltest( numbers ) print "Chi-squared: " + str( chi ) print "P-value: " + str( p ) if p < 0.05: print "Not normal with alpha 0.05" else: print "Normal with alpha = 0.05"
def omni_normtest(resids, axis=0):
    """
    Omnibus test for normality

    Parameters
    ----------
    resid : array-like
    axis : int, optional
        Default is 0

    Returns
    -------
    Chi^2 score, two-tail probability
    """
    # TODO: change to exception in summary branch and catch in summary()
    # behavior changed between scipy 0.9 and 0.10
    resids = np.asarray(resids)
    n = resids.shape[axis]
    if n < 8:
        # Too few observations for the skew/kurtosis-based omnibus test.
        return np.nan, np.nan

    return stats.normaltest(resids, axis=axis)
def fillMissing1(df, dataType):
    '''
    Args:
        df (2d array/Dict): e.g. ('attribute1': [12, 24, 25], 'attribute2': ['good', 'bad'])
        dataTypes (dict): Dictionary of attribute names of df as keys and values 0/1
            indicating categorical/continuous variable
            e.g. ('attribute1': 1, 'attribute2': 0)
    Returns:
        writes a file with missing values replaced.
    '''
    dataLabels = list(df.columns.values)
    for eachlabel in dataLabels:
        if dataType[eachlabel] == 1:
            # check if the data is normal
            _, pval = stats.normaltest(df[eachlabel])
            if pval < 0.5:
                # if the data is not normal, use the median of the group to replace the missing values
                df[eachlabel] = df.groupby('class')[eachlabel].transform(lambda x: x.fillna(x.median()))
            else:
                # if the data is normal, use the mean of the group to replace the missing values
                df[eachlabel] = df.groupby('class')[eachlabel].transform(lambda x: x.fillna(x.mean()))
        else:
            # for categorical data use the mode (the most frequent value) to replace the missing values
            df[eachlabel] = df.groupby('class')[eachlabel].transform(lambda x: x.fillna(x.mode()[0]))
    df.to_csv(Globals.MISSING_REPLACED_FILE)
    return df, Globals.MISSING_REPLACED_FILE
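# A toy illustration of the groupby/transform fill used above; the column and
# class names here are made up for the example and are not from the original data.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'class': ['a', 'a', 'a', 'b', 'b', 'b'],
                    'x': [1.0, np.nan, 3.0, 10.0, 20.0, np.nan]})
# Each missing value is replaced by the median of the row's own class group.
toy['x'] = toy.groupby('class')['x'].transform(lambda s: s.fillna(s.median()))
print(toy)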
def normal_test(features, **_):
    """
    D'Agostino-Pearson normality test on the feature values.

    :param features: array-like of sample values
    :param _: unused keyword arguments
    :return: (statistic, p-value) from scipy.stats.normaltest
    """
    return stats.normaltest(features)
def test_normaltest(self):
    for n in self.get_n():
        if n > 8:
            x, y, xm, ym = self.generate_xy_sample(n)
            r = stats.normaltest(x)
            rm = stats.mstats.normaltest(xm)
            assert_almost_equal(r[0], rm[0], 10)
            assert_almost_equal(r[1], rm[1], 10)
def perform_tests(weekends, weekdays):
    weekends_normaltest = stats.normaltest(weekends['comment_count'])
    weekdays_normaltest = stats.normaltest(weekdays['comment_count'])
    levene = stats.levene(weekends['comment_count'], weekdays['comment_count'])
    ttest = stats.ttest_ind(weekends['comment_count'], weekdays['comment_count'])
    return weekends_normaltest, weekdays_normaltest, levene, ttest
def omni(self):
    """
    Omnibus test for normality
    """
    return stats.normaltest(self.e)
sol.components_                # to get the answer, so the value of each weight in the components
sol.explained_variance_        # the explained variance
sol.explained_variance_ratio_  # the explained variance in %

# A bit of classical statistics:
from scipy import stats

X1 = np.random.random((20))    # let's create 2 variables with 20 observations each
X2 = np.random.random((20))

X1_stand = stats.zscore(X1)    # another way to standardize
X2_stand = stats.zscore(X2)

stats.sem(X1)                  # standard error of the mean
stats.normaltest(X2)           # test for normality
stats.chisquare([12, 14, 16, 18, 10, 10])  # chi-square (each entry represents a category and how many times it appears)
stats.rankdata(X1)             # rank the data, useful for non-parametric tests

stats.ttest_ind(X1, X2)        # independent t test
stats.ttest_rel(X1, X2)        # dependent t test
stats.mannwhitneyu(X1, X2)     # Mann-Whitney U test (non-parametric)
stats.wilcoxon(X1, X2)         # Wilcoxon test (non-parametric)
stats.spearmanr(X1, X2)        # Spearman correlation
stats.linregress(X1, X2)       # simple linear regression
# -*- coding: utf-8 -*- """ Created on Sun Dec 21 20:36:32 2014 @author: JN """ import pandas as pd import statsmodels.api as sm import pylab as pl import scipy.stats as stats Raw_data = pd.read_csv('C:/Users/JN/Desktop/AnovaData.csv') print Raw_data.describe() Raw_data.hist() pl.show() print Raw_data.BPM print stats.normaltest(Raw_data.BPM) New_Column = ['RSP_Cycle', 'BPM'] New_Raw_Data = Raw_data[New_Column] print New_Raw_Data print stats.mstats.kruskalwallis(New_Raw_Data)
import scipy.stats as stats
import numpy as np
import pandas as pd
import powerlaw
import matplotlib.pyplot as plt

data = pd.read_csv("data.csv", encoding="gbk")
col1 = data[u'2013年GDP(亿元)']
col2 = data[u'较2012年实际增长率'].dropna().map(lambda x: float(x[:-1]))
sc = (col2 - np.mean(col2)) / np.std(col2)

# power-law test
fit = powerlaw.Fit(col1)
R, p = fit.distribution_compare('power_law', 'lognormal')
print 'R', R, 'p', p
print "power_law fits worse than lognormal!"
fig4 = fit.plot_ccdf(linewidth=2)
fit.power_law.plot_ccdf(ax=fig4, color='r', linestyle='--')
fit.lognormal.plot_ccdf(ax=fig4, color='g', linestyle='--')
plt.show()

# normality test
des = stats.describe(col2)
omnibus, p_n = stats.normaltest(col2)
print 'p', p_n, 'it is not a normal distribution however'
plt.hist(col2)
plt.show()
def isInt(s): try: int(s) return True except ValueError: return False if __name__ == "__main__": argFiles = [] nameFiles = [] values = [] result = 0 results = [] nameFiles = os.listdir(sys.argv[1]) nameFiles.remove("source.py") with open(os.path.join(sys.argv[1], nameFiles[0]), 'r') as f: arg = f.readline() values = arg.split(' ') if (len(values) >= 20): for i in range(len(values)): results.append(float(values[i])) statist, hi_2 = stats.normaltest(results) with open(os.path.join(sys.argv[1], str(1) + "output.txt"), 'w') as f: f.write(str(hi_2)) else: exit(-1)
def dAgostinaTest(data):
    print(len(data))
    stat, p = normaltest(data)
    print(p)
'bias_r': right_fit['bias'], 'lapselow_l': left_fit['lapselow'], 'lapselow_r': right_fit['lapselow'], 'lapsehigh_l': left_fit['lapsehigh'], 'lapsehigh_r': right_fit['lapsehigh'], 'nickname': nickname, 'lab': lab}) biased_fits = biased_fits.append(fits, sort=False) # %% Statistics stats_tests = pd.DataFrame(columns=['variable', 'test_type', 'p_value']) posthoc_tests = {} for i, var in enumerate(['threshold_l', 'threshold_r', 'lapselow_l', 'lapselow_r', 'lapsehigh_l', 'lapsehigh_r', 'bias_l', 'bias_r']): _, normal = stats.normaltest(biased_fits[var]) if normal < 0.05: test_type = 'kruskal' test = stats.kruskal(*[group[var].values for name, group in biased_fits.groupby('lab')]) if test[1] < 0.05: # Proceed to posthocs posthoc = sp.posthoc_dunn(biased_fits, val_col=var, group_col='lab') else: posthoc = np.nan else: test_type = 'anova' test = stats.f_oneway(*[group[var].values for name, group in biased_fits.groupby('lab')]) if test[1] < 0.05: posthoc = sp.posthoc_tukey(biased_fits, val_col=var, group_col='lab')
def make2ds(args): ifiles = args.ifiles if len(args.figure_keywords) > 0: plt.setp(fig, **args.figure_keywords) if len(args.axes_keywords) > 0: plt.setp(ax, **args.axes_keywords) nborders = len(ax.collections) for fi, ifile in enumerate(ifiles): variables = args.variables if variables is None: variables = [ key for key, var in ifile.variables.items() if var.ndim == 2 ] if len(variables) == 0: raise ValueError( 'Unable to heuristically determin plottable variables; use -v to specify variables for plotting' ) for varkey in variables: var = ifile.variables[varkey] vals = var[:] if args.squeeze: vals = vals.squeeze() if args.normalize is None: from scipy.stats import normaltest vmin, vmax = vals.min(), vals.max() if normaltest(vals.ravel())[1] < 0.05: cvals = np.ma.compressed(vals) boundaries = np.percentile(cvals, np.arange(0, 110, 10)) warn( 'Autoselect deciles colormap of %s; override width --norm' % varkey) else: boundaries = np.linspace(vmin, vmax, num=11) warn( 'Autoselect linear colormap of %s; override width --norm' % varkey) if (boundaries.max() / np.ma.masked_values(boundaries, 0).min()) > 10000: formatter = LogFormatter(labelOnlyBase=False) else: formatter = None norm = BoundaryNorm(boundaries, ncolors=256) else: norm = eval(args.normalize) formatter = None if not args.colorbarformatter is None: try: formatter = eval(args.colorbarformatter) except: formatter = args.colorbarformatter vmin, vmax = vals.min(), vals.max() if not norm.vmin is None: vmin = norm.vmin if not norm.vmax is None: vmax = norm.vmax varunit = getattr(var, 'units', 'unknown').strip() vardims = [ dk for dk, dv in zip(var.dimensions, var.shape) if dv != 1 ] print(varkey, sep='') del ax.collections[nborders:] if args.swapaxes: patches = ax.pcolor(vals.T, norm=norm) ax.set_xlabel(vardims[0]) ax.set_ylabel(vardims[1]) else: patches = ax.pcolor(vals, norm=norm) ax.set_xlabel(vardims[1]) ax.set_ylabel(vardims[0]) height = vals.shape[0] width = vals.shape[1] if width >= height: orientation = 'horizontal' else: orientation = 'vertical' try: cax = cbar.ax cax.cla() except: cax = None if vals.max() > vmax and vals.min() < vmin: extend = 'both' elif vals.max() > vmax: extend = 'max' elif vals.min() < vmin: extend = 'min' else: extend = 'neither' cbar = fig.colorbar(patches, orientation=orientation, cax=cax, extend=extend, format=formatter) del cbar.ax.texts[:] cbar.set_label(varkey + ' (' + varunit + '; min=%.3g; max=%.3g)' % (var[:].min(), var[:].max())) # if orientation == 'vertical': # cbar.ax.text(.5, 1.05, '%.3g' % var[:].max(), horizontalalignment = 'center', verticalalignment = 'bottom') # cbar.ax.text(.5, -.06, '%.3g ' % var[:].min(), horizontalalignment = 'center', verticalalignment = 'top') # else: # cbar.ax.text(1.05, .5, ' %.3g' % var[:].max(), verticalalignment = 'center', horizontalalignment = 'left') # cbar.ax.text(-.06, .5, '%.3g ' % var[:].min(), verticalalignment = 'center', horizontalalignment = 'right') #cbar.update_ticks() fmt = 'png' outpath = args.outpath if len(ifiles) > 1: lstr = str(fi).rjust(len(str(len(ifiles))), '0') else: lstr = '' figpath = os.path.join(outpath + varkey + lstr + '.' + fmt) if args.interactive: csl = PNCConsole(locals=globals()) csl.interact() fig.savefig(figpath) if args.verbose > 0: print('Saved fig', figpath)
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import scipy.stats as ss

############
# one way
############
# prepare
df = pd.read_csv('oneway.csv')
a = df[df['algo'] == 'a']['ratio']
b = df[df['algo'] == 'b']['ratio']

# 1/4: normality
ss.normaltest(a); ss.normaltest(b)

# 2/4: homogeneity of variance
args = [a, b]
ss.levene(*args)

# F test
ss.f_oneway(*args)

# F test too
model = ols('ratio ~ algo', df).fit()
anovat = anova_lm(model)
def calc_ttest_dict(a, b, paired=False): ''' Calculate the comparison between the two sets of data Importantly, although the stars will be the same, this code accurately applies either a Student's t, Welch's t, or Mann Whitney U test ''' # Import what you need import numpy as np from scipy.stats import ttest_ind, ttest_rel, bartlett, mannwhitneyu, normaltest, wilcoxon stats_dict = {} # Mask out the not a numbers a = [ x for x in a if not np.isnan(x) ] b = [ x for x in b if not np.isnan(x) ] # Save number of people in each group stats_dict['n'] = (len(a), len(b)) # Conduct test for equal variance stats_dict['eqvar'] = bartlett(a, b) # Conduct test for normality stats_dict['normal'] = normaltest(np.hstack([a, b])) # When you test for equal means (ttest) you have different options # depending on if you have equal variances or not. You can also # run the non-parametric Mann Whitney U test # Alternatively these data may be paired so there's also the # paired t-test and the Wilcoxon signed rank test # All five will be entered in the stats_dict # Conduct Welch's t-test (unequal variances) stats_dict['ttest_uneqvar'] = ttest_ind(a, b, equal_var = False) # Conduct standard student's t-test (equal variances) stats_dict['ttest_eqvar'] = ttest_ind(a, b, equal_var = True) # Conduct mann whitney U test (non-parametric test of medians) stats_dict['mannwhitneyu'] = mannwhitneyu(a, b) if paired: # Conduct the paired student's t-test stats_dict['ttest_paired'] = ttest_rel(a, b) # Conduct Wilcoxon signed rank test (non-parametric *paired* test of medians) stats_dict['wilcoxon'] = wilcoxon(a, b) # Save in the stats dict the various other measures you might # want to report stats_dict['medians'] = [np.percentile(a, 50), np.percentile(b, 50)] stats_dict['percentile25'] = [np.percentile(a, 25), np.percentile(b, 25)] stats_dict['percentile75'] = [np.percentile(a, 75), np.percentile(b, 75)] stats_dict['means'] = [np.mean(a), np.mean(b)] stats_dict['stds'] = [np.std(a), np.std(b)] stats_dict['dfs'] = [(np.float(stats_dict['n'][0])-1), (np.float(stats_dict['n'][1])-1)] stats_dict['pooled_std'] = np.sqrt( (np.float(stats_dict['dfs'][0])*(np.float(stats_dict['stds'][0])**2) + np.float(stats_dict['dfs'][1])*(np.float(stats_dict['stds'][0])**2)) / (np.float(stats_dict['dfs'][0]) + np.float(stats_dict['dfs'][1]))) if paired: stats_dict['mean_difference'] = np.mean(np.array(b)-np.array(a)) stats_dict['std_difference'] = np.std(np.array(b)-np.array(a)) stats_dict['median_difference'] = np.percentile(np.array(b)-np.array(a), 50) stats_dict['percentile25_difference'] = np.percentile(np.array(b)-np.array(a), 25) stats_dict['percentile75_difference'] = np.percentile(np.array(b)-np.array(a), 75) stats_dict['cohens_d'] = np.float(stats_dict['mean_difference']) / np.float(stats_dict['pooled_std']) stats_dict['cohens_d_paired'] = np.float(stats_dict['mean_difference']) / np.float(stats_dict['std_difference']) return stats_dict
print('Probably the same distribution') else: print('Probably different distributions') stat8, p8 = ttest_ind(Parents_2019["soc_omg_1"], Parents_2019["soc_omg_2"]) print('stat=%.3f, p=%.3f' % (stat8, p8)) if p8 > 0.05: print('Probably the same distribution') else: print('Probably different distributions') #Test all the assumptions #test whether normal distributions stat, p = normaltest(Moved_out_2020["attitu_2"]) print('stat=%.3f, p=%.3f' % (stat, p)) if p > 0.05: print('Probably Gaussian') else: print('Probably not Gaussian') stat3, p3 = normaltest(Parents_2020["attitu_2"]) print('stat=%.3f, p=%.3f' % (stat3, p3)) if p3 > 0.05: print('Probably Gaussian') else: print('Probably not Gaussian') stat9, p9 = normaltest(Moved_out_2019["attitu_2"]) print('stat=%.3f, p=%.3f' % (stat9, p9))
from termcolor import colored, cprint
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

# Generating a normal distribution sample with 20 elements
sample = np.random.randn(20)
print(colored(('sample', sample), 'green'))

# normaltest tests the null hypothesis that the sample comes from a normal distribution.
out = stats.normaltest(sample)
print('normaltest output')
print('Z-score = ' + str(out[0]))
print('P-value = ' + str(out[1]))

# kstest is the Kolmogorov-Smirnov test for goodness of fit.
# Here its sample is being tested against the normal distribution.
# D is the KS statistic and the closer it is to 0 the better.
out = stats.kstest(sample, 'norm')
print('\nkstest output for the Normal distribution')
print('D = ' + str(out[0]))
print('P-value = ' + str(out[1]))

# Similarly, this can be easily tested against other distributions,
# like the Wald distribution.
out = stats.kstest(sample, 'wald')
print('\nkstest output for the Wald distribution')
print('D = ' + str(out[0]))
print('P-value = ' + str(out[1]))
# resample data to create time bars and compare normality tests with tick data
def get_bar_stats(agg_trades):
    vwap = agg_trades.apply(lambda x: np.average(x.price, weights=x.shares)).to_frame('vwap')
    ohlc = agg_trades.price.ohlc()
    vol = agg_trades.shares.sum().to_frame('vol')
    txn = agg_trades.shares.size().to_frame('txn')
    return pd.concat([ohlc, vwap, vol, txn], axis=1)

resampled = trades.resample('1Min')
time_bars = get_bar_stats(resampled)

# normality test for tick returns
normaltest(tick_bars.price.pct_change().dropna())

# compare to minute returns
normaltest(time_bars.vwap.pct_change().dropna())

price_volume(time_bars)

# time bars don't always account for fragmentation of orders; volume bars offer an alternative perspective
with pd.HDFStore(order_book_store) as store:
    trades = store['{}/trades'.format(stock)]
    trades.price = trades.price.mul(1e-4)
    trades = trades[trades.cross == 0]
    trades = trades.between_time(market_open, market_close).drop('cross', axis=1)

trades.info()

trades_per_min = trades.shares.sum() / (60 * 7.5)  # minutes per trading day
trades['cumul_vol'] = trades.shares.cumsum()
def q4():
    # Return the result of question 4 here.
    log_weight = np.log(amostra_weight)
    statistic, p_value = sct.normaltest(log_weight)
    return bool(p_value > ALPHA)
import numpy as np
from scipy import stats

pts = 1000
np.random.seed(28041990)
a = np.random.normal(0, 1, size=pts)
b = np.random.normal(2, 1, size=pts)
x = np.concatenate((a, b))
k2, p = stats.normaltest(x)
alpha = 0.05
print("p = {:g}".format(p))
if p < alpha:  # null hypothesis: x comes from a normal distribution
    print("The null hypothesis can be rejected")
else:
    print("The null hypothesis cannot be rejected")
def q3():
    # Return the result of question 3 here.
    statistic, p_value = sct.normaltest(amostra_weight)
    return bool(p_value > ALPHA)
def get_all_subsets(X, y, mallows=True): combs = [] results = [] for i in range(1, len(X) + 1): els = [list(x) for x in itertools.combinations(X, i)] combs.extend(els) for comb in combs: model = sm.OLS(y, sm.add_constant(X[list(comb)])) result = model.fit() results.append({ "model": model, "result": result, "num_vars": len(comb), "vars": X[list(comb)] }) full_mse_res = sm.OLS(y, sm.add_constant(X)).fit().mse_resid acceptable_models = {} for model in results: not_acceptable = False for pvalue in model["result"].pvalues: if pvalue > 0.05: not_acceptable = True break if not_acceptable: continue mallows_objective = model["num_vars"] curr_mallows = mallow_cp(model, full_mse_res, X.shape[0]) curr_min = None if model["num_vars"] in acceptable_models and len( acceptable_models[model["num_vars"]]) > 9: curr_min = acceptable_models[model["num_vars"]][-1]["mallows"] model["mallows"] = curr_mallows model["mallows_diff"] = abs(curr_mallows - mallows_objective) if not curr_min is None: if model["mallows_diff"] < abs(curr_min - mallows_objective): del acceptable_models[model["num_vars"]][-1] acceptable_models[model["num_vars"]].append(model) else: continue else: if not model["num_vars"] in acceptable_models: acceptable_models[model["num_vars"]] = [] acceptable_models[model["num_vars"]].append(model) acceptable_models[model["num_vars"]] = \ sorted(acceptable_models[model["num_vars"]], key=lambda k: k['mallows_diff']) curr_best = None for num_vars in acceptable_models: for model in acceptable_models[num_vars]: if curr_best is None: curr_best = model else: if curr_best["mallows_diff"] > model["mallows_diff"]: curr_best = model print(curr_best["result"].summary()) std = curr_best["model"].exog.std(0) std[0] = 1 tt = curr_best["result"].t_test(np.diag(std)) print(tt.summary()) tt.summary_frame() fig = plt.figure(figsize=(12, 30)) sm.graphics.plot_partregress_grid(curr_best["result"]) plt.savefig("resid_ny.png") #plt.show() if False: fig, ax = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(12, 10)) params = list(dict(curr_best["result"].params).keys()) n1 = math.floor(len(params) / 2) n2 = math.floor(len(params) % 2) for i in range(2): for j in range(2): try: ax[i, j].scatter(curr_best["result"].model.exog[:, i * 2 + j], curr_best["result"].resid) ax[i, j].set_xlabel(params[i * 2 + j]) ax[i, j].set_ylabel("resid") ax[i, j].axhline(y=0, color="black") except Exception: break plt.savefig("resid_sf.png") plt.show() #fig = plt.figure(figsize=(12, 10)) #fig = sm.graphics.plot_regress_exog(curr_best["result"], "per_white", fig=fig) fig = sm.graphics.plot_partregress_grid(curr_best["result"], fig=fig) fig.gca().set_title("") plt.suptitle("") plt.savefig("resid_ny.png") #plt.show() stat, p = shapiro(curr_best["result"].resid) print("Shapiro") print('Statistics=%.3f, p=%.3f' % (stat, p)) stat, p = normaltest(curr_best["result"].resid) print("D’Agostino’s") print('Statistics=%.3f, p=%.3f' % (stat, p)) stat, p = kstest(curr_best["result"].resid, 'norm') print("Kolmogorov-Smirnov") print('Statistics=%.3f, p=%.3f' % (stat, p)) #plot_mallows(acceptable_models) return curr_best["result"].rsquared_adj for var in curr_best["vars"]: coef = curr_best["result"].params[var] pos_neg = "pos" if coef < 0: pos_neg = "neg" try: dct[var + "_" + pos_neg] += 1 except Exception: dct[var + "_" + pos_neg] = 1
#normalcy test elif op == "jarqBera": jb, jbpv, skew, kurtosis = jarque_bera(data) printStat(jb, jbpv, "probably gaussian", "probably not gaussian") print(f'skew: {skew}') print(f'kurtosis: {kurtosis}') #shapiro wilks normalcy test elif op == "shapWilk": stat, pvalue = shapiro(data) printStat(stat, pvalue, "probably gaussian", "probably not gaussian") #D’Agostino’s K square normalcy test elif op == "dagast": stat, pvalue = normaltest(data) printStat(stat, pvalue, "probably gaussian", "probably not gaussian") #anderson darling normalcy test elif op == "andar": result = anderson(data) print("stat {:.3f}".format(result.statistic)) for i in range(len(result.critical_values)): sl, cv = result.significance_level[i], result.critical_values[i] if int(sl) == 5: if result.statistic < cv: print("probably gaussian at the {:.1f} level".format(sl)) else: print( "probably not gaussian at the {:.1f} level".format(sl)) #histogram
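# `printStat` is not defined in the excerpt above; a plausible minimal helper,
# shown here only as an assumption about its behavior (print the statistic and
# p-value, then the interpretation chosen by a 5% significance level), might be:
def printStat(stat, pvalue, msg_accept, msg_reject, alpha=0.05):
    # The "accept" message corresponds to failing to reject normality (p > alpha).
    print("stat {:.3f}, p-value {:.3f}".format(stat, pvalue))
    print(msg_accept if pvalue > alpha else msg_reject)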
# # * Does this result make sense?
# <font size = '5' color = 'green'>This result is qualitatively the same as the one given by the Shapiro-Wilk test, differing only quantitatively in the p-value, so it makes sense.</font>

# ## Question 3
#
# Now consider a sample of size 3000 from the `weight` column obtained with the `get_sample()` function. Run the D'Agostino-Pearson normality test using the `scipy.stats.normaltest()` function. Can we state that the weights come from a normal distribution at the 5% significance level? Answer with a boolean (`True` or `False`).

# In[14]:

sub_weight = get_sample(df, 'weight', n=3000)

# In[15]:

ap_t, ap_pvalue = sct.normaltest(sub_weight)
ap_pvalue

# In[16]:

def q3():
    # Return the result of question 3 here.
    return (ap_pvalue > 0.05)

q3()

# In[17]:
# %% Statistics stats_tests = pd.DataFrame(columns=['variable', 'test_type', 'p_value']) posthoc_tests = {} for i, var in enumerate([ 'perf_easy', 'threshold_l', 'threshold_r', 'threshold_n', 'bias_l', 'bias_r', 'bias_n' ]): # Remove any animals with NaNs test_fits = biased_fits[biased_fits[var].notnull()] # Test for normality _, normal = stats.normaltest(test_fits[var]) if normal < 0.05: test_type = 'kruskal' test = stats.kruskal( *[group[var].values for name, group in test_fits.groupby('lab')]) if test[1] < 0.05: # Proceed to posthocs posthoc = sp.posthoc_dunn(test_fits, val_col=var, group_col='lab') else: posthoc = np.nan else: test_type = 'anova' test = stats.f_oneway( *[group[var].values for name, group in test_fits.groupby('lab')]) if test[1] < 0.05: posthoc = sp.posthoc_tukey(test_fits, val_col=var, group_col='lab')
def get_corr_matrix_data(self, options, included_vars=None, extra_vars=None): if included_vars is None: included_vars = list(self.data) if extra_vars is not None: included_vars = included_vars + extra_vars else: extra_vars = [] categories = [ c for c in list(self.data) if 'date' not in c.lower() and c in included_vars ] categories.extend(extra_vars) categories = list(set(categories)) categories.sort() var_count = len(categories) categories_for_label = [ category.replace("Control Point", "CP") for category in categories ] categories_for_label = [ category.replace("control point", "CP") for category in categories_for_label ] categories_for_label = [ category.replace("Distance", "Dist") for category in categories_for_label ] for i, category in enumerate(categories_for_label): if category.startswith('DVH'): categories_for_label[i] = category.split("DVH Endpoint: ")[1] x_factors = categories_for_label y_factors = categories_for_label[::-1] s_keys = [ 'x', 'y', 'x_name', 'y_name', 'color', 'alpha', 'r', 'p', 'size', 'x_normality', 'y_normality', 'group' ] source_data = { 'corr': {sk: [] for sk in s_keys}, 'line': { 'x': [0.5, var_count - 0.5], 'y': [var_count - 0.5, 0.5] } } min_size, max_size = 3, 20 removed_mrns = set() for x in range(var_count): for y in range(var_count): if x > y and self.group == 1 or x < y and self.group == 2: if categories[x] not in extra_vars and categories[ y] not in extra_vars: bad_indices = [ i for i, v in enumerate(self.data[categories[x]] ['values']) if type(v) in [str, type(None)] ] bad_indices.extend([ i for i, v in enumerate(self.data[categories[y]] ['values']) if type(v) in [str, type(None)] ]) bad_indices = list(set(bad_indices)) removed_mrns = removed_mrns.union( set(self.mrns[i] for i in bad_indices)) x_data = [ v for i, v in enumerate(self.data[categories[x]] ['values']) if i not in bad_indices ] y_data = [ v for i, v in enumerate(self.data[categories[y]] ['values']) if i not in bad_indices ] if x_data and len(x_data) == len(y_data): r, p_value = scipy_stats.pearsonr(x_data, y_data) else: r, p_value = 0, 0 if np.isnan(r): r = 0 sign = ['neg', 'pos'][r >= 0] color = getattr( options, 'CORRELATION_%s_COLOR_%s' % (sign.upper(), self.group)) source_data['corr']['color'].append(color) source_data['corr']['r'].append(r) source_data['corr']['p'].append(p_value) source_data['corr']['alpha'].append(abs(r)) source_data['corr']['size'].append(( (max_size - min_size) * abs(r)) + min_size) source_data['corr']['x'].append( x + 0.5) # 0.5 offset due to bokeh 0.12.9 bug source_data['corr']['y'].append( var_count - y - 0.5) # 0.5 offset due to bokeh 0.12.9 bug source_data['corr']['x_name'].append( categories_for_label[x]) source_data['corr']['y_name'].append( categories_for_label[y]) source_data['corr']['group'].append(self.group) try: x_norm, x_p = scipy_stats.normaltest(x_data) except ValueError: x_p = 'N/A' try: y_norm, y_p = scipy_stats.normaltest(y_data) except ValueError: y_p = 'N/A' source_data['corr']['x_normality'].append(x_p) source_data['corr']['y_normality'].append(y_p) return { 'source_data': source_data, 'x_factors': x_factors, 'y_factors': y_factors }, removed_mrns
def analyze(initDate, finalDate, data_type="daily"): exchange = 'CCCAGG' completeOnly = True exWeekends = False # aggregated hourly price for Bitcoin (2000 row limit - use a loop) symbol = 'BTCUSD' BTCUSD = gd.getCrypto(symbol, initDate, finalDate, exchange, completeOnly, exWeekends, data_type=data_type) symbol = 'LTCBTC' LTCBTC = gd.getCrypto(symbol, initDate, finalDate, exchange, completeOnly, exWeekends, data_type=data_type) symbol = 'ETHBTC' ETHBTC = gd.getCrypto(symbol, initDate, finalDate, exchange, completeOnly, exWeekends, data_type=data_type) # store to disk BTCUSD.to_csv('./csv/BTCUSD.csv') LTCBTC.to_csv('./csv/LTCBTC.csv') ETHBTC.to_csv('./csv/ETHBTC.csv') # convert to pctdiffs dBTC = (BTCUSD.diff() / BTCUSD.shift()).dropna() dLTC = (LTCBTC.diff() / LTCBTC.shift()).dropna() dETH = (ETHBTC.diff() / ETHBTC.shift()).dropna() agg = pd.DataFrame([dBTC.Close, dLTC.Close, dETH.Close]).transpose() agg.columns = ['dBTC', 'dLTC', 'dETH'] # check correlations cAgg = np.corrcoef(agg.dropna(), rowvar=False) vAgg = np.cov(agg.dropna(), rowvar=False) # cut bottom 1% and top 1% of data points - prune outliers def middle(series, percentile): temp = series.sort_values(inplace=False) pctLen = int(round(len(temp) * percentile / 2, 0)) temp = temp[pctLen:len(temp) - pctLen].sort_index() return temp # test for stationarity percentile = .02 spreadLTC = (dLTC / dBTC).Close.dropna() spreadETH = (dETH / dBTC).Close.dropna() # sBTC = adfuller(dBTC.Close) # sLTCBTC = adfuller(spreadLTC) # sIOTBTC = adfuller(spreadIOT) # sETHBTC = adfuller(spreadETH) # if stationary and correlated, check for normal distribution k2, p = stats.normaltest(spreadLTC) # p <= .05 mLTC = middle((dLTC / dBTC).Close.dropna(), percentile) mETH = middle((dETH / dBTC).Close.dropna(), percentile) sdLTC = np.std(mLTC) mnLTC = np.mean(mLTC) assdLTC = spreadLTC / sdLTC # not using middles # display histogram spreadLTC.hist(range=[-20, 20], bins=100) assdLTC.hist(range=[-5, 5], bins=100) # sanity check prunedPct = len(assdLTC[np.abs(assdLTC) >= 3]) / len(assdLTC) + percentile # slice into sd levels and check autocorrelations def checkAutocorrelations(series, sdbottom, sdtop, lags): glomSeries = pd.DataFrame(series) for lag in range(1, lags + 1): glomSeries = glomSeries.join(pd.DataFrame(series.shift(lag)), rsuffix=str(lag), how='outer') subSeries = glomSeries[(np.abs(glomSeries.Close) >= sdbottom) & (np.abs(glomSeries.Close) < sdtop)].dropna() corrs = np.corrcoef(subSeries, rowvar=False) mainCol = subSeries.Close winProps = np.empty(0) for col in subSeries.columns: winners = subSeries[(((mainCol > 0) & (mainCol > subSeries[col])) | ((mainCol < 0) & (mainCol < subSeries[col])))] winProp = len(winners) / len(subSeries) winProps = np.append(winProps, winProp) return corrs[0], winProps # check autocorrelation priorSD = 0 for thisSD in np.arange(0.25, 5.25, 0.25): cor, win = checkAutocorrelations(spreadLTC, priorSD, thisSD, 9) print(thisSD, "C", cor) print(thisSD, "W", win) priorSD = thisSD return
from scipy import stats import matplotlib.pyplot as plt generated = stats.norm.rvs(size=900) print "Mean", "Std", stats.norm.fit(generated) print "Skewtest", "pvalue", stats.skewtest(generated) print "Kurtosistest", "pvalue", stats.kurtosistest(generated) print "normaltest", "pvalue", stats.normaltest(generated) print "95 percentile", stats.scoreatpercentile(generated, 95) print "Percentile at 1", stats.percentileofscore(generated, 1) plt.hist(generated) plt.show()
def q4():
    # Return the result of question 4 here.
    weight_log = np.log(weight)
    k2, p = sct.normaltest(weight_log)
    return bool(p > alpha)
""" 显著性检验:方差分析(Analysis of Variance,ANOVA,F 检验) 随机性:样本是随机采样但 独立性:来自不同组但样本是相互独立但 正太分布性:组内样本都来自一个正太分布 方差齐性:不同组但方差相等或相近 """ # 读取数据, d1 对应于算法 a,d2 对应于算法 b df = pd.read_csv("./oneway.csv") d1 = df[df['algo'] == 'a']['ratio'] d2 = df[df['algo'] == 'b']['ratio'] # 检验两个水平的正态性 print('---------------- 检验两个水平的正态性 ----------------') print(ss.normaltest(d1)) print(ss.normaltest(d2)) # 检测两个水平的方差齐性 print('---------------- 检测两个水平的方差齐性 ----------------') args = [d1, d2] print(ss.levene(*args)) # F 检验的第一种方法 print('---------------- F 检验的第一种方法 ----------------') print(ss.f_oneway(*args)) # F 检验的第二种方法 print('---------------- F 检验的第二种方法 ----------------') model = ols('ratio ~ algo', df).fit() anovat = anova_lm(model)
print(arma_mod30.params)
print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)

# Using this one also performs model selection (the AIC differs substantially from the other methods?)
# arma_mod30 = sm.tsa.AR(dta).fit(maxlag=15, ic='aic', disp=False)
# print(arma_mod30.params)
# print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)

# check if our model obeys the theory
resid = arma_mod30.resid  # residual
sm.stats.durbin_watson(resid.values)

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
ax = arma_mod30.resid.plot(ax=ax)

# test if the residual obeys the normal distribution
print(stats.normaltest(resid))

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)

# autocorrelation function and PARCOR of residual
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)

r, q, p = sm.tsa.acf(resid.values.squeeze(), qstat=True)
data = np.c_[range(1, 41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))
def evaluate_deviation_from_mean( self, results_reshaped: np.ndarray, result_averaged: np.ndarray, value_input: Union[float, np.ndarray], no_points_averaged: int = 1) -> np.ndarray: dev = results_reshaped - np.tile( np.expand_dims(result_averaged, axis=1), (1, no_points_averaged)) if np.ndim(value_input) == 1: plt.figure() value_input_formatted = np.expand_dims(value_input, axis=1) plt.plot(np.tile(value_input_formatted, reps=(1, no_points_averaged)), dev, marker='.') # n_param_values = np.size(dev, axis=0) + 1 # ax1 = plt.subplot(n_param_values, 1, 1) for i in range(0, np.size(dev, axis=0)): h3 = plt.figure() # plt.subplot(n_param_values, 1, i + 1).\ plt.hist(dev[i, :], stacked=False, label=str(value_input[i]), density=True) # plt.xlim([-0.1, 0.1]) plt.title(str(value_input[i])) if self.get_data_saver() is not None: self.get_data_saver().save_figure( h3, ("deviations_over_parameter%d" % i)) # else: # value_input_formatted = value_input h1 = plt.figure() plt.hist(np.ravel(dev), 100, density='true') mean_est = np.mean(np.ravel(dev)) std_est = np.std(np.ravel(dev)) x = np.linspace(np.min(np.ravel(dev)), np.max(np.ravel(dev))) random_normal = stats.norm(mean_est, std_est).pdf(x) plt.plot(x, random_normal, '--r', label='Fitted normal distribution') plt.legend() if not self.get_data_saver() is None: self.get_data_saver().save_figure(h1, "histogram_deviations_vadere") # plt.savefig('histogram_deviations_vadere.png') sm.qqplot(np.ravel(dev), line='s') h2 = plt.gcf() # plt.savefig('qqplot_deviations_vadere.png') if not self.get_data_saver() is None: self.get_data_saver().save_figure(h2, "qqplot_deviations_vadere") plt.close(h2) vadere_logger = logging.getLogger( "vaderemodel.evaluate_deviation_from_mean") vadere_logger.info("Vadere evaluations: Deviations from average") vadere_logger.info("Mean: %f, Std: %f" % (mean_est, std_est)) # skewtest needs at least 8 samples if len(np.ravel(dev)) >= 20: alpha = 0.01 k2, p = stats.normaltest(np.ravel(dev)) vadere_logger.info("p = {:g}".format(p)) if p < alpha: # null hypothesis: x comes from a normal distribution vadere_logger.info("The null hypothesis can be rejected") else: vadere_logger.info("The null hypothesis cannot be rejected") return dev
sleep(pause_time) ############################################################################### # Update Volume and Diff ############################################################################### if 1: diff = cu - cu.shift(1) diff = diff.dropna() values = diff.eur.values step = 20 alpha = 1e-6 for i in range(step, values.shape[0], step): vals = values[values.shape[0] - step:] k2, p = stats.normaltest(vals) if p < alpha: # null hypothesis: x comes from a normal distribution print('{}: break'.format(i)) #print("The null hypothesis can be rejected") else: #print("The null hypothesis cannot be rejected") if i % 3000 == 0: print(i) os.system('say "Completed"') ############################################################################### # Plot Everything. Ratios first ############################################################################### if 1: cu.eur.plot(title='cu')
b = (6.0 * (n**2.0 - 5.0 * n + 2.0)) / ((n + 7.0) * (n + 9.0)) b *= np.sqrt((6.0 * (n + 3.0) * (n + 5.0)) / (n * (n - 2.0) * (n - 3.0))) A = 6.0 + (8.0 / b) * (2.0 / b + np.sqrt(1.0 + 4.0 / b**2.0)) z = (1.0 - 2.0 / A) / (1.0 + X * np.sqrt(2.0 / (A - 4.0))) z = (1.0 - 2.0 / (9.0 * A)) - z**(1.0 / 3.0) z /= np.sqrt(2.0 / (9.0 * A)) return z K2 = Z1(S, N)**2.0 + Z2(K, N)**2.0 print('Omnibus: {}'.format(K2)) p = 1.0 - stats.chi2(2).cdf(K2) print('Pr( Omnibus ) = {}'.format(p)) (K2, p) = stats.normaltest(result.resid) print('Omnibus: {0}, p = {1}'.format(K2, p)) # --------------------- JB = (N / 6.0) * (S**2.0 + (1.0 / 4.0) * (K - 3.0)**2.0) p = 1.0 - stats.chi2(2).cdf(JB) print('JB-statistic: {:.5f}, p-value: {:.5f}'.format(JB, p)) # --------------------- X = np.matrix(X) EV = np.linalg.eig(X * X.T) print(EV) CN = np.sqrt(EV[0].max() / EV[0].min()) print('Condition No.: {:.5f}'.format(CN))