Example #1
    def test_basic(self):
        x1 = [0.11,7.87,4.61,10.14,7.95,3.14,0.46,
              4.43,0.21,4.75,0.71,1.52,3.24,
              0.93,0.42,4.97,9.53,4.55,0.47,6.66]
        w,pw = stats.shapiro(x1)
        assert_almost_equal(w,0.90047299861907959,6)
        assert_almost_equal(pw,0.042089745402336121,6)
        x2 = [1.36,1.14,2.92,2.55,1.46,1.06,5.27,-1.11,
              3.48,1.10,0.88,-0.51,1.46,0.52,6.20,1.69,
              0.08,3.67,2.81,3.49]
        w,pw = stats.shapiro(x2)
        assert_almost_equal(w,0.9590270,6)
        assert_almost_equal(pw,0.52460,3)

        # Verified against R
        np.random.seed(12345678)
        x3 = stats.norm.rvs(loc=5, scale=3, size=100)
        w, pw = stats.shapiro(x3)
        assert_almost_equal(w, 0.9772805571556091, decimal=6)
        assert_almost_equal(pw, 0.08144091814756393, decimal=3)

        # Extracted from original paper
        x4 = [0.139, 0.157, 0.175, 0.256, 0.344, 0.413, 0.503, 0.577, 0.614,
              0.655, 0.954, 1.392, 1.557, 1.648, 1.690, 1.994, 2.174, 2.206,
              3.245, 3.510, 3.571, 4.354, 4.980, 6.084, 8.351]
        W_expected = 0.83467
        p_expected = 0.000914
        w, pw = stats.shapiro(x4)
        assert_almost_equal(w, W_expected, decimal=4)
        assert_almost_equal(pw, p_expected, decimal=5)
Example #2
    def mleWithSgd(self,x_array,y_array):
        a,b,theta,loss = random.random(),random.random(),random.random(),2**31
        optimal_a,optimal_b,optimal_theta = 0,0,0
        for i in xrange(len(x_array)):
            x,y = x_array[i],y_array[i]
            a = a - self.learning_rate * (1/(theta*x)*(a*x+b-y))
            b = b - self.learning_rate * (1/(theta*(x**2))*(a*x+b-y))
            theta = theta - self.learning_rate * (-((y-a*x-b)**2)/((x**2)*(theta**3)) - theta)
            curr_loss = self.mleLossFunc(x_array,y_array,a,b,theta)
            if curr_loss<=loss:
                self.learning_rate*=1.05
                optimal_a,optimal_b,optimal_theta = a,b,theta
            else:
                self.learning_rate*=0.5
                a,b,theta = optimal_a,optimal_b,optimal_theta
            loss = curr_loss

            print curr_loss

        print "Output:"
        #print a,b,theta,self.learning_rate
        print optimal_a,optimal_b,optimal_theta,len(x_array)
        
        # Evaluation
        # Perform the Shapiro-Wilk test, which tests the null hypothesis
        # that the data were drawn from a normal distribution.
        normalized_array = np.array([(y_array[i]-optimal_a*x_array[i]-optimal_b)/(optimal_theta*x_array[i]) for i in range(len(x_array))])
        print stats.shapiro(normalized_array)
        #plt.plot(list(x_array),list(y_array),'ro')
        #plt.show()
        return optimal_a,optimal_b,optimal_theta
Example #3
    def test(self, arr1, arr2):
        p_value = 0
        if self.statistics == "auto":
            # check equality of variances with Levene's test; if they are equal
            if stats.levene(arr1, arr2)[1] > 0.05:
                # Shapiro-Wilk for normality of each sample; if both look normal
                if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                    # p = Student
                    p_value = stats.ttest_ind(arr1, arr2)[1]
                else:
                    # p = Mann
                    if equal(arr1, arr2):
                        p_value = 1
                    else:
                        p_value = stats.mannwhitneyu(arr1, arr2)[1]
            else:
                p_value = stats.ttest_ind(arr1, arr2, False)[1]

        elif self.statistics == "student":
            p_value = stats.ttest_ind(arr1, arr2)[1]
        elif self.statistics == "welch":
            p_value = stats.ttest_ind(arr1, arr2, False)[1]
        elif self.statistics == "mann":
            if equal(arr1, arr2):
                p_value = 1
            else:
                p_value = stats.mannwhitneyu(arr1, arr2)[1]
        return p_value
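For reference, a minimal self-contained sketch of the same auto-selection flow using scipy directly (the random data and the array-equality shortcut standing in for the `equal` helper are illustrative, not part of the original class):

import numpy as np
from scipy import stats

def auto_p_value(arr1, arr2, alpha=0.05):
    # Levene's test for equality of variances; unequal variances -> Welch's t-test
    if stats.levene(arr1, arr2)[1] <= alpha:
        return stats.ttest_ind(arr1, arr2, equal_var=False)[1]
    # Shapiro-Wilk on each sample; both normal -> Student's t-test
    if stats.shapiro(arr1)[1] > alpha and stats.shapiro(arr2)[1] > alpha:
        return stats.ttest_ind(arr1, arr2)[1]
    # otherwise the Mann-Whitney U test (identical samples short-circuit to p = 1)
    if np.array_equal(arr1, arr2):
        return 1.0
    return stats.mannwhitneyu(arr1, arr2)[1]

rng = np.random.default_rng(0)
print(auto_p_value(rng.normal(0, 1, 50), rng.normal(0.5, 1, 50)))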
Example #4
    def mleWithSgdNonlinear(self,x_array,y_array):
        a,b,theta0,theta1,theta2,loss = random.random(),random.random(),random.random(),random.random(),random.random(),2**31
        optimal_a,optimal_b,optimal_t0,optimal_t1,optimal_t2 = 0,0,0,0,0
        for i in xrange(len(x_array)):
            x,y = x_array[i],y_array[i]
            a = a - self.learning_rate * (x*(a*x+b-y)/(theta0*x**2+theta1*x+theta2) + self.reg_cof*a)
            b = b - self.learning_rate * ((a*x+b-y)/(theta0*x**2+theta1*x+theta2) + self.reg_cof*b)
            theta0 = theta0 - self.learning_rate * ((- (x**2) * ((y-a*x-b)**2)/((theta0*(x**2)+theta1*x+theta2)**3) + x**2/(theta0*(x**2)+theta1*x+theta2) ) + self.reg_cof*theta0)
            theta1 = theta1 - self.learning_rate * ((-x * ((y-a*x-b)**2)/((theta0*(x**2)+theta1*x+theta2)**3) + x/(theta0*(x**2)+theta1*x+theta2) ) + self.reg_cof*theta1) 
            theta2 = theta2 - self.learning_rate * (-((y-a*x-b)**2)/((theta0*(x**2)+theta1*x+theta2)**3) + 1/(theta0*(x**2)+theta1*x+theta2) + self.reg_cof*theta2)
            curr_loss = self.mleLossNonlinear(x_array,y_array,a,b,theta0,theta1,theta2)
            
            if curr_loss<loss:
                self.learning_rate*=1.05
                optimal_a,optimal_b,optimal_t0,optimal_t1,optimal_t2 = a,b,theta0,theta1,theta2
            else:
                self.learning_rate*=0.5
                a,b,theta0,theta1,theta2 = optimal_a,optimal_b,optimal_t0,optimal_t1,optimal_t2
            
            loss = curr_loss
            print curr_loss
        print "Output:"
        #print a,b,theta,self.learning_rate
        print optimal_a,optimal_b,optimal_t0,optimal_t1,optimal_t2
        
        # Evaluation
        # Perform the Shapiro-Wilk test, which tests the null hypothesis
        # that the data were drawn from a normal distribution.
        normalized_array = np.array([(y_array[i]-optimal_a*x_array[i]-optimal_b)/(optimal_t0*(x_array[i]**2)+optimal_t1*x_array[i]+optimal_t2) for i in range(len(x_array))])

        print stats.shapiro(normalized_array)
        plt.plot(normalized_array,[1]*len(normalized_array),'ro')
        plt.show()
  def main():
    if len(sys.argv) < 4:
      return 1
    _, list_a, list_b, significance = sys.argv[:4]
    list_a = json.loads(list_a)
    list_b = json.loads(list_b)
    significance = float(significance)

    shapiro_p_value = stats.shapiro(list_a)[1], stats.shapiro(list_b)[1]
    mann_whitney_p_value = stats.mannwhitneyu(list_a, list_b).pvalue
    anderson_p_value = stats.anderson_ksamp([list_a, list_b]).significance_level
    welch_p_value = stats.ttest_ind(list_a, list_b, equal_var=False)[1]

    results = {
        'first_sample': list_a,
        'second_sample': list_b,
        'shapiro_p_value': shapiro_p_value,
        'mann_p_value': mann_whitney_p_value,
        'anderson_p_value': anderson_p_value,
        'welch_p_value': welch_p_value,
    }

    if (results['shapiro_p_value'][0] < significance and
        results['shapiro_p_value'][1] < significance):
      results['normal-y'] = True
    else:
      results['normal-y'] = False
    results['significantly_different'] = bool(
        float(results['mann_p_value']) < float(significance))

    print json.dumps(results)
    return 0
def boxcoxtrans(filename, columns):
        s = columns
        w = pd.read_csv(filename, usecols=s)

        f = DataFrame(w)
        c = f.astype(float)

        x = c.as_matrix()


        e = []

        for j in np.linspace(-2, 2, num=21):

                if j != 0:

                    b =(x**j)

                    d=[]
                    c=[]
                    for i in range(0,len(b)):
                        c = b[i]
                        d.append(c[0])
                    

                    t = stats.shapiro(d)
                    
                    
                    e.append(t[1])




        for i in range(0,len(e)):

            if e[i]==max(e):

                break
        t=(-2+0.2*i)

        if t>=0:
            t=(-2+0.2*(i+1))

        print 'optimal lambda=',t

        h=((x**t)-1)/t
        l=[]
        m=[]
        for i in range(0,len(h)):
            l = h[i]
            m.append(l[0])


        print pd.DataFrame(m)
        k=stats.shapiro(m)

        print 'shapiro test of trans column',k
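For comparison, scipy ships a built-in maximum-likelihood Box-Cox search; a minimal sketch on a synthetic, strictly positive sample (not part of the function above):

import numpy as np
from scipy import stats

data = np.random.default_rng(1).lognormal(size=200)    # strictly positive sample
transformed, optimal_lambda = stats.boxcox(data)        # lambda chosen by maximum likelihood
print('optimal lambda =', optimal_lambda)
print('Shapiro-Wilk after transform:', stats.shapiro(transformed))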
Example #7
    def return_test_results(self, arr1, arr2):
        test_name = ""
        p_value = 0
        t_value = 0
        levene = stats.levene(arr1, arr2)[1]
        if self.statistics == "auto":
            # check equality of variances with Levene's test; if they are equal
            if levene > 0.05:
                # Shapiro-Wilk for normality of each sample; if both look normal
                if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                    # p = Student
                    test_name = "Student"
                    result = stats.ttest_ind(arr1, arr2)
                    t_value = result[0]
                    p_value = result[1]
                else:
                    # p = Mann
                    test_name = "Mann"
                    if equal(arr1, arr2):
                        t_value = None
                        p_value = 1
                    else:
                        result = stats.mannwhitneyu(arr1, arr2)
                        t_value = result[0]
                        p_value = result[1]
            else:
                test_name = "Welch"
                result = stats.ttest_ind(arr1, arr2, False)
                t_value = result[0]
                p_value = result[1]

        elif self.statistics == "student":
            test_name = "Student"
            result = stats.ttest_ind(arr1, arr2)
            t_value = result[0]
            p_value = result[1]
        elif self.statistics == "welch":
            test_name = "Welch"
            result = stats.ttest_ind(arr1, arr2, False)
            t_value = result[0]
            p_value = result[1]
        elif self.statistics == "mann":
            test_name = "Mann"
            if equal(arr1, arr2):
                t_value = None
                p_value = 1
            else:
                result = stats.mannwhitneyu(arr1, arr2)
                t_value = result[0]
                p_value = result[1]

        df = len(arr1) + len(arr2) - 2

        return [test_name, t_value, p_value, df, levene]
def test_sample_means_and_var_distribution(N, Pis, sample_size, multi, n_test):
    x_pvalues = []
    y_pvalues = []
    passed = []
    for i in range(n_test):
        x, y = multinomial_mean_and_var_errors(N, Pis, sample_size, multi)
        x_pvalue = spstats.shapiro(x)[1]
        y_pvalue = spstats.shapiro(y)[1]
        x_pvalues.append(x_pvalue)
        y_pvalues.append(y_pvalue)
        passed.append(min(x_pvalue, y_pvalue) >= .05)
    assert np.sum(np.array(passed)) >= .6 * n_test
def check_normality():
    '''Check if the distribution is normal.'''
    
    # Set the parameters
    numData = 1000
    myMean = 0
    mySD = 3
    
    # To get reproducible values, I provide a seed value
    np.random.seed(1234)   
    
    # Generate and show random data
    data = stats.norm.rvs(myMean, mySD, size=numData)
    fewData = data[:100]
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    pVals = pd.Series()
    pFewVals = pd.Series()
    # The scipy normaltest is based on D'Agostino and Pearson's test, which
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['Omnibus']    = stats.normaltest(data)
    _, pFewVals['Omnibus'] = stats.normaltest(fewData)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk']    = stats.shapiro(data)
    _, pFewVals['Shapiro-Wilk'] = stats.shapiro(fewData)
    
    # Or you can check for normality with Lilliefors-test
    _, pVals['Lilliefors']    = lillifors(data)
    _, pFewVals['Lilliefors'] = lillifors(fewData)
    
    # Alternatively with original Kolmogorov-Smirnov test
    _, pVals['Kolmogorov-Smirnov']    = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
    _, pFewVals['Kolmogorov-Smirnov'] = stats.kstest((fewData-np.mean(fewData))/np.std(fewData,ddof=1), 'norm')
    
    print('p-values for all {0} data points: ----------------'.format(len(data)))
    print(pVals)
    print('p-values for the first 100 data points: ----------------')
    print(pFewVals)
    
    if pVals['Omnibus'] > 0.05:
        print('Data are normally distributed')
    # --- >>> STOP stats <<< ---
    
    return pVals['Kolmogorov-Smirnov']
Example #10
 def test_basic(self):
     x1 = [0.11,7.87,4.61,10.14,7.95,3.14,0.46,
           4.43,0.21,4.75,0.71,1.52,3.24,
           0.93,0.42,4.97,9.53,4.55,0.47,6.66]
     w,pw = stats.shapiro(x1)
     assert_almost_equal(w,0.90047299861907959,6)
     assert_almost_equal(pw,0.042089745402336121,6)
     x2 = [1.36,1.14,2.92,2.55,1.46,1.06,5.27,-1.11,
           3.48,1.10,0.88,-0.51,1.46,0.52,6.20,1.69,
           0.08,3.67,2.81,3.49]
     w,pw = stats.shapiro(x2)
     assert_almost_equal(w,0.9590270,6)
     assert_almost_equal(pw,0.52460,3)
def main_plot_histogram():
	sigma = 0.10

	# initial setup
	W = dist_W(sigma)
	WI = dist_WI()

	#h, hist_edges = compute_histogram(W, WI, params)
	S, PS = compute_histogram(W, WI, params)
	S = S.flatten()
	PS = PS.flatten()
	#kindofvector(h)
	#kindofvector(hist_edges)
	#print(h)
	#print(hist_edges)
	#plt.plot(hist_edges, h)
	BINCNT = 100

	plt.hist(S,  bins=BINCNT, normed=True, histtype='step', alpha=1, label="act after tanh", color="b")
	plt.hist(PS, bins=BINCNT, normed=True, histtype='step', alpha=1, label="act before tanh", color="g")

	
	#W = shapiro(S)
	print("S size = ", S.size)
	print("shapiro S = ",shapiro(S))
	print("shapiro PS = ",shapiro(PS))
	stdS = std(S)
	print("stdS=",stdS)
	stdPS = std(PS)
	print("stdPS=", stdPS)

	x = linspace(-1, 1, 100)
	y = norm.pdf(x, loc=0, scale=stdS)
	plt.plot(x,y, color="b", alpha=0.2)

	y = norm.pdf(x, loc=0, scale=stdPS)
	plt.plot(x,y, color="g", alpha=0.2)


	#blue_line = mlines.Line2D([], [], color='blue', marker='.', markersize=15, label='Blue stars')
	

	plt.grid(True)
	plt.ylabel('density')
	plt.xlabel('activation value')
	plt.xlim([-1, 1])
	plt.title('activation distribution in reservoir ($\sigma_{blue}$=%.2f, $\sigma_{green}$=%.2f)' % (stdS, stdPS))

	plt.legend()
	plt.show()
Example #12
def test_routehop_normality(rows, attributes, key):
    print "Splitting..."
    instances = split_on_attributes(attributes, rows)

    print "Processing..."
    toofew = 0
    nonnormal = 0
    normal = 0
    for skey in instances.keys():
        times = array([s[key] for s in instances[skey]])
        n = len(times)
        mean_time = times.mean()
        std_time = times.std()

        if n >= 30:
            pval = stats.shapiro(times)[1]
            if pval < 0.05:
                nonnormal += 1
                # figure()
                # hist(times)
                # title("%s (p-val=%f, %d pts)" %(str(skey),pval,n));
            else:
                normal += 1
        else:
            toofew += 1

    print "Non,toofew,normal:", nonnormal, toofew, normal
Example #13
def check_normality():
    '''Check if the distribution is normal.'''
    # Generate and show a distribution
    numData = 100
    
    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)   
    
    data = stats.norm.rvs(myMean, mySD, size=numData)
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    pVals = pd.Series()
    # The scipy normaltest is based on D'Agostino and Pearson's test, which
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['omnibus'] = stats.normaltest(data)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk'] = stats.shapiro(data)
    
    # Or you can check for normality with Lilliefors-test
    ksStats, pVals['Lilliefors'] = kstest_normal(data)
    
    # Alternatively with original Kolmogorov-Smirnov test
    _, pVals['KS'] = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
    
    print(pVals)
    if pVals['omnibus'] > 0.05:
        print('Data are normally distributed')
Example #14
    def test(self, alpha, x):
        """
        Tests whether alpha and x are significantly correlated.
        The test assumes that x is normally distributed. The test
        function uses a Shapiro-Wilk test to test this assumption.

        :param alpha: independent variable, angles in radians
        :param x: dependent variable
        :return: test results of Shapiro-Wilk and Liddell-Ord test
        :rtype: pandas.DataFrame

        References: [Jammalamadaka2001]_
        """
        w, psw = stats.shapiro(x)
        if psw < 0.05:
            warnings.warn("This test requires Gaussian distributed x")

        rxc, rxs, rcs = np.corrcoef(x, np.cos(alpha))[0,1], np.corrcoef(x, np.sin(alpha))[0,1], \
                        np.corrcoef(np.cos(alpha), np.sin(alpha))[0,1]
        n = len(alpha)
        r2 = (rxc**2 + rxs**2 - 2*rxc*rxs*rcs)/(1 - rcs**2)
        f = (n-3)*r2/(1-r2)
        p = stats.f.sf(f, 2, n-3)

        df = pd.DataFrame(dict(
            test = ['Shapiro-Wilk','Liddell-Ord'],
            statistics = [w, f],
            p = [psw, p],
            dof = [None, (2, n-3)]
        )).set_index('test')
        return df
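The same circular-linear correlation can be computed outside the class; a small sketch on synthetic angles and a linear response (the data are invented purely for illustration):

import numpy as np
from scipy import stats

rng = np.random.default_rng(2)
alpha = rng.uniform(0, 2 * np.pi, 100)                       # angles in radians
x = 2.0 * np.cos(alpha) + rng.normal(scale=0.5, size=100)    # linear variable

print('Shapiro-Wilk on x:', stats.shapiro(x))                # check the normality assumption
rxc = np.corrcoef(x, np.cos(alpha))[0, 1]
rxs = np.corrcoef(x, np.sin(alpha))[0, 1]
rcs = np.corrcoef(np.cos(alpha), np.sin(alpha))[0, 1]
n = len(alpha)
r2 = (rxc**2 + rxs**2 - 2 * rxc * rxs * rcs) / (1 - rcs**2)
f = (n - 3) * r2 / (1 - r2)
print('Liddell-Ord p =', stats.f.sf(f, 2, n - 3))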
def robust_parameter(clusters, stats, elems):
    ''' Parameter to measure robustness of a G-mode test.
        
        The parameter is given by the weighted average plus a normality estimator:
        
        P1 = SUM( N * var ) / SUM( N )
        P2 = SUM( N^-1 * var ) / SUM( N^-1 ) 
        P3 = SUM( kstest(cluster, gaussian) )
        P = (P1/w1 + P2/w2 + P3/w3) / (w1^-1 + w2^-1 + w3^-1)
    '''
    from scipy.stats import shapiro
    from math import sqrt
    from itertools import izip
    
    shap, N, var = deque(), deque(), deque()
    for members, cl in izip(clusters, stats):
        # cluster size array
        N.append(len(members))
        # cluster variance array
        var.append(asum(cl[1]**2))
        # shapiro-wilk test:
        W_vec = array([shapiro(elems[members][n])[0]**2 for n in xrange(len(elems[0]))])
        # inversed shapiro-wilk W statistic.
        shap.append( sqrt(asum(1e0/W_vec)) )

    shap, N, var = array(shap), array(N), array(var)
    
    w1 =  sqrt(asum(mad(var, median(var))**2))
    w3 =  mad(shap, median(shap))
 
    p1 = asum( N * var ) / asum(N)
    p2 = asum( var/N ) / asum(1e0/N)
    p3 = median(shap)
    
    return (p1/w1 + p2/w1 + p3/w3) / (2e0/w1 + 1e0/w3)
Example #16
def sw(errors):
    """
    Shapiro Wilk Test

    The Null hypothesis for SW test is that the data forms a normal 
    distribution.

    Parameters
    -------------
    errors: error of voxels through time (shape of it is 221783*1)

    Returns
    ---------
    swstat: test statistics for SW test
    pval: P-value for the hypothesis test.
    """
    
    pval = []

    for i in range(errors.shape[-1]):
        pval.append(shapiro(errors[:,i])[1])

    pval = np.array(pval)
    shap=pval.shape[0]
    pval = np.reshape(pval, (shap, 1))


    return pval
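A quick usage sketch for a voxel-wise loop like the one above (the error array and its shape are synthetic; it only illustrates the per-column Shapiro-Wilk call):

import numpy as np
from scipy.stats import shapiro

errors = np.random.default_rng(3).normal(size=(100, 5))   # 100 time points x 5 "voxels"
pvals = np.array([shapiro(errors[:, i])[1] for i in range(errors.shape[-1])])
print(pvals.reshape(-1, 1))                                # one p-value per voxel, as in sw()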
 def shapiro_test(self, param):
     from scipy.stats import shapiro
     all_values = self._get_single_param_values(param)
     results = []
     for key, values in all_values:
         results.append((key, shapiro(sorted(values))))
     return results
Example #18
 def _box_cox_transform(self, verbose=False, method='standard'):
     """
     Performs the Box-Cox transformation, over different ranges, picking the optimal one w. respect to normality.
     """
     from scipy import stats
     a = sp.array(self.values)
     if method == 'standard':
         vals = (a - min(a)) + 0.1 * sp.var(a)
     else:
         vals = a
     sw_pvals = []
     lambdas = sp.arange(-2.0, 2.1, 0.1)
     for l in lambdas:
         if l == 0:
             vs = sp.log(vals)
         else:
             vs = ((vals ** l) - 1) / l
         r = stats.shapiro(vs)
         if sp.isfinite(r[0]):
             pval = r[1]
         else:
             pval = 0.0
         sw_pvals.append(pval)
     i = sp.argmax(sw_pvals)
     l = lambdas[i]
     if l == 0:
         vs = sp.log(vals)
     else:
         vs = ((vals ** l) - 1) / l
     self._perform_transform(vs,"box_cox")
     log.debug('optimal lambda was %0.1f' % l)
     return True
Example #19
def statFile(key, values, pruneX, pruneX2):
    oFilename = values["file"]+".csv"
    data = []
    prune = False
    minX = 0
    maxX = 0

    if len(pruneX) > 0 and len(pruneX2) > 0:
        minX = float(pruneX)
        maxX = float(pruneX2)
        prune = True

    with open(oFilename) as f:
        for l in f.readlines():
            arrLine = l.strip().split()
            if len(arrLine) == 2:
                t = float(arrLine[0])
                if prune:
                    if t >= minX and t <= maxX:
                        data.append(float(arrLine[1]))
                    elif t > maxX:
                        break
                else:
                    data.append(float(arrLine[1]))
    x = np.array(data)
    with open("stats.txt", "a") as f:
        f.write(key + ":\n")
        f.write("  mean: "+str(x.mean())+"\n")
        f.write("  std: "+str(x.std())+"\n")
        f.write("  median: "+str(np.median(x))+"\n")
        f.write("  min: "+str(x.min())+"\n")
        f.write("  max: "+str(x.max())+"\n")
        f.write("  normality: "+str(stats.shapiro(x)[1])+"\n")
Example #20
def IsNormallyDistributed(sample, significance_level=0.05,
                          return_p_value=False):
  """Calculates Shapiro-Wilk test for normality.

  Note that normality is a requirement for Welch's t-test.

  Args:
    sample: List of values of benchmark result for a measure.
    significance_level: The significance level the p-value is compared against.
    return_p_value: Whether or not to return the calculated p-value.

  Returns:
    is_normally_distributed: Returns True or False.
    p_value: The calculated p-value.
  """
  if not stats:
    raise ImportError('This function requires Scipy.')

  # pylint: disable=unbalanced-tuple-unpacking
  _, p_value = stats.shapiro(sample)

  is_normally_distributed = p_value >= significance_level
  if return_p_value:
    return is_normally_distributed, p_value
  return is_normally_distributed
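A hedged usage sketch of the same check, calling scipy directly (the sample values are invented):

from scipy import stats

sample = [9.8, 10.1, 10.0, 9.7, 10.3, 9.9, 10.2, 10.0, 9.6, 10.4]
_, p_value = stats.shapiro(sample)
print(p_value >= 0.05, p_value)   # True -> consistent with normality at the 5% level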
Example #21
    def gStats(self, missingValue=0.0):
        """dict of {geneID: (min,max,mean,median,std,stderr,
        Shapiro-Wilk(w,p),normaltest_chisq (D'Agostino and Pearson),...}
        """
        import scipy as S
        import scipy.stats as SS

        rv = {}
        for k, v in self.items():
            # print k,v
            va = S.array(self.gValues(k, missingValue))

            try:
                normaltest = SS.normaltest(va)
            except:
                normaltest = None
            try:
                shapiro = SS.shapiro(va)
            except:
                shapiro = None

            try:
                rv[k] = (va.min(), va.max(), va.mean(), SS.median(va), SS.std(va), SS.stderr(va), normaltest, shapiro)
            except:
                print k, va
                raise
        return rv
def multiple_comp (residuals): 
  """
  input: residuals, 2d array (voxels,timecourse)
  output: a list of the number of voxels that being tested as not normally distributed, based on 
  		alpha-test, Bonferroni procedure, Hochberg procedure and  Benjamini-Hochberg procedure respectively
  """

  ## Alpha Test
  p_nor = []
  for i in range(0,residuals.shape[0]):
      p_nor.append(stats.shapiro(residuals[i,:])[1])

  # for p<0.05, the voxel is not normal distributed
  p_nor_005 = [i for i in p_nor if i < 0.05]

  ##Bonferroni Procedure
  p_bonf = [i for i in p_nor if i < (0.05 / residuals.shape[0])]

  ## Hochberg Procedure
  p_nors = np.sort(p_nor)
  alpha = 0.05
  n=len(p_nors)
  tf=[]
  for i in range(0,n):
      thres = alpha/(n+1-(i+1))
      tf.append(p_nors[i]<=thres)

  ##Benjamini-Hochberg procedure
  tf_bh=[]
  for i in range(0,len(p_nors)):
      thres = (i/n)*alpha
      tf_bh.append(p_nors[i]<=thres)

  return [len(p_nor_005),len(p_bonf),sum(tf),sum(tf_bh)]
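The Bonferroni and Benjamini-Hochberg corrections used above are also available off the shelf in statsmodels; a minimal sketch, assuming a voxels-by-time residual array as in the function (the synthetic data are only for illustration):

import numpy as np
from scipy import stats
from statsmodels.stats.multitest import multipletests

residuals = np.random.default_rng(4).normal(size=(50, 100))   # 50 voxels x 100 time points
p_nor = np.array([stats.shapiro(residuals[i, :])[1] for i in range(residuals.shape[0])])

reject_bonf = multipletests(p_nor, alpha=0.05, method='bonferroni')[0]
reject_bh = multipletests(p_nor, alpha=0.05, method='fdr_bh')[0]
print(int(np.sum(p_nor < 0.05)), int(reject_bonf.sum()), int(reject_bh.sum()))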
Example #23
def run(args):
    report = ResultReportWriter()

    team_names, results = rcss.run_matches(args.team_a, args.team_b, args.match_count)
    report.write_json('match_results.json', {
        'binaries': [args.team_a, args.team_b],
        'teams': team_names,
        'results': results,
    })

    errors = []
    score = [x - y for x, y in results]
    # alpha, 1 - alpha
    # alpha = probability of rejecting a true null hypothesis
    significance, confidence = (args.significance, 1 - args.significance)
    _, normality_p = stats.shapiro(score)
    if normality_p <= significance:
        errors.append('Shapiro test rejected normality')
    mean = numpy.mean(score)
    std_error = stats.sem(score)
    confidence_interval = stats.t.interval(confidence, len(score) - 1,
                                           loc=mean, scale=std_error)
    report.write_json('statistics.json', {
        'binaries': [args.team_a, args.team_b],
        'teams': team_names,
        'normality_p': normality_p,
        'score': [confidence_interval[0], mean, confidence_interval[1]],
        'score_std': std_error,
        'params': {
            'significance': args.significance,
        },
        'errors': errors,
    })
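The interval construction from the end of this example, isolated as a minimal sketch on made-up score differences:

import numpy
from scipy import stats

score = [2, -1, 0, 3, 1, 2, -2, 1, 0, 2]       # per-match goal differences, invented
_, normality_p = stats.shapiro(score)          # should exceed the significance level for the CI to be meaningful
mean = numpy.mean(score)
std_error = stats.sem(score)
confidence_interval = stats.t.interval(0.95, len(score) - 1, loc=mean, scale=std_error)
print(normality_p, mean, confidence_interval)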
Example #24
def pearson_or_shapiro(data):
    """pearson_or_shapiro

    Use D'agostino/Pearson if possible (n >= 20), else Shapiro
    :param data:
    """
    return stats.normaltest(data) if len(data) >= 20 else stats.shapiro(data)
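A quick usage sketch for the helper above (random samples, with sizes chosen only to hit both branches):

import numpy as np
from scipy import stats

rng = np.random.default_rng(5)
print(pearson_or_shapiro(rng.normal(size=10)))    # n < 20  -> stats.shapiro
print(pearson_or_shapiro(rng.normal(size=200)))   # n >= 20 -> stats.normaltest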
Example #25
    def test_nan_input(self):
        x = np.arange(10.)
        x[9] = np.nan

        w, pw = stats.shapiro(x)
        assert_equal(w, np.nan)
        assert_almost_equal(pw, 1.0)
Example #26
 def test_MultivariateNormalQMCEngineDegenerate(self, cuda=False):
     device = torch.device("cuda") if cuda else torch.device("cpu")
     for dtype in (torch.float, torch.double):
         # X, Y iid standard Normal and Z = X + Y, random vector (X, Y, Z)
         mean = torch.zeros(3, device=device, dtype=dtype)
         cov = torch.tensor(
             [[1, 0, 1], [0, 1, 1], [1, 1, 2]], device=device, dtype=dtype
         )
         engine = MultivariateNormalQMCEngine(mean=mean, cov=cov, seed=12345)
         samples = engine.draw(n=2000)
         self.assertEqual(samples.dtype, dtype)
         self.assertEqual(samples.device.type, device.type)
         self.assertTrue(torch.all(torch.abs(samples.mean(dim=0)) < 1e-2))
         self.assertTrue(torch.abs(torch.std(samples[:, 0]) - 1) < 1e-2)
         self.assertTrue(torch.abs(torch.std(samples[:, 1]) - 1) < 1e-2)
         self.assertTrue(torch.abs(torch.std(samples[:, 2]) - math.sqrt(2)) < 1e-2)
         for i in (0, 1, 2):
             _, pval = shapiro(samples[:, i].cpu().numpy())
             self.assertGreater(pval, 0.9)
         cov = np.cov(samples.cpu().numpy().transpose())
         self.assertLess(np.abs(cov[0, 1]), 1e-2)
         self.assertLess(np.abs(cov[0, 2] - 1), 1e-2)
         # check to see if X + Y = Z almost exactly
         self.assertTrue(
             torch.all(
                 torch.abs(samples[:, 0] + samples[:, 1] - samples[:, 2]) < 1e-5
             )
         )
Example #27
 def most_normal_transformation(self, pid, trans_types=['none', 'sqrt', 'log', 'sqr', 'exp', 'arcsin_sqrt'],
             perform_trans=True, verbose=False):
     """
     Performs the transformation which results in most normal looking data, according to Shapiro-Wilk's test
     """
     #raw_values = self.phen_dict[pid]['values']
     from scipy import stats
     shapiro_pvals = []
     for trans_type in trans_types:
         if trans_type != 'none':
             if not self.transform(pid, trans_type=trans_type):
                 continue
         phen_vals = self.get_values(pid)
         #print 'sp.inf in phen_vals:', sp.inf in phen_vals
         if sp.inf in phen_vals:
             pval = 0.0
         else:
             r = stats.shapiro(phen_vals)
             if sp.isfinite(r[0]):
                 pval = r[1]
             else:
                 pval = 0.0
         shapiro_pvals.append(pval)
         #self.phen_dict[pid]['values'] = raw_values
         if trans_type != 'none':
             self.revert_to_raw_values(pid)
     argmin_i = sp.argmax(shapiro_pvals)
     trans_type = trans_types[argmin_i]
     shapiro_pval = shapiro_pvals[argmin_i]
     if perform_trans:
         self.transform(pid, trans_type=trans_type)
     if verbose:
         print "The most normal-looking transformation was %s, with a Shapiro-Wilk's p-value of %0.6f" % \
             (trans_type, shapiro_pval)
     return trans_type, shapiro_pval
Example #28
 def most_normal_transformation(self,trans_types=SUPPORTED_TRANSFORMATIONS,
             perform_trans=True, verbose=False):
     """
     Performs the transformation which results in most normal looking data, according to Shapiro-Wilk's test
     """
     from scipy import stats
     shapiro_pvals = []
     for trans_type in trans_types:
         if trans_type == 'most_normal':
             continue
         if trans_type != 'none':
             if not self.transform(trans_type=trans_type):
                 continue
         phen_vals = self.values
         #print 'sp.inf in phen_vals:', sp.inf in phen_vals
         if sp.inf in phen_vals:
             pval = 0.0
         else:
             r = stats.shapiro(phen_vals)
             if sp.isfinite(r[0]):
                 pval = r[1]
             else:
                 pval = 0.0
         shapiro_pvals.append(pval)
         if trans_type != 'none':
             self.revert_to_raw_values()
     argmin_i = sp.argmax(shapiro_pvals)
     trans_type = trans_types[argmin_i]
     shapiro_pval = shapiro_pvals[argmin_i]
     if perform_trans:
         self.transform(trans_type=trans_type)
     log.info("The most normal-looking transformation was %s, with a Shapiro-Wilk's p-value of %.2E" % \
             (trans_type, shapiro_pval))
     return trans_type, shapiro_pval
 def distribution(self,gene,thresholdNorm):
     self.z,self.pval=stats.shapiro(gene[1:])
     if self.pval<thresholdNorm:
         #print 'not normal distribution'
         return self.pval
     else:
         #print'normal'
         return self.pval
def test_normality_increase_lambert():
    # Generate random data and check that it is more normal after inference
    for i, y in enumerate([np.random.standard_cauchy(size=ns), experimental_data]):
        print "Distribution %d" % i
        print "Before"
        print ("anderson: %0.3f\tshapiro: %0.3f" % (anderson(y)[0], shapiro(y)[0])).expandtabs(30)
        stats.probplot(y, dist="norm", plot=pylab)
        pylab.savefig("%d_before.png" % i)
        pylab.clf()

        tau = g.igmm(y)
        x = g.w_t(y, tau)
        print "After"
        print ("anderson: %0.3f\tshapiro: %0.3f" % (anderson(x)[0], shapiro(x)[0])).expandtabs(30)
        stats.probplot(x, dist="norm", plot=pylab)
        pylab.savefig("%d_after.png" % i)
        pylab.clf()
Example #31
def plot_boxplots(df):

    # %% boxplot chemotherapy
    fig, ax = plt.subplots(figsize=(12, 10))
    df_chemo = df.copy()
    df_chemo['Ablation Volume [ml] / Energy [kJ]'] = df_chemo[
        'Ablation Volume [ml]'] / df_chemo['Energy [kj]']
    df_chemo.dropna(subset=['Ablation Volume [ml] / Energy [kJ]'],
                    inplace=True)
    df_chemo.dropna(subset=['chemo_before_ablation'], inplace=True)
    df_chemo['chemo_before_ablation'].replace('No', False, inplace=True)
    df_chemo['chemo_before_ablation'].replace('Yes', True, inplace=True)

    df.dropna(subset=['Ablation Volume [ml]'], inplace=True)
    df.dropna(subset=['chemo_before_ablation'], inplace=True)
    df['chemo_before_ablation'].replace('No', False, inplace=True)
    df['chemo_before_ablation'].replace('Yes', True, inplace=True)
    # ttest
    no_chemo_df = df_chemo[df_chemo['chemo_before_ablation'] == False]
    no_chemo = no_chemo_df['Ablation Volume [ml]'].tolist()
    chemo_df = df_chemo[df_chemo['chemo_before_ablation'] == True]
    chemo = chemo_df['Ablation Volume [ml]'].tolist()

    fig, ax = plt.subplots(figsize=(12, 10))
    plt.hist(no_chemo)
    plt.title('No Chemotherapy')
    plt.ylabel('Ablation Volume [ml]')
    figpathHist = os.path.join("figures",
                               "histogram ablation volumes no chemo")
    gh.save(figpathHist, ext=['png'], close=True)
    fig1, ax = plt.subplots(figsize=(12, 10))
    plt.hist(chemo)
    plt.title('Chemotherapy')
    plt.ylabel('Ablation Volume [ml] ')
    figpathHist = os.path.join("figures", "histogram ablation volumes chemo")
    gh.save(figpathHist, ext=['png'], close=True)

    print('no of tumors with chemo:', str(len(chemo)))
    print('no of tumors with no chemo:', str(len(no_chemo)))
    #
    stat, p_chemo = shapiro(chemo)

    # interpret
    alpha_chemo = 0.05
    if p_chemo > alpha_chemo:
        msg = 'Sample Chemo looks Gaussian (fail to reject H0)'
    else:
        msg = 'Sample Chemo does not look Gaussian (reject H0)'
    print(msg)

    stat, p_no_chemo = shapiro(no_chemo)

    # interpret
    alpha_no_chemo = 0.05
    if p_no_chemo > alpha_no_chemo:
        msg = 'Sample No Chemo looks Gaussian (fail to reject H0)'
    else:
        msg = 'Sample No Chemo does not look Gaussian (reject H0)'
    print(msg)

    if p_no_chemo < alpha_no_chemo and p_chemo < alpha_chemo:
        t, p = stats.mannwhitneyu(chemo, no_chemo)
        print(
            'Mann-Whitney U test applied for samples coming from a non-Gaussian distribution:'
        )
        print("t = " + str(t))
        print("p = " + str(p))
    else:
        t, p = stats.ttest_ind(chemo, no_chemo)
        print('ttest applied for samples coming from a Gaussian distribution:')
        print("t = " + str(t))
        print("p = " + str(p))

    fig, ax = plt.subplots(figsize=(12, 10))
    bp_dict = df.boxplot(column=['Ablation Volume [ml]'],
                         ax=ax,
                         notch=True,
                         by='chemo_before_ablation',
                         patch_artist=True,
                         return_type='both')
    ax.set_xlabel('')
    plt.show()
    for row_key, (ax, row) in bp_dict.iteritems():
        for i, box in enumerate(row['fliers']):
            box.set_marker('o')
        for i, box in enumerate(row['boxes']):
            if i == 0:
                box.set_facecolor('Purple')
                box.set_edgecolor('DarkMagenta')
            else:
                box.set_facecolor('LightPink')
                box.set_edgecolor('HotPink')
        for i, box in enumerate(row['medians']):
            box.set_color(color='Black')
            box.set_linewidth(2)
        for i, box in enumerate(row['whiskers']):
            box.set_color(color='Black')
            box.set_linewidth(2)
    xticklabels = [
        'No Chemotherapy before Ablation',
        'Chemotherapy Administered before Ablation'
    ]
    xtickNames = plt.setp(ax, xticklabels=xticklabels)
    plt.setp(xtickNames, fontsize=10, color='black')
    plt.ylim([-2, 120])
    plt.ylabel('Ablation Volume [ml]', fontsize=12, color='k')
    plt.tick_params(labelsize=10, color='black')
    ax.tick_params(colors='black', labelsize=10, color='k')
    ax.set_ylim([-2, 120])
    plt.xlabel('')
    fig.suptitle('')
    plt.title('')
    # plt.title('Comparison of Ratio (Ablation Volumes [ml] : Energy [kJ]) from MAVERRIC Dataset by Chemotherapy', fontsize=12)
    plt.title(
        'Comparison of Ablation Volumes [ml] from MAVERRIC Dataset by Chemotherapy',
        fontsize=12)
    figpathHist = os.path.join(
        "figures", "boxplot ablation volumes by chemo before ablation")
    gh.save(figpathHist, ext=['png'], close=True)

    # %% BOXPLOTS ABLATION VOLUMES

    # ttest
    df_volumes = df.copy()
    df_volumes.dropna(subset=['Ablation Volume [ml]'], inplace=True)
    df_volumes.dropna(subset=['Ablation Volume [ml] (manufacturers)'],
                      inplace=True)
    ablation_vol = df_volumes['Ablation Volume [ml]'].tolist()
    ablation_vol_brochure = df_volumes[
        'Ablation Volume [ml] (manufacturers)'].tolist()

    stat, p_brochure = shapiro(ablation_vol_brochure)
    # interpret
    alpha_brochure = 0.05
    if p_brochure > alpha_brochure:
        msg = 'Sample Ablation Volume Brochure looks Gaussian (fail to reject H0)'
    else:
        msg = 'Sample Ablation Volume Brochure does not look Gaussian (reject H0)'
    print(msg)

    stat, p_voxel = shapiro(ablation_vol)
    # interpret
    alpha_voxel = 0.05
    if p_voxel > alpha_voxel:
        msg = 'Sample Ablation Volume looks Gaussian (fail to reject H0)'
    else:
        msg = 'Sample Ablation Volume does not look Gaussian (reject H0)'
    print(msg)

    if p_voxel < alpha_voxel and p_brochure < alpha_brochure:
        t, p = stats.mannwhitneyu(ablation_vol, ablation_vol_brochure)
        print(
            'Mann-Whitney U test applied for samples coming from a non-Gaussian distribution:'
        )
        print("t = " + str(t))
        print("p = " + str(p))
    else:
        t, p = stats.ttest_ind(ablation_vol, ablation_vol_brochure)
        print('ttest applied for samples coming from a Gaussian distribution:')
        print("t = " + str(t))
        print("p = " + str(p))

    fig, ax = plt.subplots(figsize=(12, 10))
    bp_dict = df.boxplot(column=[
        'Ablation Volume [ml]', 'Ablation Volume [ml] (parametrized_formula)',
        'Ablation Volume [ml] (manufacturers)'
    ],
                         ax=ax,
                         notch=True,
                         patch_artist=True,
                         return_type='both')
    ax.set_xlabel('')
    row = bp_dict.lines
    # for idx,row in enumerate(lines):
    for i, box in enumerate(row['fliers']):
        box.set_marker('o')
        # box.set_edgecolor('RoyalBlue')
    for i, box in enumerate(row['boxes']):
        if i == 0:
            box.set_facecolor('Blue')
            box.set_edgecolor('MediumBlue')
        elif i == 1:
            box.set_facecolor('BlueViolet')
            box.set_edgecolor('BlueViolet')
        elif i == 2:
            box.set_facecolor('DeepSkyBlue')
            box.set_edgecolor('DodgerBlue')

    for i, box in enumerate(row['medians']):
        box.set_color(color='Black')
        box.set_linewidth(2)
    for i, box in enumerate(row['whiskers']):
        box.set_color(color='Black')
        box.set_linewidth(2)

    xticklabels = [
        'Ablation Volume [ml] (Voxel-Based)',
        'Ablation Volume [ml] (Ellipsoid Formula)',
        'Ablation Volume [ml] (Manufacturers Brochure)'
    ]
    xtickNames = plt.setp(ax, xticklabels=xticklabels)
    plt.setp(xtickNames, fontsize=10, color='black')
    plt.ylim([-2, 150])
    plt.ylabel('Ablation Volume [ml]', fontsize=14, color='k')
    plt.tick_params(labelsize=10, color='black')
    ax.tick_params(colors='black', labelsize=10, color='k')
    ax.set_ylim([-2, 150])
    plt.title('Comparison of Ablation Volumes [ml] from MAVERRIC Dataset',
              fontsize=16)
    figpathHist = os.path.join("figures", "boxplot volumes")
    gh.save(figpathHist, ext=['png'], close=True)
Example #32
def nortest(df, a):
    _, sw = shapiro(df[a])
    _, ap = normaltest(df[a])
    index = ['Shapiro-Wilk', 'D\'Agostino-Pearson']
    columns = ['p-value']
    return pd.DataFrame([sw, ap], index=index, columns=columns)
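A usage sketch for `nortest` (the DataFrame and column name are invented; the imports match what the snippet assumes is already in scope):

import numpy as np
import pandas as pd
from scipy.stats import shapiro, normaltest

df = pd.DataFrame({'height': np.random.default_rng(6).normal(170, 8, size=300)})
print(nortest(df, 'height'))   # one p-value per normality test, labelled by row index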
Example #33
df = pd.read_csv('datasets/cats-data.csv', sep=",", index_col=0)
print(df)

df_female = df[df["Sex"] == "F"]
df_male = df[df["Sex"] == "M"]


def test_normal_distribution(p_group, alpha):
    return p_group > alpha


def test_hypothesis(p, alpha):
    return p > alpha


W, p_female = st.shapiro(df_female["Hwt"])
print('For female cats normal distribution test result is:',
      test_normal_distribution(p_female, alpha))
W, p_male = st.shapiro(df_male["Hwt"])
print('For male cats normal distribution test result is:',
      test_normal_distribution(p_male, alpha))

t, p = st.ttest_ind(df_female["Hwt"], df_male["Hwt"])
hypothesis_result = test_hypothesis(p, alpha)
print("Hypothesis that heart weight is equal for male and female cats is: ",
      hypothesis_result)


def display_hist(data_female, data_male):
    data.plot.hist(bins=40)
    plt.legend(loc="upper right")
# H0 : M1=M2 ("There is no statistically significant difference between the Purchase averages of the two groups.")
# H1: M1 != M2 ("There is a statistically significant difference between the Purchase averages of the two groups.")
"""

# 2. Assumption Control

# 2.1. Normality Assumption (shapiro)
# Shapiro Wilk Test is used for the assumption of normality.
"""
# Defining the hypotheses for the normality assumption.
# H0 : Normality assumption is provided for this sample.
# H1 : Normality assumption is not provided for this sample.
"""

hf.hypothesis_test(
    shapiro(A))  # P-value = 0.5891, so that H0 can NOT be REJECTED!
hf.hypothesis_test(
    shapiro(B))  # P-value = 0.1541, so that H0 can NOT be REJECTED!

#Normality assumption is provided for both samples.

# 2.2 Variance Homogeneity Assumption (levene)
"""
# Defining the hypotheses for the variance homogeneity assumption.
# H0 : Variance homogeneity assumption is provided.
# H1 : Variance homogeneity assumption is NOT provided.
"""

hf.hypothesis_test(stats.levene(
    A, B))  # P-value = 0.1083, so that H0 can NOT be REJECTED!
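Assuming both checks pass, the next step in an A/B comparison like this is usually an independent two-sample t-test; a minimal sketch with invented samples A and B (the `hf.hypothesis_test` helper is not reproduced here):

import numpy as np
from scipy import stats
from scipy.stats import shapiro

rng = np.random.default_rng(7)
A = rng.normal(550, 40, size=40)   # Purchase values for group A (illustrative)
B = rng.normal(570, 40, size=40)   # Purchase values for group B (illustrative)

if shapiro(A)[1] > 0.05 and shapiro(B)[1] > 0.05 and stats.levene(A, B)[1] > 0.05:
    stat, p = stats.ttest_ind(A, B, equal_var=True)    # assumptions hold -> Student's t-test
else:
    stat, p = stats.mannwhitneyu(A, B)                 # non-parametric fallback
print(stat, p)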
def q1():
    # Retorne aqui o resultado da questão 1.
    p_value = sct.shapiro(sample_height)[1]
    return bool(p_value > 0.05)
Example #36
def q1():
    _, pvalue = sct.shapiro(get_sample(athletes, 'height', n=3000))
    return pvalue > 0.05
                interaction_len.append(max(times) if len(times) == 2 else sum(times[1:]))

        interaction_seq.append(interaction_count)
        appear_seq.append(appear_count)

    return [interaction_seq[i] / appear_seq[i] for i in range (len(interaction_seq))], \
        interaction_len

inter_per_class = {0:[], 1:[]}
propor_b, len_b = interact_length(inter_before, labels[:197])
propor_a, len_a = interact_length(inter_after, labels[197:])

# %%
# Normality | Non-parametric tests
import scipy.stats as stats
statistics_b, pvals = stats.shapiro(propor_b)
print (f'p-value (Shapiro Before): {pvals}, S: {statistics_b}')
print (f'df: {len(propor_b)}')

statistics_a, pvals = stats.shapiro(propor_a)
print (f'p-value (Shapiro After): {pvals}, S: {statistics_a}')
print (f'df: {len(propor_a)}')

statistics_u, pvals = stats.mannwhitneyu(propor_b, propor_a, alternative = 'less')
print ('p-value (Mann-Whitney U test): \t', pvals)
print (f'Before: {np.median(propor_b)}, After: {np.median(propor_a)}')
print (f'U: {statistics_u}')

# %%
df = pd.DataFrame(columns = ['Proportion', 'Treatment', 'Color'])
df['Treatment'] = ['Dataset 1'] * len(propor_b) + ['Dataset 2'] * len(propor_a)
         y_pred_L = gauss_to_pi(y_pred_gauss_mid_all, y_pred_gauss_dev_all, n_std_devs)

    # work out metrics
    y_U_cap = y_pred_U > y_val.reshape(-1)
    y_L_cap = y_pred_L < y_val.reshape(-1)
    y_all_cap = y_U_cap * y_L_cap
    PICP = np.sum(y_all_cap) / y_L_cap.shape[0]
    MPIW = np.mean(y_pred_U - y_pred_L)
    y_pred_mid = np.mean((y_pred_U, y_pred_L), axis=0)
    MSE = np.mean(np.square(Gen.scale_c * (y_pred_mid - y_val[:, 0])))
    RMSE = np.sqrt(MSE)
    CWC = np_QD_loss(y_val, y_pred_L, y_pred_U, alpha, soften, lambda_in)
    neg_log_like = gauss_neg_log_like(y_val, y_pred_gauss_mid,
                                      y_pred_gauss_dev, Gen.scale_c)
    residuals = y_pred_mid - y_val[:, 0]
    shapiro_W, shapiro_p = stats.shapiro(residuals[:])
    results_runs.append(
        (PICP, MPIW, CWC, RMSE, neg_log_like, shapiro_W, shapiro_p))

    # concatenate for graphs
    title = 'PICP=' + str(round(PICP,3))\
       + ', MPIW=' + str(round(MPIW,3)) \
       + ', qd_loss=' + str(round(CWC,3)) \
       + ', NLL=' + str(round(neg_log_like,3)) \
       + ', alpha=' + str(alpha) \
       + ', loss=' + NN.loss_type \
       + ', data=' + type_in + ',' \
       + '\nh_size=' + str(NN.h_size) \
       + ', bstraps=' + str(n_bootstraps) \
       + ', ensemb=' + str(n_ensemble) \
       + ', RMSE=' + str(round(RMSE,3)) \
Example #39
        n, then a must have length n/2.
    reta : bool, optional
        Whether or not to return the internally computed a values.  The
        default is False.
    
    Returns
    -------
    W : float
        The test statistic.
    p-value : float
        The p-value for the hypothesis test.
    a : array_like, optional
        If `reta` is True, then these are the internally computed "a"
        values that may be passed into this function on future calls.
"""
from scipy import stats
from matplotlib.finance import quotes_historical_yahoo
import numpy as np

ticker = 'IBM'
begdate = (2009, 1, 1)
enddate = (2013, 12, 31)
p = quotes_historical_yahoo(ticker,
                            begdate,
                            enddate,
                            asobject=True,
                            adjusted=True)
ret = (p.aclose[1:] - p.aclose[:-1]) / p.aclose[1:]
print 'ticker=', ticker, 'W-test, and P-value'
print stats.shapiro(ret)
Example #40
### Normality Tests
##### Histogram
In repository - Python-DataScience-CookBook/Exploratory Data Analysis.py
import seaborn as sns
sns.distplot(Df.Var.dropna())
##### Q-Q Plot
import numpy as np 
import pylab 
import scipy.stats as stats
stats.probplot(Df.Var, dist="norm", plot=pylab)
pylab.show()
##### Normal Test
k2, p = stats.normaltest(Energy.x) # k2 is the statistic value; p-value > 0.05 implies the data are normally distributed
##### Shapiro-Wilk Test - https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html
from scipy import stats
w,p = stats.shapiro(Df.Var)
##### Kolmogorov-Smirnov Test - goodness-of-fit test against a normal distribution
stats.kstest(Df.Var,'norm')
##### Anderson-Darling Test
stats.anderson(Df.Var, dist='norm')

### Correlation Tests
# H0: Two samples are independent
# H1: There is a dependency between the samples              
##### Pearson’s Correlation Coefficient
corr, p = pearsonr(Df.Var1, Df.Var2)       
##### Spearman’s Rank Correlation
corr, p = spearmanr(Df.Var1, Df.Var2)                    
##### Kendall’s Rank Correlation
corr, p = kendalltau(Df.Var1, Df.Var2)     
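A self-contained sketch tying the normality and correlation checks together on synthetic data (the column names `Var1`/`Var2` are invented):

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import pearsonr, spearmanr, kendalltau

rng = np.random.default_rng(8)
Df = pd.DataFrame({'Var1': rng.normal(size=100)})
Df['Var2'] = 0.7 * Df['Var1'] + rng.normal(scale=0.5, size=100)

print('Shapiro-Wilk Var1:', stats.shapiro(Df['Var1']))
print('Pearson :', pearsonr(Df['Var1'], Df['Var2']))
print('Spearman:', spearmanr(Df['Var1'], Df['Var2']))
print('Kendall :', kendalltau(Df['Var1'], Df['Var2']))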
               
df = df.dropna()
df = df.iloc[:, 2:].apply(
    lambda x: x.astype(str).str.replace(',', '.').astype(float))

# Examine the correlation between GDP per capita and phone usage rate
sns.regplot(x='GDP ($ per capita)', y='Phones (per 1000)', data=df)
plt.show()
'''
It looks somewhat like a linear relationship, but the variance changes with the value of the
variable, so the data do not look homoscedastic. We should also check whether the two variables
are approximately normally distributed. scipy.stats offers several normality tests, such as
normaltest(), shapiro() and kstest(rvs='norm'); here we use shapiro() to test whether GDP per
capita and phone usage rate in each country follow a normal distribution.
Null hypothesis: the sample comes from a normally distributed population.
Alternative hypothesis: the sample does not come from a normally distributed population.
'''
print(stats.shapiro(df['GDP ($ per capita)']))
# (0.8052586317062378, 3.5005310282387736e-14)
print(stats.shapiro(df['Phones (per 1000)']))
# (0.8678628206253052, 2.0484371143769664e-11)
# The result is a tuple of the W statistic and the p-value. Both p-values are extremely
# small (close to 0), so we can reject the null hypothesis.
# We conclude that neither GDP per capita nor phone usage rate is normally distributed.

# Compute the correlation coefficient with pandas
'''
Low correlation:      0 <= |r| <= 0.3
Moderate correlation: 0.3 <= |r| <= 0.8
High correlation:     0.8 <= |r| <= 1
'''
# Since neither GDP per capita nor phone usage rate is normally distributed,
# the Pearson correlation is not strictly appropriate here
print(df['GDP ($ per capita)'].corr(df['Phones (per 1000)'], method='pearson'))
# 0.88352010541116632
###########################1###########################

import pandas as pd
import scipy 
from scipy import stats

cutlets=pd.read_csv("C:\\Users\\jzsim\\Downloads\\Cutlets.csv")

# as there are 2 populations here compared with each other,
# check whether both follow a normal distribution or not,
# doing the same with the Shapiro test
#H0 : Following a normal distribution
#Ha : Not following a normal distribution
print(stats.shapiro(cutlets['Unit A'])) 
# p Value: 0.3199819028377533
#as P value is greater than 0.05
# P high Null Fly
print(stats.shapiro(cutlets['Unit B'])) 
# p Value: 0.3199819028377533
#as P value is greater than 0.05
# P high Null Fly

#AS BOTH P VALUES ARE GREATER THAN 0.05, P HIGH NULL FLY
#DATA IS FOLLOWING NORMAL DISTRIBUTION
 
#are external conditions same --> No


# Checking Variances are equal or not
#H0 : Variances are equal
Example #43
def resid_proc(reis, remove_zero_wt, grpfiles, pareto, groups_rei):
    print "aggregating statistics and plotting by observation group..."
    print "PEST iteration:"
    for cf in reis:
        print '{0} '.format(cf),
        infile = reis[cf]

        # open a pointer to the output file
        rei_summary_folder='residuals_summaries'
        if not os.path.exists(rei_summary_folder):
            os.makedirs(rei_summary_folder)
            
        ofp = open(os.path.join(rei_summary_folder,infile + '_residuals_summary.dat'),'w')
        ofp.write('Residuals Summary information for -> ' + infile + '\n')

        # read in the data
        alldat = np.genfromtxt(infile,names=True,skip_header=4,dtype=None)
                # if processing PEST pareto results, read in groups from another REI
        if pareto:
            try:
                rei_groups_df = pd.read_csv(groups_rei, delim_whitespace=True, skiprows=6, index_col='Name')
                #if np.isnan(np.max(rei_groups_df.ix[:,0])):
                    #rei_groups_df = rei_groups_df[rei_groups_df.columns[1:]]
                    # for observations that were read in, reassign the entry in 'Group' column to group from other REI
                for observation in alldat:
                    observation['Group'] = rei_groups_df.ix[observation['Name'], 'Group']

            except IOError:
                print "Cannot open {0}. Please provide an non-pareto REI file so that observations can be analyzed by group."
                quit()
        # find the unique list of groups by which plots and stats will be managed
        allgrps = np.unique(alldat['Group'])
        allgrps = [g for g in allgrps if 'regul' not in g]    
        
        # loop over the groups
        for cg in allgrps:
            # identify indices of the current group
            tmpinds = np.nonzero(alldat['Group']==cg)[0]
            if remove_zero_wt:
                inds = tmpinds[np.nonzero(alldat['Weight'][tmpinds] != 0)]
                
                # not sure what the "remove_zero_weight" option is for, but for groups
                # that are zero weighted, it results in an empty "inds" array, causing python to crash
                if len(inds)==0:
                    inds = tmpinds
            else:
                inds = tmpinds
            # pull out the measured values for the group
            cmeas = alldat['Measured'][inds]
            # pull out the modeled values for the group
            cmod =  alldat['Modelled'][inds]
            
            #get some values to limit plotting areas
            try:
                cmin = np.min([cmeas,cmod])
                cmax = np.max([cmeas,cmod])
            # if the last rei is from an iteration where PEST failed, will have unreasonable values (i.e. -1e300)
            # that will cause a TypeError here
            except TypeError:
                continue


        
            # now calculate statistics on the residuals
            
            # first grab the residuals
            cres = alldat['Residual'][inds]
            
            # next calculate the relevant statistics and write to the output file
            cmean = np.mean(cres)
            cstd  = np.std(cres)
            cvar  = np.var(cres)
            cmed  = np.median(cres)
            cmin  = np.min(cres)
            camin = np.min(np.abs(cres))
            cmax  = np.max(cres)
            camax = np.max(np.abs(cres))

            if len(grpfiles) > 1:
            # make a plot of modeled vs. measured
                plt.figure()
                plt.hold = True

                plt.plot(cmeas,cmod,'bx')
                plt.plot([cmin,cmax],[cmin,cmax],'r')
                plt.title('Observation Group "%s", PEST iteration %s' %(cg, cf))
                plt.xlabel('Measured')
                plt.ylabel('Modeled')
                # append the histograms into the proper PDF file
                grpfiles[cg][0].savefig()
                #plt.close()

            # finally plot the histogram and save it
            fig = plt.figure()
            ax = fig.add_subplot(111)
            n, bins, patches = ax.hist(cres, 50, facecolor='blue', alpha=0.75)
            ax.set_xlabel('Residual Value')
            ax.set_ylabel('Count')
            ax.set_title(cg + ' iteration ' + str(cf))
            ax.set_xlim([cmin,cmax])
            # append the histograms into the proper PDF file
            grpfiles[cg][-1].savefig()
            #plt.close()

            # perform the Shapiro-Wilk test for normality of the residuals
            if len(cres) > 2:
                W, p = shapiro(cres)
            else:
                p = -99999
                
            # write to the summary output file
            ofp.write(25*'#' + '\n')
            ofp.write('Summary Statistics for Residuals: -> group ' + cg +'\n')
            ofp.write('%14s : %f\n' %('mean',cmean))
            ofp.write('%14s : %f\n' %('median',cmed))
            ofp.write('%14s : %f\n' %('std deviation',cstd))
            ofp.write('%14s : %f\n' %('variance',cvar))
            ofp.write('%14s : %f\n' %('min',cmin))
            ofp.write('%14s : %f\n' %('max',cmax))
            ofp.write('%14s : %f\n' %('min (absolute)',camin))
            ofp.write('%14s : %f\n' %('max (absolute)',camax))
    
            # p > 0.05: fail to reject the Shapiro-Wilk null hypothesis of normality
            if p > 0.05:
                ofp.write('Residuals are normally distributed\n')
                ofp.write('p-value = %f' %(p))
            elif p < -99:
                ofp.write('Residuals normality not calculable: Too few residuals in group\n')
            else:
                ofp.write('Residuals are not normally distributed\n')
                ofp.write('p-value = %f' %(p))
    
            ofp.write(3*'\n')
        ofp.close()
    # close the PDF files
    for cg in grpfiles:
        for i in range(len(grpfiles[cg])):
            grpfiles[cg][i].close()
Example #44
# Overlay of the regression lines for Oecanthus exclamationis (red) and Oecanthus niveus (blue)
ax1 = sns.regplot(x="TempEx", y="ImpulsionEx", data=Crickets, color='r')
ax2 = sns.regplot(x="TempNiv", y="ImpulsionNiv", data=Crickets, color='b')

# The regression line for Oecanthus exclamationis lies above the line for Oecanthus niveus; this means Oecanthus exclamationis would have a higher pulse rate at any given temperature.

# The first null hypothesis of the ANCOVA is that the slopes of the regression lines are all equal, in other words that the regression lines are parallel to one another. We accept the null hypothesis that the regression lines are parallel and test the second null hypothesis: that the intercepts of the regression lines are all the same.

# The slopes are not significantly different (P = 0.25); the common slope is 3.60, which lies between the slopes of the separate lines (3.52 and 3.75). I did not manage to test this hypothesis for that part.

# ANCOVA makes the same assumptions as linear regression: normality and homoscedasticity of Y for each value of X, and independence. Let us at least check the normality assumption.

# In[202]:

# Shapiro test for Oecanthus exclamationis
stats.shapiro(model1.resid)

# W = 0.9727, p = 0.9105, so the residuals are normally distributed for Oecanthus exclamationis

# In[203]:

# Shapiro test for Oecanthus niveus
stats.shapiro(model2.resid)

# W = 0.9159, p = 0.1259, so the residuals are normally distributed for Oecanthus niveus

# Now let us run a Tukey test under the assumption that the slopes are all the same

# In[188]:

from statsmodels.stats.multicomp import pairwise_tukeyhsd
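The example stops at the import; a hedged sketch of what a pairwise_tukeyhsd call typically looks like (the values and group labels below are invented, not the cricket data):

import numpy as np
from statsmodels.stats.multicomp import pairwise_tukeyhsd

rng = np.random.default_rng(9)
values = np.concatenate([rng.normal(67, 3, 30), rng.normal(74, 3, 30)])
groups = ['exclamationis'] * 30 + ['niveus'] * 30

print(pairwise_tukeyhsd(endog=values, groups=groups, alpha=0.05))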
Example #45
def meta_process(tau):
    '''
    Main processing kernel
    '''
    print('Analyzing tau (ms): ', tau)

    import warnings
    warnings.filterwarnings('ignore')

    # Folder where you store the PLT positions (center of mass - COM) per DNS time steps
    data_location = which_bodies + '_tau_' + str(tau) + '/'

    numBodies = 0
    Bodies = []
    for log in sorted(glob.glob(data_location + '*_ComPos.log'), key=numericalSort):
        Bodies.append(numBodies)
        numBodies += 1

    # Build the data frames and fill them
    absolute_pos  = pd.DataFrame(columns=Bodies, dtype=np.float64)
    distFromWalls = pd.DataFrame(columns=Bodies, dtype=np.float64)
    MSD           = pd.DataFrame(columns=Bodies, dtype=np.float64)
    
    # Perform distributions checking in zones
    zones_vels          = []
    zones_distros       = []
    zones_MSD           = []
    zones_distFromWalls = []
    for z in range(zones_):
        zones_distros.append(np.array([], dtype=np.float64))
        zones_vels.append(np.array([], dtype=np.float64))
        zones_MSD.append(np.array([], dtype=np.float64))
        zones_distFromWalls.append(np.array([], dtype=np.float64))

    # Var that help us find the mean free path/ time (MFP/T) in comparison with the ground truth (gT, path from DNS)
    integrals_tau = []
    
    numBodies = 0
    for log in sorted(glob.glob(data_location + '*_ComPos.log'), key=numericalSort):

        df = pd.read_csv(log, delimiter=',', header=None, names=names_, usecols=usecols_, dtype={'t': np.float64, 'y': np.float64, 'z': np.float64})

        # Time in the original files interprets to how many DNS fluid time steps,
        # this is why we multiply here with DNS fluid time step to convert it into physical time in ms
        df = df.loc[df['t']*dt_f >= From_]
        df = df.loc[df['t']*dt_f <= To_]
        df = df.reset_index(drop=True)

        absolute_pos[numBodies] = df['y'].copy()

        if (do_what == 'MFP'):
            integrals_tau.append(np.trapz(df['y'], df['t']*dt_f))
        
        if ( (do_what == 'distros') or (do_what == 'MSD') or (do_what == 'distFromWalls') ):

            MSD[numBodies] =  pd.Series((df['y'] - df['y'].iloc[0]) * (df['y'] - df['y'].iloc[0]))

            distFromWalls[numBodies] = df['y'].apply(lambda y: (y - bottom_wall) if ( (y - bottom_wall) < (top_wall - y) ) else (top_wall - y))

            pos        = absolute_pos[numBodies].to_numpy()
            pos_rolled = np.roll(pos, 1)

            # velocity in um/ms
            vel = (pos - pos_rolled) / tau
            vel[0] = np.nan
            
            # Exclude erroneous jumps
            dp = np.absolute(pos-pos_rolled)
            inds = np.where(dp < ((top_wall - bottom_wall) - 5.0))
            pos = pos[inds]
            vel = vel[inds]
            
            inds = np.where((~np.isnan(vel)) & (~np.isinf(vel)))
            pos = pos[inds]
            vel = vel[inds]

            zones_tmp = np.linspace(bottom_CFL, top_CFL, zones_+1)
            for z in range(zones_):
                inds = np.where((pos >= zones_tmp[z]) & (pos < zones_tmp[z+1]))
                zones_distros[z] = np.append(zones_distros[z], vel[inds])
        
        numBodies += 1
    #######################################################################

    #######################################################################
    df_t = np.arange(From_, To_+tau, tau)
    #######################################################################

    #######################################################################
    # Compute MSD & distFromWalls per Zone
    if ( (do_what == 'distros') or (do_what == 'MSD') or (do_what == 'distFromWalls') ):

        zones_tmp = np.linspace(bottom_CFL, top_CFL, zones_+1)
        MSD_t_avg = []
        distFromWalls_t_avg = []
        for z in range(zones_):
            MSD_t_avg.append([])
            distFromWalls_t_avg.append([])

        for i, t_ in enumerate(df_t):
            for b_ in range(len(Bodies)):
                try:
                    pos = absolute_pos[b_].iloc[i]
                except:
                    continue

                for z in range(zones_):
                    if ( (pos >= zones_tmp[z]) and (pos < zones_tmp[z+1]) ):
                        MSD_t_avg[z].append(MSD[b_].iloc[i])
                        distFromWalls_t_avg[z].append(distFromWalls[b_].iloc[i])
            
            for z in range(zones_):
                # If no particles in the zone, then np.mean returns nan
                zones_MSD[z] = np.append(zones_MSD[z], np.mean(MSD_t_avg[z]))
                MSD_t_avg[z] = []

                zones_distFromWalls[z] = np.append(zones_distFromWalls[z], np.mean(distFromWalls_t_avg[z]))
                distFromWalls_t_avg[z] = []
        
        # Cleaning
        for z in range(zones_):
            if (np.where(np.isnan(zones_MSD[z]))[0].shape[0] != 0):
                zones_MSD[z]           = zones_MSD[z][:np.where(np.isnan(zones_MSD[z]))[0][0]]
            if (np.where(np.isnan(zones_distFromWalls[z]))[0].shape[0] != 0):
                zones_distFromWalls[z] = zones_distFromWalls[z][:np.where(np.isnan(zones_distFromWalls[z]))[0][0]]

        for z in range(zones_):
            
            from scipy import optimize
            # non_linear fitting
            def non_linear_(x, a, b):
                return a*np.power(x, b)
            # linear fitting
            def linear_(x, a, b):
                return a*x + b

            # MSD
            Y = zones_MSD[z]
            X = np.copy(df_t)[:Y.shape[0]]
            X -= From_

            best_vals_non_linear, _ = optimize.curve_fit(non_linear_, X, Y)
            #best_vals_linear    , _ = optimize.curve_fit(linear_    , X, Y)

            zones_MSD[z] = tuple(best_vals_non_linear)

            if (do_what == 'MSD'):
                # Dump data
                #np.savetxt(fName_ + '.csv', np.array(list(zip(X,Y))), delimiter=',')
                plt.plot(X,Y)
                plt.plot(X, non_linear_(X, *best_vals_non_linear), linestyle='--', label='non-linear (a*x^b), params as [a,b] : '+str(best_vals_non_linear))
                #plt.plot(X, linear_(X, *best_vals_linear)        , linestyle='--', label='linear (a*x + b), params as [a,b] : '+str(best_vals_linear))
                plt.legend()
                plt.show()


            # distFromWalls
            Y = zones_distFromWalls[z]
            X = np.copy(df_t)[:Y.shape[0]]
            X -= From_
            
            #best_vals_non_linear, _ = optimize.curve_fit(non_linear_, X, Y)
            best_vals_linear    , _ = optimize.curve_fit(linear_    , X, Y)
            
            zones_distFromWalls[z] = tuple(best_vals_linear)

            if (do_what == 'distFromWalls'):
                # Dump data
                #np.savetxt(fName_ + '.csv', np.array(list(zip(X,Y))), delimiter=',')
                plt.plot(X,Y)
                #plt.plot(X, non_linear_(X, *best_vals_non_linear), linestyle='--', label='non-linear (a*x^b), params as [a,b] : '+str(best_vals_non_linear))
                plt.plot(X, linear_(X, *best_vals_linear)        , linestyle='--', label='linear (a*x + b), params as [a,b] : '+str(best_vals_linear))
                plt.legend()
                plt.show()
    #######################################################################

    #######################################################################
    if (do_what == 'distros'):

        # significance level for p-values
        sign_lvl = 0.1

        # For the PLT random walk simulations
        distros_invECDF = []
        distros_tail    = []
        xmins           = []

        zones_tmp = np.linspace(bottom_CFL, top_CFL, zones_+1)
        for z in range(zones_):
            print("#######################################################################")
            print("Zone ", z)
            print("Limits: (", zones_tmp[z], ",", zones_tmp[z+1], ")")
            print('------------------------------------------------------------')

            data = np.absolute(zones_distros[z])
            print("Mean absolute velocity (current zone) [um/ms]                             : ", np.mean(data))
            print("Diffusion Coefficient (v^2*dt*0.5) [um^2/ms]                              : ", (np.mean(data)**2.)*tau*0.5)
            print("MSD non-linear fitting (a*x^b), params as (a,b) [um^2,ms]                 : ", zones_MSD[z])
            print("Avg Distance from Walls linear fitting (a*x + b), params as (a,b) [um,ms] : ", zones_distFromWalls[z])

            print('------------------------------------------------------------')
            print("Checking for sign.")
            data = zones_distros[z]

            sign_ = np.sign(data)
            positive_ = sign_[sign_ > 0.]
            negative_ = sign_[sign_ < 0.]

            print('Positive velocities (%) : ' , round(positive_.shape[0]/sign_.shape[0], 2) * 100.)
            print('Negative velocities (%) : ' , round(negative_.shape[0]/sign_.shape[0], 2) * 100.)

            print('------------------------------------------------------------')
            print("Checking for normality.")
            
            not_normal = 0
            normal     = 0

            # Shapiro-Wilk Test
            stat, p = stats.shapiro(data)
            if (p > sign_lvl):
                normal += 1
            else:
                not_normal += 1

            # D’Agostino’s K^2 Test
            stat, p = stats.normaltest(data)
            if (p > sign_lvl):
                normal += 1
            else:
                not_normal += 1

            # Anderson-Darling Test
            result = stats.anderson(data)
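            # result.significance_level lists the levels (15, 10, 5, 2.5, 1 %)
            # corresponding to each entry of result.critical_values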
            for i in range(len(result.critical_values)):
                if result.statistic < result.critical_values[i]:
                    normal += 1
                else:
                    not_normal += 1

            kurt = stats.kurtosis(data)
            print('kurtosis of dataset (whole range, i.e., body & tail) : ', kurt)
            print('Number of successful normality tests                 : ', normal)
            print('Number of failed normality tests                     : ', not_normal)

            print("End of Checking for normality.")

            print('------------------------------------------------------------')
            print("Analyze the tail of the distribution.")
            
            data = np.absolute(zones_distros[z])

            from statsmodels.distributions.empirical_distribution import ECDF, monotone_fn_inverter
            data.sort() # in-place sorting
            ecdf = ECDF(data)
            inv_ecdf = monotone_fn_inverter(ecdf, data)
            distros_invECDF.append({'inv_ecdf':inv_ecdf, 'lb':ecdf(np.min(data)), 'ub':ecdf(np.max(data))})
            
            #######################################################################
            tail_P = 0.90 # no need to search the whole domain for the lower bound (x_min). Search from the 90th percentile and above.
            print("Number of samples to do statistics (whole range, i.e., body & tail) : ", data.shape[0])
            print("Number of samples to do statistics (tail-only)                      : ", data[data >= inv_ecdf(tail_P)].shape[0])
            #######################################################################

            print('------------------------------------------------------------')
            # https://en.wikipedia.org/wiki/Heavy-tailed_distribution#Common_heavy-tailed_distributions
            # We focus on fat-tails and more specifically on power laws (see paper for more)
            # heavy-tails term: kept it for legacy reasons
            wikipedia_heavy_tailed_distros = [
                'halfcauchy',
                'burr12', 'burr',
                'pareto',
                'lognorm',
                'weibull_min',
                'fisk',
                'invweibull',
                'levy',
                'invgauss' # see Klaus_2011 (Statistical Analyses Support Power Law Distributions Found in Neuronal Avalanches)
            ]

            handpicked_distros = wikipedia_heavy_tailed_distros + ['expon', 'halfnorm']

            for dist_name in handpicked_distros:
                print(dist_name)
                distro = getattr(stats, dist_name)

                '''
                if (distro.numargs >= 2):
                    print('Skip distro.')
                    print('Avoid overfitting from distros with multiple parameters (numargs >= 2).')
                    print('------------------------------------------------------------')
                    continue
                '''

                # distro.a / distro.b are the support bounds of the scipy distribution;
                # keep only distributions supported on [0, +inf)
                if ( (distro.a < 0.) or (distro.b != np.inf) ):
                    print('Skip distro.')
                    print('Bounds not appropriate.')
                    print('------------------------------------------------------------')
                    continue

                #######################################################################
                # Optimal fitting
                # Computationally expensive part!
                if (dist_name != 'halfnorm'):
                    xmin_optimal = find_xminOpt_distro(data[data >= inv_ecdf(tail_P)], dist_name)
                else:
                    xmin_optimal = 0.
                #######################################################################

                #######################################################################
                # Relaxed fitting based on optimal one
                # When ecdf(xmin_opt) > 95%, it's a good idea to try a relaxed version
                # at ecdf(xmin_opt) ~ 90%
                if (dist_name != 'halfnorm'):
                    # round down
                    tail_i = 0.04
                    xmin_relaxed_lb = inv_ecdf( (int(ecdf(xmin_optimal)*10.) / 10.) - tail_i/2. )
                    xmin_relaxed_ub = inv_ecdf( (int(ecdf(xmin_optimal)*10.) / 10.) + tail_i/2. )
                    # More educated choice of xmin_relaxed
                    xmin_relaxed = find_xminOpt_distro(data[data >= xmin_relaxed_lb], dist_name, xmin_relaxed_ub)
                else:
                    xmin_relaxed = 0.
                #######################################################################

                data_optimal = data[data >= xmin_optimal]
                params_optimal = distro.fit(data_optimal)

                data_relaxed = data[data >= xmin_relaxed]
                params_relaxed = distro.fit(data_relaxed)

                #*** KS-test
                p_val_optimal = stats.kstest(data_optimal, dist_name, params_optimal)[1]
                p_val_relaxed = stats.kstest(data_relaxed, dist_name, params_relaxed)[1]
                #***

                strongly_rejected_opt = False
                negative_d = 'None'
                negative_p = 1.
                for dist_name_ in handpicked_distros:
                    if (dist_name_ == dist_name):
                        continue
                    # Check dist_name vs dist_name_
                    # Which model is better fit
                    LLR, p = LLR_test(data_optimal, dist_name, dist_name_)
                    if ( (LLR < 0.) and (p < negative_p) ):
                        negative_d = dist_name_
                        negative_p = p

                # significance lvl as in Klaus_2011 (Statistical Analyses Support Power Law Distributions Found in Neuronal Avalanches)
                if ( negative_p < 0.01 ):
                    strongly_rejected_opt = True

                print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.')
                print('Optimal fitting                             ')
                print('Number of samples xmin_optimal            : ', data_optimal.shape[0])
                print('params_optimal                            : ', params_optimal)
                print('xmin_optimal                              : ', xmin_optimal)
                print('ecdf(xmin_optimal)                        : ', round(ecdf(xmin_optimal)*100, 2), ' (%)')
                print('(p-val) kstest - tail only - xmin_optimal : ', round(p_val_optimal, 2))
                print(' . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .')
                print('strongly_rejected                         : ', 'True' if (strongly_rejected_opt) else 'False')
                print('As good as possible alternative (dist,p)  : ', (negative_d, round(negative_p,5)))

                print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.')
                print('Relaxed fitting                             ')
                print('Number of samples xmin_relaxed            : ', data_relaxed.shape[0])
                print('params_relaxed                            : ', params_relaxed)
                print('xmin_relaxed                              : ', xmin_relaxed)
                print('ecdf(xmin_relaxed)                        : ', round(ecdf(xmin_relaxed)*100, 2), ' (%)')
                print('(p-val) kstest - tail only - xmin_relaxed : ', round(p_val_relaxed, 2))

                print(' . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .')
                relaxed_accept = 0
                repeat_ = 2500 # See Clauset_2009 (Power-Law Distributions in Empirical Data)
                for _ in range(repeat_):
                    synthetic_data = inv_ecdf(np.random.uniform(ecdf(np.min(data)), ecdf(np.max(data)), size=data.shape[0]))
                    toCompare_with = inv_ecdf(np.random.uniform(ecdf(np.min(data)), ecdf(np.max(data)), size=data.shape[0]))

                    # 1. optimal model: simulates the observed data with the ecdf up to xmin_optimal and then with the selected distro
                    # 2. relaxed model: simulates the observed data with the ecdf up to xmin_relaxed and then with the selected distro
                    # The reference model is the optimal one.
                    
                    optimal_model = np.copy(synthetic_data)
                    inds = np.where(optimal_model >= xmin_optimal)
                    optimal_model[inds] = distro.rvs(*params_optimal, size=inds[0].shape[0])
                    optimal_model = optimal_model[ (~np.isnan(optimal_model)) & (~np.isinf(optimal_model)) ]
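                    # Discard unphysically large per-step displacements (same jump threshold as above, expressed in um/ms)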
                    optimal_model = optimal_model[ optimal_model < (((top_wall - bottom_wall) - 5.0) / tau) ]
                    D_opt = astats.kuiper_two(toCompare_with, optimal_model)[0]

                    relaxed_model = np.copy(synthetic_data)
                    inds = np.where(relaxed_model >= xmin_relaxed)
                    relaxed_model[inds] = distro.rvs(*params_relaxed, size=inds[0].shape[0])
                    relaxed_model = relaxed_model[ (~np.isnan(relaxed_model)) & (~np.isinf(relaxed_model)) ]
                    relaxed_model = relaxed_model[ relaxed_model < (((top_wall - bottom_wall) - 5.0) / tau) ]
                    D_rel = astats.kuiper_two(toCompare_with, relaxed_model)[0]

                    if (D_rel <= D_opt):
                        relaxed_accept += 1

                p_val_relaxed = round(relaxed_accept/repeat_, 2)
                print('p-val of relaxed model                    : ', p_val_relaxed)

                print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.')
                print('One-Zone Simulation for optimal model.')
                # int(4808*0.82): 4808 number of activated PLTs per ul (see Chopard_2017 - A physical description of the adhesion and aggregation of platelets). We deal with 0.82ul -> 4808*0.82
                # tau is the time step of the random walks in ms
                # 820um is the height of Impact-R PLT function analyser (and thus the *0.82)
                PLTs_ = PLTs(int(4808*0.82), tau, 820.0, [{'inv_ecdf':inv_ecdf, 'lb':ecdf(np.min(data)), 'ub':ecdf(np.max(data))}], [{'distro':distro, 'params':params_optimal}], [xmin_optimal])
                try:
                    PLTs_.advance(int(20000/tau))
                    depositedPLTs_opt = int(PLTs_.depositedPLTs()/0.82)
                    MSD_fitting_prms, distFromWalls_prms = PLTs_.meta_quantities()
                except:
                    depositedPLTs_opt = 0
                    MSD_fitting_prms, distFromWalls_prms  = (), ()
                print('deposited PLTs (per uL)                   : ', depositedPLTs_opt)
                print('MSD non-linear fitting [um^2,ms]          : ', MSD_fitting_prms)
                print("Avg Dist Walls linear fitting [um,ms]     : ", distFromWalls_prms)
                
                print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.')
                print('One-Zone Simulation for relaxed model.')
                PLTs_ = PLTs(int(4808*0.82), tau, 820.0, [{'inv_ecdf':inv_ecdf, 'lb':ecdf(np.min(data)), 'ub':ecdf(np.max(data))}], [{'distro':distro, 'params':params_relaxed}], [xmin_relaxed])
                try:
                    PLTs_.advance(int(20000/tau))
                    depositedPLTs_rel = int(PLTs_.depositedPLTs()/0.82)
                    MSD_fitting_prms, distFromWalls_prms = PLTs_.meta_quantities()
                except:
                    depositedPLTs_rel = 0
                    MSD_fitting_prms, distFromWalls_prms  = (), ()
                print('deposited PLTs (per uL)                   : ', depositedPLTs_rel)
                print('MSD non-linear fitting [um^2,ms]          : ', MSD_fitting_prms)
                print("Avg Dist Walls linear fitting [um,ms]     : ", distFromWalls_prms)

                print('------------------------------------------------------------')
  
            print("#######################################################################")
    #######################################################################

    #######################################################################
    if (do_what == 'MFP'):
        avg_ = 0.
        for PLT in Bodies:
            avg_ += (abs(integrals_gT[PLT] - integrals_tau[PLT]) / abs(integrals_gT[PLT])) * 100.
        avg_ /= numBodies
        ground_truth_diff.append(avg_)
Example #46
0
st.probplot(mdf.resid, plot=ax)
plt.show()

fig = plt.figure(figsize=(16, 9))
ax = sns.distplot(mdf.resid,
                  hist=False,
                  kde_kws={
                      "shade": True,
                      "lw": 1
                  },
                  fit=st.norm)
ax.set_xlabel("Residuals")
plt.show()

labels = ["Statistic", "p-value"]
norm_res = st.shapiro(mdf.resid)
for key, val in dict(zip(labels, norm_res)).items():
    print(key, val)

fig = plt.figure(figsize=(16, 9))
ax = sns.scatterplot(y=mdf.resid, x=mdf.fittedvalues)
ax.set_xlabel("Fitted Values")
ax.set_ylabel("Residuals")
plt.show()

# White's test for heteroscedasticity: regresses the squared residuals on the model's exogenous variables
het_white_res = het_white(mdf.resid, mdf.model.exog)
labels = ["LM Statistic", "LM-Test p-value", "F-Statistic", "F-Test p-value"]
for key, val in dict(zip(labels, het_white_res)).items():
    print(key, val)

fig = plt.figure(figsize=(16, 9))
Example #47
0
def frequency_increment_test(time, values, clip=True):
    Y = frequency_increment_values(time, values, clip=clip)
    T, Tp = stats.ttest_1samp(Y, 0)
    W, Wp = stats.shapiro(Y)
    return {"T": T, "Tp": Tp, "W": W, "Wp": Wp}
for i, test_region in enumerate(test_regions):

    print(test_region)
    DATA_region = DATA_TS[(DATA_TS['Region'] == test_region)
                          & (DATA_TS['Year'] < 2011) &
                          (DATA_TS['Year'] > 1970)]

    climateData = np.array(DATA_region['Norm_ImpFix_2y_offset'])
    auto_corr = test_autocorrelation(climateData)

    if normClim is True:

        climateData = climateData / np.nanmax(climateData)

    t, shap_log = shapiro(np.log(climateData))

    t, shap_norm = shapiro(climateData)

    best_model, y, x, maxi, pearson_corr, best_loo, loos, combs = find_best_model(
        climateData, telecon)

    comb_df = pd.DataFrame(combs)
    comb_df = comb_df.T

    loo_df = pd.DataFrame(loos)
    loo_df = loo_df.T
    loo_df.columns = ['log', 'identity', 'inverse-power']
    loo_df['combination'] = comb_df.iloc[:, 0]
    # store LooCV out-of-sample-errors
    loo_df.to_csv(
# Analysis for Cluster 1:
for i in cluster1:
    #Maximum flow algorithm:
    flow_value,flow_dict = nx.maximum_flow(G, 0, i, capacity='weight')
    c1_values.append(flow_value)

    df=pd.DataFrame({'Cluster':[1],
                    'Flow_Value':flow_value})

    all_data=all_data.append(df)

mean=np.mean(c1_values)
std_dev=np.std(c1_values)

normality_test=stats.shapiro(c1_values)

print("Mean for Cluster 1:",mean)
print("Standard deviation for Cluster 1:",std_dev)
print("Normality test for Cluster 1:",normality_test,"\n")

#Histogram for cluster 1:
hist, bin_edges=np.histogram(c1_values,density=True)
first_edge, last_edge = np.min(c1_values),np.max(c1_values)

n_equal_bins = 15
bin_edges = np.linspace(start=first_edge, stop=last_edge,num=n_equal_bins + 1, endpoint=True)

plt.hist(c1_values,bins=bin_edges,rwidth=0.75)
plt.xlabel('Flow values')
plt.ylabel('Frequency')
Example #50
0
#
# The simplest transformation is Standard Scaling (or Z-score normalization):
#
# $$ \large z= \frac{x-\mu}{\sigma} $$
#
# Note that Standard Scaling does not make the distribution normal in the strict sense.

# In[ ]:

from sklearn.preprocessing import StandardScaler
from scipy.stats import beta
from scipy.stats import shapiro
import numpy as np

data = beta(1, 10).rvs(1000).reshape(-1, 1)
shapiro(data)

# In[ ]:

# Value of the statistic, p-value
shapiro(StandardScaler().fit_transform(data))

# With such p-value we'd have to reject the null hypothesis of normality of the data

# But, to some extent, it protects against outliers:

# In[ ]:

data = np.array([1, 1, 0, -1, 2, 1, 2, 3, -2, 4,
                 100]).reshape(-1, 1).astype(np.float64)
StandardScaler().fit_transform(data)
        # print(data.head())

        lm = ols(formula='percentual_k_unordered ~ algoritmo', data=data).fit()
        anova = sm.stats.anova_lm(lm, typ=2)  # Type 2 ANOVA DataFrame

        tit = ' ANOVA para Probabilidade = %s e Tamanho = %s' % (prob, tam)
        hr = '=' * 60  #len(tit)
        anov = anova.head(10)

        s = '%s\n%s\n%s\n%s\n\n' % (hr, tit, hr, anov)
        arq_destino.write(s)
        print(s)

        # write the normality test (Shapiro-Wilk) results
        s = '    * TESTE DE NORMALIDADE (SHAPIRO-WILK):\n'
        s += '      %s\n' % ('-' * (len(s) + 6))
        for alg in udata.ALGORTIMOS:
            d = data[data['algoritmo'] == alg]['percentual_k_unordered']
            W, p_value = stats.shapiro(d)
            s += '        - %s: W = %0.6f / p_value = %.6f \n' % (alg.ljust(9),
                                                                  W, p_value)
        s += '\n'
        arq_destino.write(s)
        print(s)

# close the output file
arq_destino.close()

# pr_f = anova['PR(>F)'].values[0]
# print( '%s  /  %.55f' % (pr_f, pr_f) )
Example #52
0
def runShapiroTest(data, alpha):
    stats, pValue = shapiro(data)
    print('Statistics: {} | pValue: {} | Is Parametric: {}'.format(stats, pValue, pValue > alpha))

    return pValue > alpha
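
# Hedged usage sketch (synthetic data, not from the original snippet); assumes the
# shapiro import used by runShapiroTest is already in scope:
import numpy as np
runShapiroTest(np.random.normal(loc=0.0, scale=1.0, size=200), alpha=0.05)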
print(data.Age.describe())

# Calculates the z score of each value
print(st.zscore([0.45, 23, 25, 28, 33, 60, 80]))

# convert a z score to a cumulative probability, and a probability back to a z score
print(st.norm.cdf(3.46))
print(st.norm.ppf(.95))
print(st.norm.cdf(1.64))

# Normality test
sm.qqplot(data.Age, line='45')
pylab.show()

data_no_missing = data.dropna()
stat, p = st.shapiro(data_no_missing.Age)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')

# Embarked crosstab
print(pd.crosstab(index=data["Embarked"], columns="Count"))
print(data.Embarked.isnull().sum())

# Embarked barchart
sns.countplot(x="Embarked", data=data)
plt.show()
Example #54
0
for A, B in zip(sampleA, sampleB):
    fdata.write('\n' + A + ' is sample A' + '\n' + B + ' is sample B' + '\n')
    fdata.write('RVL comparaison' + '\n')
    fdata.write(
        str(np.mean(RVLdata_[A])) + '+-' + str(np.std(RVLdata_[A])) + '\n')
    fdata.write(
        str(np.mean(RVLdata_[B])) + '+-' + str(np.std(RVLdata_[B])) + '\n')
    print 'means'

    ###T-test
    pops = []
    pops.append(RVLdata_[A])
    pops.append(RVLdata_[B])
    ###Shapiro's test for normality for sample A
    w, pnormA = stats.shapiro(np.array(RVLdata_[A]))
    if pnormA > 0.05:
        normA = True
        print 'A sample IS normally distributed'
        fdata.write('A sample IS normally distributed' + '\n')
    else:
        normA = False
        print 'A sample is NOT normally distributed'
        fdata.write('A sample is NOT normally distributed' + '\n')
    ###Shapiro's test for normality for sample B
    w, pnormB = stats.shapiro(np.array(RVLdata_[B]))
    if pnormB > 0.05:
        normB = True
        print 'B sample IS normally distributed'
        fdata.write('B sample IS normally distributed' + '\n')
    else:
import numpy as np
from scipy import stats
from scipy.stats import shapiro
import matplotlib.pyplot as plt
from pandas.core.frame import DataFrame

data = np.genfromtxt("data.csv", delimiter=",")
listp = []
listw = []
listf = []
listr = [
    "Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys", "Leu",
    "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val", "Trp", "Tyr"
]
for i in range(0, 20):
    aa = data[1:, i]
    fig = plt.figure()
    res = stats.probplot(aa, plot=plt)
    plt.show()
    w, p = shapiro(aa)
    listw.append(w)
    listp.append(p)
    if p >= 0.05:
        print("normal distribution")
        listf.append("normal distribution")
    else:
        print("abnormal distribution")
        listf.append("abnormal distribution")
    print("w:%f" % w, "p.value:%f" % p)
dic = {"residue index": listr, "W": listw, "P.value": listp, "F": listf}
output = DataFrame(dic)
print(output)
Example #56
0
 # for col in train_num.columns:
 #       train_num[col].plot.hist(title = col)
 #       s = train_num.describe()[col].to_string() + \
 #           "\nMissing Values: " + str(train_num.isnull().sum()[col]) + \
 #           "\nMissing Values %: " + str(round(train_num.isnull().sum()[col]/len(train_num),4))
 #       plt.figtext(1, 0.5, s)
 #       plt.show()
 
 droped_ttest_cols = []         
 # * Assess normality / skewness
 target = train_temp[label]
 t_sel = [0] * len(train_num.columns) # flags which variables may help predict the target
 t_ctr = 0 # counter
 for col in train_num.columns:
     # Shapiro-Wilk test
     stat, p = shapiro(train_num[col])
     #print('Statistics={:.3f}, p={:.3f}'.format(stat, p))
     
     if p > 0.05: # fail to reject H0 that these data follow an approximately Gaussian distribution
         # t-test
         # print(col)
         # split the data according to credit acceptance
         t0 = train_num[col][target == 0]
         t1 = train_num[col][target == 1]
         stat, p = ttest_ind(t0, t1, nan_policy = "omit", equal_var = False)
         # print('T-statistic={:.3f}, p={:.3f}'.format(stat, p))
         
         if p < 0.05: # reject H0 that the means of t0 and t1 do not differ significantly
             t_sel[t_ctr] = 1
         else:
             droped_ttest_cols.append(col)
Example #57
0
 plt.figure()
 plt.plot(x, gNorm, 'r-', label='Norm PDF')
 plt.plot(x, sNorm, 'g-', label='Skewed Norm PDF')
 #plt.plot(x, chiSq,'m-', label='Chi-Square PDF')
 plt.bar(bin_edges[:-1],
         hist,
         width=(max(bin_edges) - min(bin_edges)) / iters)
 plt.title('NRMSE distribution')
 plt.xlim(np.min(x), np.max(x))
 plt.legend()
 # Print Normality Test results (Variable and p-value)
 #   Kolmogorov-Smirnov
 print(stats.kstest(y, 'norm', args=(mean, var)))
 print(stats.kstest(y, 'skewnorm', args=(smean, svar, sk)))
 #   Shapiro-Wilk
 print(stats.shapiro(y))
 #   Chi-Square
 #print(stats.chisquare(hist, gNorm))
 #print(stats.chisquare(hist, sNorm))
 # Q-Q plots
 #f, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2,2)
 f, (ax1, ax2) = plt.subplots(1, 2)
 plt.title("Q-Q Plots")
 res = stats.probplot(y, dist=stats.norm(mean, var), plot=ax1)
 ax1.set_title("Normality Test (Non-Skewed)")
 resS = stats.probplot(y, dist=stats.skewnorm(smean, svar, sk), plot=ax2)
 ax2.set_title("Normality Test (Skewed)")
 #resX2 = stats.probplot(y, dist=stats.chi2(4), plot=ax3)
 #ax3.set_title("Chi-Square Test (k=4)")
 #resX2 = stats.probplot(y, dist=stats.chi2(10), plot=ax4)
 #ax4.set_title("Chi-Square Test (k=10)")
Example #58
0
#bins = 10
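# A hedged sketch (not in the original snippet) of how `bins` may have been derived
# from Sturges' rule for the assumed CV accuracy array `scores`:
# bins = int(np.ceil(1 + np.log2(len(scores))))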

print("Bins No (Sturge’s Rule): ", bins)
plt.hist(scores, bins=bins)
plt.ylabel('Probability')
plt.xlabel("Accuracy")
plt.title("Accuracy of " + model_label + " with CV=" + str(cv))
plt.show()
qqplot(scores, line='s')
plt.title("Accuracy of " + model_label + " with CV=" + str(cv))
plt.show()

alpha = 0.05

print("Shapiro-Wilk Test result:")
stat, p = shapiro(scores)
print('     Statistics=%.3f, p=%.3f, alpha=%.3f' % (stat, p, alpha))
if p > alpha:
    print('     Sample looks Gaussian (fail to reject H0)')
else:
    print('     Sample does not look Gaussian (reject H0)')

print("D’Agostino’s K^2 Test result:")
stat, p = normaltest(scores)
print('     Statistics=%.3f, p=%.3f, alpha=%.3f' % (stat, p, alpha))
if p > alpha:
    print('     Sample looks Gaussian (fail to reject H0)')
else:
    print('     Sample does not look Gaussian (reject H0)')

print("Anderson-Darling Test result:")
# Example of the Shapiro-Wilk Normality Test
from scipy.stats import shapiro
data = [
    13.83, 14.47, 14.03, 15.46, 15.61, 13.6, 15.26, 14.13, 14.41, 13.7, 14.23,
    14.49, 14.0, 13.73, 13.92, 13.82, 13.81, 13.88, 13.71, 14.08, 14.1, 13.38,
    13.69, 13.56, 13.57, 13.63, 13.59, 13.64, 13.97, 13.29, 13.72
]
stat, p = shapiro(data)
print('stat=%.3f, p=%.3f' % (stat, p))
f = open("NormalTestShapiro-Wilk.txt", "a")

if p > 0.05:
    print('Probably Gaussian')
    f.write('Probably Gaussian\n')
else:
    print('Probably not Gaussian')
    f.write('Probably not Gaussian\n')

f.write("Stat: {0}s and  p: {1}s\n".format(stat, p))

f.close()
Example #60
0
 def get_statistic_and_pvalue(self, y):
     return shapiro(y)