Example #1
    def test_basic(self):
        x1 = [0.11,7.87,4.61,10.14,7.95,3.14,0.46,
              4.43,0.21,4.75,0.71,1.52,3.24,
              0.93,0.42,4.97,9.53,4.55,0.47,6.66]
        w,pw = stats.shapiro(x1)
        assert_almost_equal(w,0.90047299861907959,6)
        assert_almost_equal(pw,0.042089745402336121,6)
        x2 = [1.36,1.14,2.92,2.55,1.46,1.06,5.27,-1.11,
              3.48,1.10,0.88,-0.51,1.46,0.52,6.20,1.69,
              0.08,3.67,2.81,3.49]
        w,pw = stats.shapiro(x2)
        assert_almost_equal(w,0.9590270,6)
        assert_almost_equal(pw,0.52460,3)

        # Verified against R
        np.random.seed(12345678)
        x3 = stats.norm.rvs(loc=5, scale=3, size=100)
        w, pw = stats.shapiro(x3)
        assert_almost_equal(w, 0.9772805571556091, decimal=6)
        assert_almost_equal(pw, 0.08144091814756393, decimal=3)

        # Extracted from original paper
        x4 = [0.139, 0.157, 0.175, 0.256, 0.344, 0.413, 0.503, 0.577, 0.614,
              0.655, 0.954, 1.392, 1.557, 1.648, 1.690, 1.994, 2.174, 2.206,
              3.245, 3.510, 3.571, 4.354, 4.980, 6.084, 8.351]
        W_expected = 0.83467
        p_expected = 0.000914
        w, pw = stats.shapiro(x4)
        assert_almost_equal(w, W_expected, decimal=4)
        assert_almost_equal(pw, p_expected, decimal=5)
Example #2
    def mleWithSgd(self,x_array,y_array):
        a,b,theta,loss = random.random(),random.random(),random.random(),2**31
        optimal_a,optimal_b,optimal_theta = 0,0,0
        for i in xrange(len(x_array)):
            x,y = x_array[i],y_array[i]
            a = a - self.learning_rate * (1/(theta*x)*(a*x+b-y))
            b = b - self.learning_rate * (1/(theta*(x**2))*(a*x+b-y))
            theta = theta - self.learning_rate * (-((y-a*x-b)**2)/((x**2)*(theta**3)) - theta)
            curr_loss = self.mleLossFunc(x_array,y_array,a,b,theta)
            if curr_loss<=loss:
                self.learning_rate*=1.05
                optimal_a,optimal_b,optimal_theta = a,b,theta
            else:
                self.learning_rate*=0.5
                a,b,theta = optimal_a,optimal_b,optimal_theta
            loss = curr_loss

            print curr_loss

        print "Output:"
        #print a,b,theta,self.learning_rate
        print optimal_a,optimal_b,optimal_theta,len(x_array)
        
        # Evaluation
        # Perform the Shapiro-Wilk test, which tests the null hypothesis
        # that the data were drawn from a normal distribution.
        normalized_array = np.array([(y_array[i]-optimal_a*x_array[i]-optimal_b)/(optimal_theta*x_array[i]) for i in range(len(x_array))])
        print stats.shapiro(normalized_array)
        #plt.plot(list(x_array),list(y_array),'ro')
        #plt.show()
        return optimal_a,optimal_b,optimal_theta
Example #3
    def test(self, arr1, arr2):
        p_value = 0
        if self.statistics == "auto":
            # check equality of variances with Levene's test; if they are equal
            if stats.levene(arr1, arr2)[1] > 0.05:
                # Shapiro-Wilk for normality of each sample; if both look normal
                if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                    # p = Student
                    p_value = stats.ttest_ind(arr1, arr2)[1]
                else:
                    # p = Mann
                    if equal(arr1, arr2):
                        p_value = 1
                    else:
                        p_value = stats.mannwhitneyu(arr1, arr2)[1]
            else:
                p_value = stats.ttest_ind(arr1, arr2, False)[1]

        elif self.statistics == "student":
            p_value = stats.ttest_ind(arr1, arr2)[1]
        elif self.statistics == "welch":
            p_value = stats.ttest_ind(arr1, arr2, False)[1]
        elif self.statistics == "mann":
            if equal(arr1, arr2):
                p_value = 1
            else:
                p_value = stats.mannwhitneyu(arr1, arr2)[1]
        return p_value
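For reference, a minimal self-contained sketch of the same auto-selection flow using scipy directly (the random data and the array-equality shortcut standing in for the `equal` helper are illustrative, not part of the original class):

import numpy as np
from scipy import stats

def auto_p_value(arr1, arr2, alpha=0.05):
    # Levene's test for equality of variances; unequal variances -> Welch's t-test
    if stats.levene(arr1, arr2)[1] <= alpha:
        return stats.ttest_ind(arr1, arr2, equal_var=False)[1]
    # Shapiro-Wilk on each sample; both normal -> Student's t-test
    if stats.shapiro(arr1)[1] > alpha and stats.shapiro(arr2)[1] > alpha:
        return stats.ttest_ind(arr1, arr2)[1]
    # otherwise the Mann-Whitney U test (identical samples short-circuit to p = 1)
    if np.array_equal(arr1, arr2):
        return 1.0
    return stats.mannwhitneyu(arr1, arr2)[1]

rng = np.random.default_rng(0)
print(auto_p_value(rng.normal(0, 1, 50), rng.normal(0.5, 1, 50)))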
Example #4
    def mleWithSgdNonlinear(self,x_array,y_array):
        a,b,theta0,theta1,theta2,loss = random.random(),random.random(),random.random(),random.random(),random.random(),2**31
        optimal_a,optimal_b,optimal_t0,optimal_t1,optimal_t2 = 0,0,0,0,0
        for i in xrange(len(x_array)):
            x,y = x_array[i],y_array[i]
            a = a - self.learning_rate * (x*(a*x+b-y)/(theta0*x**2+theta1*x+theta2) + self.reg_cof*a)
            b = b - self.learning_rate * ((a*x+b-y)/(theta0*x**2+theta1*x+theta2) + self.reg_cof*b)
            theta0 = theta0 - self.learning_rate * ((- (x**2) * ((y-a*x-b)**2)/((theta0*(x**2)+theta1*x+theta2)**3) + x**2/(theta0*(x**2)+theta1*x+theta2) ) + self.reg_cof*theta0)
            theta1 = theta1 - self.learning_rate * ((-x * ((y-a*x-b)**2)/((theta0*(x**2)+theta1*x+theta2)**3) + x/(theta0*(x**2)+theta1*x+theta2) ) + self.reg_cof*theta1) 
            theta2 = theta2 - self.learning_rate * (-((y-a*x-b)**2)/((theta0*(x**2)+theta1*x+theta2)**3) + 1/(theta0*(x**2)+theta1*x+theta2) + self.reg_cof*theta2)
            curr_loss = self.mleLossNonlinear(x_array,y_array,a,b,theta0,theta1,theta2)
            
            if curr_loss<loss:
                self.learning_rate*=1.05
                optimal_a,optimal_b,optimal_t0,optimal_t1,optimal_t2 = a,b,theta0,theta1,theta2
            else:
                self.learning_rate*=0.5
                a,b,theta0,theta1,theta2 = optimal_a,optimal_b,optimal_t0,optimal_t1,optimal_t2
            
            loss = curr_loss
            print curr_loss
        print "Output:"
        #print a,b,theta,self.learning_rate
        print optimal_a,optimal_b,optimal_t0,optimal_t1,optimal_t2
        
        # Evaluation
        # Perform the Shapiro-Wilk test, which tests the null hypothesis
        # that the data were drawn from a normal distribution.
        normalized_array = np.array([(y_array[i]-optimal_a*x_array[i]-optimal_b)/(optimal_t0*(x_array[i]**2)+optimal_t1*x_array[i]+optimal_t2) for i in range(len(x_array))])

        print stats.shapiro(normalized_array)
        plt.plot(normalized_array,[1]*len(normalized_array),'ro')
        plt.show()
  def main():
    if len(sys.argv) < 4:
      return 1
    _, list_a, list_b, significance = sys.argv[:4]
    list_a = json.loads(list_a)
    list_b = json.loads(list_b)
    significance = float(significance)

    shapiro_p_value = stats.shapiro(list_a)[1], stats.shapiro(list_b)[1]
    mann_whitney_p_value = stats.mannwhitneyu(list_a, list_b).pvalue
    anderson_p_value = stats.anderson_ksamp([list_a, list_b]).significance_level
    welch_p_value = stats.ttest_ind(list_a, list_b, equal_var=False)[1]

    results = {
        'first_sample': list_a,
        'second_sample': list_b,
        'shapiro_p_value': shapiro_p_value,
        'mann_p_value': mann_whitney_p_value,
        'anderson_p_value': anderson_p_value,
        'welch_p_value': welch_p_value,
    }

    if (results['shapiro_p_value'][0] < significance and
        results['shapiro_p_value'][1] < significance):
      results['normal-y'] = True
    else:
      results['normal-y'] = False
    results['significantly_different'] = bool(
        float(results['mann_p_value']) < float(significance))

    print json.dumps(results)
    return 0
def boxcoxtrans(filename, columns):
        s = columns
        w = pd.read_csv(filename, usecols=s)

        f = DataFrame(w)
        c = f.astype(float)

        x = c.as_matrix()


        e = []

        for j in np.linspace(-2, 2, num=21):

                if j != 0:

                    b =(x**j)

                    d=[]
                    c=[]
                    for i in range(0,len(b)):
                        c = b[i]
                        d.append(c[0])
                    

                    t = stats.shapiro(d)
                    
                    
                    e.append(t[1])




        for i in range(0,len(e)):

            if e[i]==max(e):

                break
        t=(-2+0.2*i)

        if t>=0:
            t=(-2+0.2*(i+1))

        print 'optimal lambda=',t

        h=((x**t)-1)/t
        l=[]
        m=[]
        for i in range(0,len(h)):
            l = h[i]
            m.append(l[0])


        print pd.DataFrame(m)
        k=stats.shapiro(m)

        print 'shapiro test of trans column',k
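For comparison, scipy ships a built-in maximum-likelihood Box-Cox search; a minimal sketch on a synthetic, strictly positive sample (not part of the function above):

import numpy as np
from scipy import stats

data = np.random.default_rng(1).lognormal(size=200)    # strictly positive sample
transformed, optimal_lambda = stats.boxcox(data)        # lambda chosen by maximum likelihood
print('optimal lambda =', optimal_lambda)
print('Shapiro-Wilk after transform:', stats.shapiro(transformed))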
Example #7
    def return_test_results(self, arr1, arr2):
        test_name = ""
        p_value = 0
        t_value = 0
        levene = stats.levene(arr1, arr2)[1]
        if self.statistics == "auto":
            # check equality of variances with Levene's test; if they are equal
            if levene > 0.05:
                # Shapiro-Wilk for normality of each sample; if both look normal
                if stats.shapiro(arr1)[1] > 0.05 and stats.shapiro(arr2)[1] > 0.05:
                    # p = Student
                    test_name = "Student"
                    result = stats.ttest_ind(arr1, arr2)
                    t_value = result[0]
                    p_value = result[1]
                else:
                    # p = Mann
                    test_name = "Mann"
                    if equal(arr1, arr2):
                        t_value = None
                        p_value = 1
                    else:
                        result = stats.mannwhitneyu(arr1, arr2)
                        t_value = result[0]
                        p_value = result[1]
            else:
                test_name = "Welch"
                result = stats.ttest_ind(arr1, arr2, False)
                t_value = result[0]
                p_value = result[1]

        elif self.statistics == "student":
            test_name = "Student"
            result = stats.ttest_ind(arr1, arr2)
            t_value = result[0]
            p_value = result[1]
        elif self.statistics == "welch":
            test_name = "Welch"
            result = stats.ttest_ind(arr1, arr2, False)
            t_value = result[0]
            p_value = result[1]
        elif self.statistics == "mann":
            test_name = "Mann"
            if equal(arr1, arr2):
                t_value = None
                p_value = 1
            else:
                result = stats.mannwhitneyu(arr1, arr2)
                t_value = result[0]
                p_value = result[1]

        df = len(arr1) + len(arr2) - 2

        return [test_name, t_value, p_value, df, levene]
def test_sample_means_and_var_distribution(N, Pis, sample_size, multi, n_test):
    x_pvalues = []
    y_pvalues = []
    passed = []
    for i in range(n_test):
        x, y = multinomial_mean_and_var_errors(N, Pis, sample_size, multi)
        x_pvalue = spstats.shapiro(x)[1]
        y_pvalue = spstats.shapiro(y)[1]
        x_pvalues.append(x_pvalue)
        y_pvalues.append(y_pvalue)
        passed.append(min(x_pvalue, y_pvalue) >= .05)
    assert np.sum(np.array(passed)) >= .6 * n_test
def check_normality():
    '''Check if the distribution is normal.'''
    
    # Set the parameters
    numData = 1000
    myMean = 0
    mySD = 3
    
    # To get reproducible values, I provide a seed value
    np.random.seed(1234)   
    
    # Generate and show random data
    data = stats.norm.rvs(myMean, mySD, size=numData)
    fewData = data[:100]
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    pVals = pd.Series()
    pFewVals = pd.Series()
    # The scipy normaltest is based on D'Agostino and Pearson's test, which
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['Omnibus']    = stats.normaltest(data)
    _, pFewVals['Omnibus'] = stats.normaltest(fewData)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk']    = stats.shapiro(data)
    _, pFewVals['Shapiro-Wilk'] = stats.shapiro(fewData)
    
    # Or you can check for normality with Lilliefors-test
    _, pVals['Lilliefors']    = lillifors(data)
    _, pFewVals['Lilliefors'] = lillifors(fewData)
    
    # Alternatively with original Kolmogorov-Smirnov test
    _, pVals['Kolmogorov-Smirnov']    = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
    _, pFewVals['Kolmogorov-Smirnov'] = stats.kstest((fewData-np.mean(fewData))/np.std(fewData,ddof=1), 'norm')
    
    print('p-values for all {0} data points: ----------------'.format(len(data)))
    print(pVals)
    print('p-values for the first 100 data points: ----------------')
    print(pFewVals)
    
    if pVals['Omnibus'] > 0.05:
        print('Data are normally distributed')
    # --- >>> STOP stats <<< ---
    
    return pVals['Kolmogorov-Smirnov']
Example #10
 def test_basic(self):
     x1 = [0.11,7.87,4.61,10.14,7.95,3.14,0.46,
           4.43,0.21,4.75,0.71,1.52,3.24,
           0.93,0.42,4.97,9.53,4.55,0.47,6.66]
     w,pw = stats.shapiro(x1)
     assert_almost_equal(w,0.90047299861907959,6)
     assert_almost_equal(pw,0.042089745402336121,6)
     x2 = [1.36,1.14,2.92,2.55,1.46,1.06,5.27,-1.11,
           3.48,1.10,0.88,-0.51,1.46,0.52,6.20,1.69,
           0.08,3.67,2.81,3.49]
     w,pw = stats.shapiro(x2)
     assert_almost_equal(w,0.9590270,6)
     assert_almost_equal(pw,0.52460,3)
def main_plot_histogram():
	sigma = 0.10

	# initial setup
	W = dist_W(sigma)
	WI = dist_WI()

	#h, hist_edges = compute_histogram(W, WI, params)
	S, PS = compute_histogram(W, WI, params)
	S = S.flatten()
	PS = PS.flatten()
	#kindofvector(h)
	#kindofvector(hist_edges)
	#print(h)
	#print(hist_edges)
	#plt.plot(hist_edges, h)
	BINCNT = 100

	plt.hist(S,  bins=BINCNT, normed=True, histtype='step', alpha=1, label="act after tanh", color="b")
	plt.hist(PS, bins=BINCNT, normed=True, histtype='step', alpha=1, label="act before tanh", color="g")

	
	#W = shapiro(S)
	print("S size = ", S.size)
	print("shapiro S = ",shapiro(S))
	print("shapiro PS = ",shapiro(PS))
	stdS = std(S)
	print("stdS=",stdS)
	stdPS = std(PS)
	print("stdPS=", stdPS)

	x = linspace(-1, 1, 100)
	y = norm.pdf(x, loc=0, scale=stdS)
	plt.plot(x,y, color="b", alpha=0.2)

	y = norm.pdf(x, loc=0, scale=stdPS)
	plt.plot(x,y, color="g", alpha=0.2)


	#blue_line = mlines.Line2D([], [], color='blue', marker='.', markersize=15, label='Blue stars')
	

	plt.grid(True)
	plt.ylabel('density')
	plt.xlabel('activation value')
	plt.xlim([-1, 1])
	plt.title('activation distribution in reservoir ($\sigma_{blue}$=%.2f, $\sigma_{green}$=%.2f)' % (stdS, stdPS))

	plt.legend()
	plt.show()
Example #12
def test_routehop_normality(rows, attributes, key):
    print "Splitting..."
    instances = split_on_attributes(attributes, rows)

    print "Processing..."
    toofew = 0
    nonnormal = 0
    normal = 0
    for skey in instances.keys():
        times = array([s[key] for s in instances[skey]])
        n = len(times)
        mean_time = times.mean()
        std_time = times.std()

        if n >= 30:
            pval = stats.shapiro(times)[1]
            if pval < 0.05:
                nonnormal += 1
                # figure()
                # hist(times)
                # title("%s (p-val=%f, %d pts)" %(str(skey),pval,n));
            else:
                normal += 1
        else:
            toofew += 1

    print "Non,toofew,normal:", nonnormal, toofew, normal
Example #13
def check_normality():
    '''Check if the distribution is normal.'''
    # Generate and show a distribution
    numData = 100
    
    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)   
    
    data = stats.norm.rvs(myMean, mySD, size=numData)
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    pVals = pd.Series()
    # The scipy normaltest is based on D'Agostino and Pearson's test, which
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['omnibus'] = stats.normaltest(data)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk'] = stats.shapiro(data)
    
    # Or you can check for normality with Lilliefors-test
    ksStats, pVals['Lilliefors'] = kstest_normal(data)
    
    # Alternatively with original Kolmogorov-Smirnov test
    _, pVals['KS'] = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
    
    print(pVals)
    if pVals['omnibus'] > 0.05:
        print('Data are normally distributed')
Example #14
    def test(self, alpha, x):
        """
        Tests whether alpha and x are significantly correlated.
        The test assumes that x is normally distributed. The test
        function uses a Shapiro-Wilk test to test this assumption.

        :param alpha: independent variable, angles in radians
        :param x: dependent variable
        :return: test results of Shapiro-Wilk and Liddell-Ord test
        :rtype: pandas.DataFrame

        References: [Jammalamadaka2001]_
        """
        w, psw = stats.shapiro(x)
        if psw < 0.05:
            warnings.warn("This test requires Gaussian distributed x")

        rxc, rxs, rcs = np.corrcoef(x, np.cos(alpha))[0,1], np.corrcoef(x, np.sin(alpha))[0,1], \
                        np.corrcoef(np.cos(alpha), np.sin(alpha))[0,1]
        n = len(alpha)
        r2 = (rxc**2 + rxs**2 - 2*rxc*rxs*rcs)/(1 - rcs**2)
        f = (n-3)*r2/(1-r2)
        p = stats.f.sf(f, 2, n-3)

        df = pd.DataFrame(dict(
            test = ['Shapiro-Wilk','Liddell-Ord'],
            statistics = [w, f],
            p = [psw, p],
            dof = [None, (2, n-3)]
        )).set_index('test')
        return df
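The same circular-linear correlation can be computed outside the class; a small sketch on synthetic angles and a linear response (the data are invented purely for illustration):

import numpy as np
from scipy import stats

rng = np.random.default_rng(2)
alpha = rng.uniform(0, 2 * np.pi, 100)                       # angles in radians
x = 2.0 * np.cos(alpha) + rng.normal(scale=0.5, size=100)    # linear variable

print('Shapiro-Wilk on x:', stats.shapiro(x))                # check the normality assumption
rxc = np.corrcoef(x, np.cos(alpha))[0, 1]
rxs = np.corrcoef(x, np.sin(alpha))[0, 1]
rcs = np.corrcoef(np.cos(alpha), np.sin(alpha))[0, 1]
n = len(alpha)
r2 = (rxc**2 + rxs**2 - 2 * rxc * rxs * rcs) / (1 - rcs**2)
f = (n - 3) * r2 / (1 - r2)
print('Liddell-Ord p =', stats.f.sf(f, 2, n - 3))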
def robust_parameter(clusters, stats, elems):
    ''' Parameter to measure robustness of a G-mode test.
        
        The parameter is given by the weighted average plus a normality estimator:
        
        P1 = SUM( N * var ) / SUM( N )
        P2 = SUM( N^-1 * var ) / SUM( N^-1 ) 
        P3 = SUM( kstest(cluster, gaussian) )
        P = (P1/w1 + P2/w2 + P3/w3) / (w1^-1 + w2^-1 + w3^-1)
    '''
    from scipy.stats import shapiro
    from math import sqrt
    from itertools import izip
    
    shap, N, var = deque(), deque(), deque()
    for members, cl in izip(clusters, stats):
        # cluster size array
        N.append(len(members))
        # cluster variance array
        var.append(asum(cl[1]**2))
        # shapiro-wilk test:
        W_vec = array([shapiro(elems[members][n])[0]**2 for n in xrange(len(elems[0]))])
        # inversed shapiro-wilk W statistic.
        shap.append( sqrt(asum(1e0/W_vec)) )

    shap, N, var = array(shap), array(N), array(var)
    
    w1 =  sqrt(asum(mad(var, median(var))**2))
    w3 =  mad(shap, median(shap))
 
    p1 = asum( N * var ) / asum(N)
    p2 = asum( var/N ) / asum(1e0/N)
    p3 = median(shap)
    
    return (p1/w1 + p2/w1 + p3/w3) / (2e0/w1 + 1e0/w3)
Example #16
def sw(errors):
    """
    Shapiro Wilk Test

    The Null hypothesis for SW test is that the data forms a normal 
    distribution.

    Parameters
    -------------
    errors: error of voxels through time (shape of it is 221783*1)

    Returns
    ---------
    swstat: test statistics for SW test
    pval: P-value for the hypothesis test.
    """
    
    pval = []

    for i in range(errors.shape[-1]):
        pval.append(shapiro(errors[:,i])[1])

    pval = np.array(pval)
    shap=pval.shape[0]
    pval = np.reshape(pval, (shap, 1))


    return pval
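A quick usage sketch for a voxel-wise loop like the one above (the error array and its shape are synthetic; it only illustrates the per-column Shapiro-Wilk call):

import numpy as np
from scipy.stats import shapiro

errors = np.random.default_rng(3).normal(size=(100, 5))   # 100 time points x 5 "voxels"
pvals = np.array([shapiro(errors[:, i])[1] for i in range(errors.shape[-1])])
print(pvals.reshape(-1, 1))                                # one p-value per voxel, as in sw()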
 def shapiro_test(self, param):
     from scipy.stats import shapiro
     all_values = self._get_single_param_values(param)
     results = []
     for key, values in all_values:
         results.append((key, shapiro(sorted(values))))
     return results
Example #18
 def _box_cox_transform(self, verbose=False, method='standard'):
     """
     Performs the Box-Cox transformation, over different ranges, picking the optimal one w. respect to normality.
     """
     from scipy import stats
     a = sp.array(self.values)
     if method == 'standard':
         vals = (a - min(a)) + 0.1 * sp.var(a)
     else:
         vals = a
     sw_pvals = []
     lambdas = sp.arange(-2.0, 2.1, 0.1)
     for l in lambdas:
         if l == 0:
             vs = sp.log(vals)
         else:
             vs = ((vals ** l) - 1) / l
         r = stats.shapiro(vs)
         if sp.isfinite(r[0]):
             pval = r[1]
         else:
             pval = 0.0
         sw_pvals.append(pval)
     i = sp.argmax(sw_pvals)
     l = lambdas[i]
     if l == 0:
         vs = sp.log(vals)
     else:
         vs = ((vals ** l) - 1) / l
     self._perform_transform(vs,"box_cox")
     log.debug('optimal lambda was %0.1f' % l)
     return True
Example #19
def statFile(key, values, pruneX, pruneX2):
    oFilename = values["file"]+".csv"
    data = []
    prune = False
    minX = 0
    maxX = 0

    if len(pruneX) > 0 and len(pruneX2) > 0:
        minX = float(pruneX)
        maxX = float(pruneX2)
        prune = True

    with open(oFilename) as f:
        for l in f.readlines():
            arrLine = l.strip().split()
            if len(arrLine) == 2:
                t = float(arrLine[0])
                if prune:
                    if t >= minX and t <= maxX:
                        data.append(float(arrLine[1]))
                    elif t > maxX:
                        break
                else:
                    data.append(float(arrLine[1]))
    x = np.array(data)
    with open("stats.txt", "a") as f:
        f.write(key + ":\n")
        f.write("  mean: "+str(x.mean())+"\n")
        f.write("  std: "+str(x.std())+"\n")
        f.write("  median: "+str(np.median(x))+"\n")
        f.write("  min: "+str(x.min())+"\n")
        f.write("  max: "+str(x.max())+"\n")
        f.write("  normality: "+str(stats.shapiro(x)[1])+"\n")
Example #20
def IsNormallyDistributed(sample, significance_level=0.05,
                          return_p_value=False):
  """Calculates Shapiro-Wilk test for normality.

  Note that normality is a requirement for Welch's t-test.

  Args:
    sample: List of values of benchmark result for a measure.
    significance_level: The significance level the p-value is compared against.
    return_p_value: Whether or not to return the calculated p-value.

  Returns:
    is_normally_distributed: Returns True or False.
    p_value: The calculated p-value.
  """
  if not stats:
    raise ImportError('This function requires Scipy.')

  # pylint: disable=unbalanced-tuple-unpacking
  _, p_value = stats.shapiro(sample)

  is_normally_distributed = p_value >= significance_level
  if return_p_value:
    return is_normally_distributed, p_value
  return is_normally_distributed
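A hedged usage sketch of the same check, calling scipy directly (the sample values are invented):

from scipy import stats

sample = [9.8, 10.1, 10.0, 9.7, 10.3, 9.9, 10.2, 10.0, 9.6, 10.4]
_, p_value = stats.shapiro(sample)
print(p_value >= 0.05, p_value)   # True -> consistent with normality at the 5% level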
Example #21
    def gStats(self, missingValue=0.0):
        """dict of {geneID: (min,max,mean,median,std,stderr,
        Shapiro-Wilk(w,p),normaltest_chisq (D'Agostino and Pearson),...}
        """
        import scipy as S
        import scipy.stats as SS

        rv = {}
        for k, v in self.items():
            # print k,v
            va = S.array(self.gValues(k, missingValue))

            try:
                normaltest = SS.normaltest(va)
            except:
                normaltest = None
            try:
                shapiro = SS.shapiro(va)
            except:
                shapiro = None

            try:
                rv[k] = (va.min(), va.max(), va.mean(), SS.median(va), SS.std(va), SS.stderr(va), normaltest, shapiro)
            except:
                print k, va
                raise
        return rv
def multiple_comp (residuals): 
  """
  input: residuals, 2d array (voxels,timecourse)
  output: a list of the number of voxels that being tested as not normally distributed, based on 
  		alpha-test, Bonferroni procedure, Hochberg procedure and  Benjamini-Hochberg procedure respectively
  """

  ## Alpha Test
  p_nor = []
  for i in range(0,residuals.shape[0]):
      p_nor.append(stats.shapiro(residuals[i,:])[1])

  # for p<0.05, the voxel is not normal distributed
  p_nor_005 = [i for i in p_nor if i < 0.05]

  ##Bonferroni Procedure
  p_bonf = [i for i in p_nor if i < (0.05 / residuals.shape[0])]

  ## Hochberg Procedure
  p_nors = np.sort(p_nor)
  alpha = 0.05
  n=len(p_nors)
  tf=[]
  for i in range(0,n):
      thres = alpha/(n+1-(i+1))
      tf.append(p_nors[i]<=thres)

  ##Benjamini-Hochberg procedure
  tf_bh=[]
  for i in range(0,len(p_nors)):
      thres = (i/n)*alpha
      tf_bh.append(p_nors[i]<=thres)

  return [len(p_nor_005),len(p_bonf),sum(tf),sum(tf_bh)]
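The Bonferroni and Benjamini-Hochberg corrections used above are also available off the shelf in statsmodels; a minimal sketch, assuming a voxels-by-time residual array as in the function (the synthetic data are only for illustration):

import numpy as np
from scipy import stats
from statsmodels.stats.multitest import multipletests

residuals = np.random.default_rng(4).normal(size=(50, 100))   # 50 voxels x 100 time points
p_nor = np.array([stats.shapiro(residuals[i, :])[1] for i in range(residuals.shape[0])])

reject_bonf = multipletests(p_nor, alpha=0.05, method='bonferroni')[0]
reject_bh = multipletests(p_nor, alpha=0.05, method='fdr_bh')[0]
print(int(np.sum(p_nor < 0.05)), int(reject_bonf.sum()), int(reject_bh.sum()))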
Example #23
def run(args):
    report = ResultReportWriter()

    team_names, results = rcss.run_matches(args.team_a, args.team_b, args.match_count)
    report.write_json('match_results.json', {
        'binaries': [args.team_a, args.team_b],
        'teams': team_names,
        'results': results,
    })

    errors = []
    score = [x - y for x, y in results]
    # alpha, 1 - alpha
    # alpha = probability of rejecting a true null hypothesis
    significance, confidence = (args.significance, 1 - args.significance)
    _, normality_p = stats.shapiro(score)
    if normality_p <= significance:
        errors.append('Shapiro test rejected normality')
    mean = numpy.mean(score)
    std_error = stats.sem(score)
    confidence_interval = stats.t.interval(confidence, len(score) - 1,
                                           loc=mean, scale=std_error)
    report.write_json('statistics.json', {
        'binaries': [args.team_a, args.team_b],
        'teams': team_names,
        'normality_p': normality_p,
        'score': [confidence_interval[0], mean, confidence_interval[1]],
        'score_std': std_error,
        'params': {
            'significance': args.significance,
        },
        'errors': errors,
    })
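The interval construction from the end of this example, isolated as a minimal sketch on made-up score differences:

import numpy
from scipy import stats

score = [2, -1, 0, 3, 1, 2, -2, 1, 0, 2]       # per-match goal differences, invented
_, normality_p = stats.shapiro(score)          # should exceed the significance level for the CI to be meaningful
mean = numpy.mean(score)
std_error = stats.sem(score)
confidence_interval = stats.t.interval(0.95, len(score) - 1, loc=mean, scale=std_error)
print(normality_p, mean, confidence_interval)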
Example #24
def pearson_or_shapiro(data):
    """pearson_or_shapiro

    Use D'agostino/Pearson if possible (n >= 20), else Shapiro
    :param data:
    """
    return stats.normaltest(data) if len(data) >= 20 else stats.shapiro(data)
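A quick usage sketch for the helper above (random samples, with sizes chosen only to hit both branches):

import numpy as np
from scipy import stats

rng = np.random.default_rng(5)
print(pearson_or_shapiro(rng.normal(size=10)))    # n < 20  -> stats.shapiro
print(pearson_or_shapiro(rng.normal(size=200)))   # n >= 20 -> stats.normaltest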
Example #25
    def test_nan_input(self):
        x = np.arange(10.)
        x[9] = np.nan

        w, pw = stats.shapiro(x)
        assert_equal(w, np.nan)
        assert_almost_equal(pw, 1.0)
Example #26
 def test_MultivariateNormalQMCEngineDegenerate(self, cuda=False):
     device = torch.device("cuda") if cuda else torch.device("cpu")
     for dtype in (torch.float, torch.double):
         # X, Y iid standard Normal and Z = X + Y, random vector (X, Y, Z)
         mean = torch.zeros(3, device=device, dtype=dtype)
         cov = torch.tensor(
             [[1, 0, 1], [0, 1, 1], [1, 1, 2]], device=device, dtype=dtype
         )
         engine = MultivariateNormalQMCEngine(mean=mean, cov=cov, seed=12345)
         samples = engine.draw(n=2000)
         self.assertEqual(samples.dtype, dtype)
         self.assertEqual(samples.device.type, device.type)
         self.assertTrue(torch.all(torch.abs(samples.mean(dim=0)) < 1e-2))
         self.assertTrue(torch.abs(torch.std(samples[:, 0]) - 1) < 1e-2)
         self.assertTrue(torch.abs(torch.std(samples[:, 1]) - 1) < 1e-2)
         self.assertTrue(torch.abs(torch.std(samples[:, 2]) - math.sqrt(2)) < 1e-2)
         for i in (0, 1, 2):
             _, pval = shapiro(samples[:, i].cpu().numpy())
             self.assertGreater(pval, 0.9)
         cov = np.cov(samples.cpu().numpy().transpose())
         self.assertLess(np.abs(cov[0, 1]), 1e-2)
         self.assertLess(np.abs(cov[0, 2] - 1), 1e-2)
         # check to see if X + Y = Z almost exactly
         self.assertTrue(
             torch.all(
                 torch.abs(samples[:, 0] + samples[:, 1] - samples[:, 2]) < 1e-5
             )
         )
Example #27
 def most_normal_transformation(self, pid, trans_types=['none', 'sqrt', 'log', 'sqr', 'exp', 'arcsin_sqrt'],
             perform_trans=True, verbose=False):
     """
     Performs the transformation which results in most normal looking data, according to Shapiro-Wilk's test
     """
     #raw_values = self.phen_dict[pid]['values']
     from scipy import stats
     shapiro_pvals = []
     for trans_type in trans_types:
         if trans_type != 'none':
             if not self.transform(pid, trans_type=trans_type):
                 continue
         phen_vals = self.get_values(pid)
         #print 'sp.inf in phen_vals:', sp.inf in phen_vals
         if sp.inf in phen_vals:
             pval = 0.0
         else:
             r = stats.shapiro(phen_vals)
             if sp.isfinite(r[0]):
                 pval = r[1]
             else:
                 pval = 0.0
         shapiro_pvals.append(pval)
         #self.phen_dict[pid]['values'] = raw_values
         if trans_type != 'none':
             self.revert_to_raw_values(pid)
     argmin_i = sp.argmax(shapiro_pvals)
     trans_type = trans_types[argmin_i]
     shapiro_pval = shapiro_pvals[argmin_i]
     if perform_trans:
         self.transform(pid, trans_type=trans_type)
     if verbose:
         print "The most normal-looking transformation was %s, with a Shapiro-Wilk's p-value of %0.6f" % \
             (trans_type, shapiro_pval)
     return trans_type, shapiro_pval
Example #28
 def most_normal_transformation(self,trans_types=SUPPORTED_TRANSFORMATIONS,
             perform_trans=True, verbose=False):
     """
     Performs the transformation which results in most normal looking data, according to Shapiro-Wilk's test
     """
     from scipy import stats
     shapiro_pvals = []
     for trans_type in trans_types:
         if trans_type == 'most_normal':
             continue
         if trans_type != 'none':
             if not self.transform(trans_type=trans_type):
                 continue
         phen_vals = self.values
         #print 'sp.inf in phen_vals:', sp.inf in phen_vals
         if sp.inf in phen_vals:
             pval = 0.0
         else:
             r = stats.shapiro(phen_vals)
             if sp.isfinite(r[0]):
                 pval = r[1]
             else:
                 pval = 0.0
         shapiro_pvals.append(pval)
         if trans_type != 'none':
             self.revert_to_raw_values()
     argmin_i = sp.argmax(shapiro_pvals)
     trans_type = trans_types[argmin_i]
     shapiro_pval = shapiro_pvals[argmin_i]
     if perform_trans:
         self.transform(trans_type=trans_type)
     log.info("The most normal-looking transformation was %s, with a Shapiro-Wilk's p-value of %.2E" % \
             (trans_type, shapiro_pval))
     return trans_type, shapiro_pval
 def distribution(self,gene,thresholdNorm):
     self.z,self.pval=stats.shapiro(gene[1:])
     if self.pval<thresholdNorm:
         #print 'not normal distribution'
         return self.pval
     else:
         #print'normal'
         return self.pval
def test_normality_increase_lambert():
    # Generate random data and check that it is more normal after inference
    for i, y in enumerate([np.random.standard_cauchy(size=ns), experimental_data]):
        print "Distribution %d" % i
        print "Before"
        print ("anderson: %0.3f\tshapiro: %0.3f" % (anderson(y)[0], shapiro(y)[0])).expandtabs(30)
        stats.probplot(y, dist="norm", plot=pylab)
        pylab.savefig("%d_before.png" % i)
        pylab.clf()

        tau = g.igmm(y)
        x = g.w_t(y, tau)
        print "After"
        print ("anderson: %0.3f\tshapiro: %0.3f" % (anderson(x)[0], shapiro(x)[0])).expandtabs(30)
        stats.probplot(x, dist="norm", plot=pylab)
        pylab.savefig("%d_after.png" % i)
        pylab.clf()
Example #31
def plot_boxplots(df):

    # %% boxplot chemotherapy
    fig, ax = plt.subplots(figsize=(12, 10))
    df_chemo = df.copy()
    df_chemo['Ablation Volume [ml] / Energy [kJ]'] = df_chemo[
        'Ablation Volume [ml]'] / df_chemo['Energy [kj]']
    df_chemo.dropna(subset=['Ablation Volume [ml] / Energy [kJ]'],
                    inplace=True)
    df_chemo.dropna(subset=['chemo_before_ablation'], inplace=True)
    df_chemo['chemo_before_ablation'].replace('No', False, inplace=True)
    df_chemo['chemo_before_ablation'].replace('Yes', True, inplace=True)

    df.dropna(subset=['Ablation Volume [ml]'], inplace=True)
    df.dropna(subset=['chemo_before_ablation'], inplace=True)
    df['chemo_before_ablation'].replace('No', False, inplace=True)
    df['chemo_before_ablation'].replace('Yes', True, inplace=True)
    # ttest
    no_chemo_df = df_chemo[df_chemo['chemo_before_ablation'] == False]
    no_chemo = no_chemo_df['Ablation Volume [ml]'].tolist()
    chemo_df = df_chemo[df_chemo['chemo_before_ablation'] == True]
    chemo = chemo_df['Ablation Volume [ml]'].tolist()

    fig, ax = plt.subplots(figsize=(12, 10))
    plt.hist(no_chemo)
    plt.title('No Chemotherapy')
    plt.ylabel('Ablation Volume [ml]')
    figpathHist = os.path.join("figures",
                               "histogram ablation volumes no chemo")
    gh.save(figpathHist, ext=['png'], close=True)
    fig1, ax = plt.subplots(figsize=(12, 10))
    plt.hist(chemo)
    plt.title('Chemotherapy')
    plt.ylabel('Ablation Volume [ml] ')
    figpathHist = os.path.join("figures", "histogram ablation volumes chemo")
    gh.save(figpathHist, ext=['png'], close=True)

    print('no of tumors with chemo:', str(len(chemo)))
    print('no of tumors with no chemo:', str(len(no_chemo)))
    #
    stat, p_chemo = shapiro(chemo)

    # interpret
    alpha_chemo = 0.05
    if p_chemo > alpha_chemo:
        msg = 'Sample Chemo looks Gaussian (fail to reject H0)'
    else:
        msg = 'Sample Chemo does not look Gaussian (reject H0)'
    print(msg)

    stat, p_no_chemo = shapiro(no_chemo)

    # interpret
    alpha_no_chemo = 0.05
    if p_no_chemo > alpha_no_chemo:
        msg = 'Sample No Chemo looks Gaussian (fail to reject H0)'
    else:
        msg = 'Sample No Chemo does not look Gaussian (reject H0)'
    print(msg)

    if p_no_chemo < alpha_no_chemo and p_chemo < alpha_chemo:
        t, p = stats.mannwhitneyu(chemo, no_chemo)
        print(
            'Mann-Whitney U test applied for samples coming from a non-Gaussian distribution:'
        )
        print("t = " + str(t))
        print("p = " + str(p))
    else:
        t, p = stats.ttest_ind(chemo, no_chemo)
        print('ttest applied for samples coming from a Gaussian distribution:')
        print("t = " + str(t))
        print("p = " + str(p))

    fig, ax = plt.subplots(figsize=(12, 10))
    bp_dict = df.boxplot(column=['Ablation Volume [ml]'],
                         ax=ax,
                         notch=True,
                         by='chemo_before_ablation',
                         patch_artist=True,
                         return_type='both')
    ax.set_xlabel('')
    plt.show()
    for row_key, (ax, row) in bp_dict.iteritems():
        for i, box in enumerate(row['fliers']):
            box.set_marker('o')
        for i, box in enumerate(row['boxes']):
            if i == 0:
                box.set_facecolor('Purple')
                box.set_edgecolor('DarkMagenta')
            else:
                box.set_facecolor('LightPink')
                box.set_edgecolor('HotPink')
        for i, box in enumerate(row['medians']):
            box.set_color(color='Black')
            box.set_linewidth(2)
        for i, box in enumerate(row['whiskers']):
            box.set_color(color='Black')
            box.set_linewidth(2)
    xticklabels = [
        'No Chemotherapy before Ablation',
        'Chemotherapy Administered before Ablation'
    ]
    xtickNames = plt.setp(ax, xticklabels=xticklabels)
    plt.setp(xtickNames, fontsize=10, color='black')
    plt.ylim([-2, 120])
    plt.ylabel('Ablation Volume [ml]', fontsize=12, color='k')
    plt.tick_params(labelsize=10, color='black')
    ax.tick_params(colors='black', labelsize=10, color='k')
    ax.set_ylim([-2, 120])
    plt.xlabel('')
    fig.suptitle('')
    plt.title('')
    # plt.title('Comparison of Ratio (Ablation Volumes [ml] : Energy [kJ]) from MAVERRIC Dataset by Chemotherapy', fontsize=12)
    plt.title(
        'Comparison of Ablation Volumes [ml] from MAVERRIC Dataset by Chemotherapy',
        fontsize=12)
    figpathHist = os.path.join(
        "figures", "boxplot ablation volumes by chemo before ablation")
    gh.save(figpathHist, ext=['png'], close=True)

    # %% BOXPLOTS ABLATION VOLUMES

    # ttest
    df_volumes = df.copy()
    df_volumes.dropna(subset=['Ablation Volume [ml]'], inplace=True)
    df_volumes.dropna(subset=['Ablation Volume [ml] (manufacturers)'],
                      inplace=True)
    ablation_vol = df_volumes['Ablation Volume [ml]'].tolist()
    ablation_vol_brochure = df_volumes[
        'Ablation Volume [ml] (manufacturers)'].tolist()

    stat, p_brochure = shapiro(ablation_vol_brochure)
    # interpret
    alpha_brochure = 0.05
    if p_brochure > alpha_brochure:
        msg = 'Sample Ablation Volume Brochure looks Gaussian (fail to reject H0)'
    else:
        msg = 'Sample Ablation Volume Brochure does not look Gaussian (reject H0)'
    print(msg)

    stat, p_voxel = shapiro(ablation_vol)
    # interpret
    alpha_voxel = 0.05
    if p_voxel > alpha_voxel:
        msg = 'Sample Ablation Volume looks Gaussian (fail to reject H0)'
    else:
        msg = 'Sample Ablation Volume does not look Gaussian (reject H0)'
    print(msg)

    if p_voxel < alpha_voxel and p_brochure < alpha_brochure:
        t, p = stats.mannwhitneyu(ablation_vol, ablation_vol_brochure)
        print(
            'Mann-Whitney U test applied for samples coming from a non-Gaussian distribution:'
        )
        print("t = " + str(t))
        print("p = " + str(p))
    else:
        t, p = stats.ttest_ind(ablation_vol, ablation_vol_brochure)
        print('ttest applied for samples coming from a Gaussian distribution:')
        print("t = " + str(t))
        print("p = " + str(p))

    fig, ax = plt.subplots(figsize=(12, 10))
    bp_dict = df.boxplot(column=[
        'Ablation Volume [ml]', 'Ablation Volume [ml] (parametrized_formula)',
        'Ablation Volume [ml] (manufacturers)'
    ],
                         ax=ax,
                         notch=True,
                         patch_artist=True,
                         return_type='both')
    ax.set_xlabel('')
    row = bp_dict.lines
    # for idx,row in enumerate(lines):
    for i, box in enumerate(row['fliers']):
        box.set_marker('o')
        # box.set_edgecolor('RoyalBlue')
    for i, box in enumerate(row['boxes']):
        if i == 0:
            box.set_facecolor('Blue')
            box.set_edgecolor('MediumBlue')
        elif i == 1:
            box.set_facecolor('BlueViolet')
            box.set_edgecolor('BlueViolet')
        elif i == 2:
            box.set_facecolor('DeepSkyBlue')
            box.set_edgecolor('DodgerBlue')

    for i, box in enumerate(row['medians']):
        box.set_color(color='Black')
        box.set_linewidth(2)
    for i, box in enumerate(row['whiskers']):
        box.set_color(color='Black')
        box.set_linewidth(2)

    xticklabels = [
        'Ablation Volume [ml] (Voxel-Based)',
        'Ablation Volume [ml] (Ellipsoid Formula)',
        'Ablation Volume [ml] (Manufacturers Brochure)'
    ]
    xtickNames = plt.setp(ax, xticklabels=xticklabels)
    plt.setp(xtickNames, fontsize=10, color='black')
    plt.ylim([-2, 150])
    plt.ylabel('Ablation Volume [ml]', fontsize=14, color='k')
    plt.tick_params(labelsize=10, color='black')
    ax.tick_params(colors='black', labelsize=10, color='k')
    ax.set_ylim([-2, 150])
    plt.title('Comparison of Ablation Volumes [ml] from MAVERRIC Dataset',
              fontsize=16)
    figpathHist = os.path.join("figures", "boxplot volumes")
    gh.save(figpathHist, ext=['png'], close=True)
Example #32
def nortest(df, a):
    _, sw = shapiro(df[a])
    _, ap = normaltest(df[a])
    index = ['Shapiro-Wilk', 'D\'Agostino-Pearson']
    columns = ['p-value']
    return pd.DataFrame([sw, ap], index=index, columns=columns)
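A usage sketch for `nortest` (the DataFrame and column name are invented; the imports match what the snippet assumes is already in scope):

import numpy as np
import pandas as pd
from scipy.stats import shapiro, normaltest

df = pd.DataFrame({'height': np.random.default_rng(6).normal(170, 8, size=300)})
print(nortest(df, 'height'))   # one p-value per normality test, labelled by row index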
Example #33
df = pd.read_csv('datasets/cats-data.csv', sep=",", index_col=0)
print(df)

df_female = df[df["Sex"] == "F"]
df_male = df[df["Sex"] == "M"]


def test_normal_distribution(p_group, alpha):
    return p_group > alpha


def test_hypothesis(p, alpha):
    return p > alpha


W, p_female = st.shapiro(df_female["Hwt"])
print('For female cats normal distribution test result is:',
      test_normal_distribution(p_female, alpha))
W, p_male = st.shapiro(df_male["Hwt"])
print('For male cats normal distribution test result is:',
      test_normal_distribution(p_male, alpha))

t, p = st.ttest_ind(df_female["Hwt"], df_male["Hwt"])
hypothesis_result = test_hypothesis(p, alpha)
print("Hypothesis that heart weight is equal for male and female cats is: ",
      hypothesis_result)


def display_hist(data_female, data_male):
    data.plot.hist(bins=40)
    plt.legend(loc="upper right")
# H0 : M1=M2 ("There is no statistically significant difference between the Purchase averages of the two groups.")
# H1: M1 != M2 ("There is a statistically significant difference between the Purchase averages of the two groups.")
"""

# 2. Assumption Control

# 2.1. Normality Assumption (shapiro)
# Shapiro Wilk Test is used for the assumption of normality.
"""
# Defining the hypotheses for the normality assumption.
# H0 : Normality assumption is provided for this sample.
# H1 : Normality assumption is not provided for this sample.
"""

hf.hypothesis_test(
    shapiro(A))  # P-value = 0.5891, so that H0 can NOT be REJECTED!
hf.hypothesis_test(
    shapiro(B))  # P-value = 0.1541, so that H0 can NOT be REJECTED!

#Normality assumption is provided for both samples.

# 2.2 Variance Homogeneity Assumption (levene)
"""
# Defining the hypotheses for the variance homogeneity assumption.
# H0 : Variance homogeneity assumption is provided.
# H1 : Variance homogeneity assumption is NOT provided.
"""

hf.hypothesis_test(stats.levene(
    A, B))  # P-value = 0.1083, so that H0 can NOT be REJECTED!
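Assuming both checks pass, the next step in an A/B comparison like this is usually an independent two-sample t-test; a minimal sketch with invented samples A and B (the `hf.hypothesis_test` helper is not reproduced here):

import numpy as np
from scipy import stats
from scipy.stats import shapiro

rng = np.random.default_rng(7)
A = rng.normal(550, 40, size=40)   # Purchase values for group A (illustrative)
B = rng.normal(570, 40, size=40)   # Purchase values for group B (illustrative)

if shapiro(A)[1] > 0.05 and shapiro(B)[1] > 0.05 and stats.levene(A, B)[1] > 0.05:
    stat, p = stats.ttest_ind(A, B, equal_var=True)    # assumptions hold -> Student's t-test
else:
    stat, p = stats.mannwhitneyu(A, B)                 # non-parametric fallback
print(stat, p)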
def q1():
    # Retorne aqui o resultado da questão 1.
    p_value = sct.shapiro(sample_height)[1]
    return bool(p_value > 0.05)
Example #36
def q1():
    _, pvalue = sct.shapiro(get_sample(athletes, 'height', n=3000))
    return pvalue > 0.05
                interaction_len.append(max(times) if len(times) == 2 else sum(times[1:]))

        interaction_seq.append(interaction_count)
        appear_seq.append(appear_count)

    return [interaction_seq[i] / appear_seq[i] for i in range (len(interaction_seq))], \
        interaction_len

inter_per_class = {0:[], 1:[]}
propor_b, len_b = interact_length(inter_before, labels[:197])
propor_a, len_a = interact_length(inter_after, labels[197:])

# %%
# Normality | Non-parametric tests
import scipy.stats as stats
statistics_b, pvals = stats.shapiro(propor_b)
print (f'p-value (Shapiro Before): {pvals}, S: {statistics_b}')
print (f'df: {len(propor_b)}')

statistics_a, pvals = stats.shapiro(propor_a)
print (f'p-value (Shapiro After): {pvals}, S: {statistics_a}')
print (f'df: {len(propor_a)}')

statistics_u, pvals = stats.mannwhitneyu(propor_b, propor_a, alternative = 'less')
print ('p-value (Mann-Whitney U test): \t', pvals)
print (f'Before: {np.median(propor_b)}, After: {np.median(propor_a)}')
print (f'U: {statistics_u}')

# %%
df = pd.DataFrame(columns = ['Proportion', 'Treatment', 'Color'])
df['Treatment'] = ['Dataset 1'] * len(propor_b) + ['Dataset 2'] * len(propor_a)
         y_pred_L = gauss_to_pi(y_pred_gauss_mid_all, y_pred_gauss_dev_all, n_std_devs)

    # work out metrics
    y_U_cap = y_pred_U > y_val.reshape(-1)
    y_L_cap = y_pred_L < y_val.reshape(-1)
    y_all_cap = y_U_cap * y_L_cap
    PICP = np.sum(y_all_cap) / y_L_cap.shape[0]
    MPIW = np.mean(y_pred_U - y_pred_L)
    y_pred_mid = np.mean((y_pred_U, y_pred_L), axis=0)
    MSE = np.mean(np.square(Gen.scale_c * (y_pred_mid - y_val[:, 0])))
    RMSE = np.sqrt(MSE)
    CWC = np_QD_loss(y_val, y_pred_L, y_pred_U, alpha, soften, lambda_in)
    neg_log_like = gauss_neg_log_like(y_val, y_pred_gauss_mid,
                                      y_pred_gauss_dev, Gen.scale_c)
    residuals = y_pred_mid - y_val[:, 0]
    shapiro_W, shapiro_p = stats.shapiro(residuals[:])
    results_runs.append(
        (PICP, MPIW, CWC, RMSE, neg_log_like, shapiro_W, shapiro_p))

    # concatenate for graphs
    title = 'PICP=' + str(round(PICP,3))\
       + ', MPIW=' + str(round(MPIW,3)) \
       + ', qd_loss=' + str(round(CWC,3)) \
       + ', NLL=' + str(round(neg_log_like,3)) \
       + ', alpha=' + str(alpha) \
       + ', loss=' + NN.loss_type \
       + ', data=' + type_in + ',' \
       + '\nh_size=' + str(NN.h_size) \
       + ', bstraps=' + str(n_bootstraps) \
       + ', ensemb=' + str(n_ensemble) \
       + ', RMSE=' + str(round(RMSE,3)) \
Example #39
        n, then a must have length n/2.
    reta : bool, optional
        Whether or not to return the internally computed a values.  The
        default is False.
    
    Returns
    -------
    W : float
        The test statistic.
    p-value : float
        The p-value for the hypothesis test.
    a : array_like, optional
        If `reta` is True, then these are the internally computed "a"
        values that may be passed into this function on future calls.
"""
from scipy import stats
from matplotlib.finance import quotes_historical_yahoo
import numpy as np

ticker = 'IBM'
begdate = (2009, 1, 1)
enddate = (2013, 12, 31)
p = quotes_historical_yahoo(ticker,
                            begdate,
                            enddate,
                            asobject=True,
                            adjusted=True)
ret = (p.aclose[1:] - p.aclose[:-1]) / p.aclose[1:]
print 'ticker=', ticker, 'W-test, and P-value'
print stats.shapiro(ret)
Example #40
### Normality Tests
##### Histogram
In repository - Python-DataScience-CookBook/Exploratory Data Analysis.py
import seaborn as sns
sns.distplot(Df.Var.dropna())
##### Q-Q Plot
import numpy as np 
import pylab 
import scipy.stats as stats
stats.probplot(Df.Var, dist="norm", plot=pylab)
pylab.show()
##### Normal Test
k2, p = stats.normaltest(Energy.x) # k2 is the statistic value; p-value > 0.05 implies the data are normally distributed
##### Shapiro-Wilk Test - https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html
from scipy import stats
w,p = stats.shapiro(Df.Var)
##### Kolmogorov-Smirnov Test - goodness-of-fit test against a normal distribution
stats.kstest(Df.Var,'norm')
##### Anderson-Darling Test
stats.anderson(Df.Var, dist='norm')

### Correlation Tests
# H0: Two samples are independent
# H1: There is a dependency between the samples              
##### Pearson’s Correlation Coefficient
corr, p = pearsonr(Df.Var1, Df.Var2)       
##### Spearman’s Rank Correlation
corr, p = spearmanr(Df.Var1, Df.Var2)                    
##### Kendall’s Rank Correlation
corr, p = kendalltau(Df.Var1, Df.Var2)     
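A self-contained sketch tying the normality and correlation checks together on synthetic data (the column names `Var1`/`Var2` are invented):

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import pearsonr, spearmanr, kendalltau

rng = np.random.default_rng(8)
Df = pd.DataFrame({'Var1': rng.normal(size=100)})
Df['Var2'] = 0.7 * Df['Var1'] + rng.normal(scale=0.5, size=100)

print('Shapiro-Wilk Var1:', stats.shapiro(Df['Var1']))
print('Pearson :', pearsonr(Df['Var1'], Df['Var2']))
print('Spearman:', spearmanr(Df['Var1'], Df['Var2']))
print('Kendall :', kendalltau(Df['Var1'], Df['Var2']))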
               
df = df.dropna()
df = df.iloc[:, 2:].apply(
    lambda x: x.astype(str).str.replace(',', '.').astype(float))

# Examine the correlation between GDP per capita and phone usage rate
sns.regplot(x='GDP ($ per capita)', y='Phones (per 1000)', data=df)
plt.show()
'''
It looks somewhat like a linear relationship, but the variance changes with the value of the
variable, so the data do not look homoscedastic. We should also check whether the two variables
are approximately normally distributed. scipy.stats offers several normality tests, such as
normaltest(), shapiro() and kstest(rvs='norm'); here we use shapiro() to test whether GDP per
capita and phone usage rate in each country follow a normal distribution.
Null hypothesis: the sample comes from a normally distributed population.
Alternative hypothesis: the sample does not come from a normally distributed population.
'''
print(stats.shapiro(df['GDP ($ per capita)']))
# (0.8052586317062378, 3.5005310282387736e-14)
print(stats.shapiro(df['Phones (per 1000)']))
# (0.8678628206253052, 2.0484371143769664e-11)
# The result is a tuple of the W statistic and the p-value. Both p-values are extremely
# small (close to 0), so we can reject the null hypothesis.
# We conclude that neither GDP per capita nor phone usage rate is normally distributed.

# Compute the correlation coefficient with pandas
'''
Low correlation:      0 <= |r| <= 0.3
Moderate correlation: 0.3 <= |r| <= 0.8
High correlation:     0.8 <= |r| <= 1
'''
# Since neither GDP per capita nor phone usage rate is normally distributed,
# the Pearson correlation is not strictly appropriate here
print(df['GDP ($ per capita)'].corr(df['Phones (per 1000)'], method='pearson'))
# 0.88352010541116632
###########################1###########################

import pandas as pd
import scipy 
from scipy import stats

cutlets=pd.read_csv("C:\\Users\\jzsim\\Downloads\\Cutlets.csv")

# as there are 2 populations here compared with each other,
# check whether both follow a normal distribution or not,
# doing the same with the Shapiro test
#H0 : Following a normal distribution
#Ha : Not following a normal distribution
print(stats.shapiro(cutlets['Unit A'])) 
# p Value: 0.3199819028377533
#as P value is greater than 0.05
# P high Null Fly
print(stats.shapiro(cutlets['Unit B'])) 
# p Value: 0.3199819028377533
#as P value is greater than 0.05
# P high Null Fly

#AS BOTH P VALUES ARE GREATER THAN 0.05, P HIGH NULL FLY
#DATA IS FOLLOWING NORMAL DISTRIBUTION
 
#are external conditions same --> No


# Checking Variances are equal or not
#H0 : Variances are equal
Example #43
def resid_proc(reis, remove_zero_wt, grpfiles, pareto, groups_rei):
    print "aggregating statistics and plotting by observation group..."
    print "PEST iteration:"
    for cf in reis:
        print '{0} '.format(cf),
        infile = reis[cf]

        # open a pointer to the output file
        rei_summary_folder='residuals_summaries'
        if not os.path.exists(rei_summary_folder):
            os.makedirs(rei_summary_folder)
            
        ofp = open(os.path.join(rei_summary_folder,infile + '_residuals_summary.dat'),'w')
        ofp.write('Residuals Summary information for -> ' + infile + '\n')

        # read in the data
        alldat = np.genfromtxt(infile,names=True,skip_header=4,dtype=None)
                # if processing PEST pareto results, read in groups from another REI
        if pareto:
            try:
                rei_groups_df = pd.read_csv(groups_rei, delim_whitespace=True, skiprows=6, index_col='Name')
                #if np.isnan(np.max(rei_groups_df.ix[:,0])):
                    #rei_groups_df = rei_groups_df[rei_groups_df.columns[1:]]
                    # for observations that were read in, reassign the entry in 'Group' column to group from other REI
                for observation in alldat:
                    observation['Group'] = rei_groups_df.ix[observation['Name'], 'Group']

            except IOError:
                print "Cannot open {0}. Please provide an non-pareto REI file so that observations can be analyzed by group."
                quit()
        # find the unique list of groups by which plots and stats will be managed
        allgrps = np.unique(alldat['Group'])
        allgrps = [g for g in allgrps if 'regul' not in g]    
        
        # loop over the groups
        for cg in allgrps:
            # identify indices of the current group
            tmpinds = np.nonzero(alldat['Group']==cg)[0]
            if remove_zero_wt:
                inds = tmpinds[np.nonzero(alldat['Weight'][tmpinds] != 0)]
                
                # not sure what the "remove_zero_weight" option is for, but for groups
                # that are zero weighted, it results in an empty "inds" array, causing python to crash
                if len(inds)==0:
                    inds = tmpinds
            else:
                inds = tmpinds
            # pull out the measured values for the group
            cmeas = alldat['Measured'][inds]
            # pull out the modeled values for the group
            cmod =  alldat['Modelled'][inds]
            
            #get some values to limit plotting areas
            try:
                cmin = np.min([cmeas,cmod])
                cmax = np.max([cmeas,cmod])
            # if the last rei is from an iteration where PEST failed, will have unreasonable values (i.e. -1e300)
            # that will cause a TypeError here
            except TypeError:
                continue


        
            # now calculate statistics on the residuals
            
            # first grab the residuals
            cres = alldat['Residual'][inds]
            
            # next calculate the relevant statistics and write to the output file
            cmean = np.mean(cres)
            cstd  = np.std(cres)
            cvar  = np.var(cres)
            cmed  = np.median(cres)
            cmin  = np.min(cres)
            camin = np.min(np.abs(cres))
            cmax  = np.max(cres)
            camax = np.max(np.abs(cres))

            if len(grpfiles) > 1:
            # make a plot of modeled vs. measured
                plt.figure()
                plt.hold = True

                plt.plot(cmeas,cmod,'bx')
                plt.plot([cmin,cmax],[cmin,cmax],'r')
                plt.title('Observation Group "%s", PEST iteration %s' %(cg, cf))
                plt.xlabel('Measured')
                plt.ylabel('Modeled')
                # append the histograms into the proper PDF file
                grpfiles[cg][0].savefig()
                #plt.close()

            # finally plot the histogram and save it
            fig = plt.figure()
            ax = fig.add_subplot(111)
            n, bins, patches = ax.hist(cres, 50, facecolor='blue', alpha=0.75)
            ax.set_xlabel('Residual Value')
            ax.set_ylabel('Count')
            ax.set_title(cg + ' iteration ' + str(cf))
            ax.set_xlim([cmin,cmax])
            # append the histograms into the proper PDF file
            grpfiles[cg][-1].savefig()
            #plt.close()

            # perform the Shapiro-Wilk test for normality of the residuals
            if len(cres) > 2:
                W, p = shapiro(cres)
            else:
                p = -99999
                
            # write to the summary output file
            ofp.write(25*'#' + '\n')
            ofp.write('Summary Statistics for Residuals: -> group ' + cg +'\n')
            ofp.write('%14s : %f\n' %('mean',cmean))
            ofp.write('%14s : %f\n' %('median',cmed))
            ofp.write('%14s : %f\n' %('std deviation',cstd))
            ofp.write('%14s : %f\n' %('variance',cvar))
            ofp.write('%14s : %f\n' %('min',cmin))
            ofp.write('%14s : %f\n' %('max',cmax))
            ofp.write('%14s : %f\n' %('min (absolute)',camin))
            ofp.write('%14s : %f\n' %('max (absolute)',camax))
    
            # p > 0.05: fail to reject the Shapiro-Wilk null hypothesis of normality
            if p > 0.05:
                ofp.write('Residuals are normally distributed\n')
                ofp.write('p-value = %f' %(p))
            elif p < -99:
                ofp.write('Residuals normality not calculable: Too few residuals in group\n')
            else:
                ofp.write('Residuals are not normally distributed\n')
                ofp.write('p-value = %f' %(p))
    
            ofp.write(3*'\n')
        ofp.close()
    # close the PDF files
    for cg in grpfiles:
        for i in range(len(grpfiles[cg])):
            grpfiles[cg][i].close()
Example #44
# Overlay of the regression lines for Oecanthus exclamationis (red) and Oecanthus niveus (blue)
ax1 = sns.regplot(x="TempEx", y="ImpulsionEx", data=Crickets, color='r')
ax2 = sns.regplot(x="TempNiv", y="ImpulsionNiv", data=Crickets, color='b')

# The regression line for Oecanthus exclamationis lies above the line for Oecanthus niveus; this means Oecanthus exclamationis would have a higher pulse rate at any given temperature.

# The first null hypothesis of the ANCOVA is that the slopes of the regression lines are all equal, in other words that the regression lines are parallel to one another. We accept the null hypothesis that the regression lines are parallel and test the second null hypothesis: that the intercepts of the regression lines are all the same.

# The slopes are not significantly different (P = 0.25); the common slope is 3.60, which lies between the slopes of the separate lines (3.52 and 3.75). I did not manage to test this hypothesis for that part.

# ANCOVA makes the same assumptions as linear regression: normality and homoscedasticity of Y for each value of X, and independence. Let us at least check the normality assumption.

# In[202]:

# Shapiro test for Oecanthus exclamationis
stats.shapiro(model1.resid)

# W = 0.9727, p = 0.9105, so the residuals are normally distributed for Oecanthus exclamationis

# In[203]:

# Shapiro test for Oecanthus niveus
stats.shapiro(model2.resid)

# W = 0.9159, p = 0.1259, so the residuals are normally distributed for Oecanthus niveus

# Now let us run a Tukey test under the assumption that the slopes are all the same

# In[188]:

from statsmodels.stats.multicomp import pairwise_tukeyhsd
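The example stops at the import; a hedged sketch of what a pairwise_tukeyhsd call typically looks like (the values and group labels below are invented, not the cricket data):

import numpy as np
from statsmodels.stats.multicomp import pairwise_tukeyhsd

rng = np.random.default_rng(9)
values = np.concatenate([rng.normal(67, 3, 30), rng.normal(74, 3, 30)])
groups = ['exclamationis'] * 30 + ['niveus'] * 30

print(pairwise_tukeyhsd(endog=values, groups=groups, alpha=0.05))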
Example #45
def meta_process(tau):
    '''
    Main processing kernel
    '''
    print('Analyzing tau (ms): ', tau)

    import warnings
    warnings.filterwarnings('ignore')

    # Folder where you store the PLT positions (center of mass - COM) per DNS time steps
    data_location = which_bodies + '_tau_' + str(tau) + '/'

    numBodies = 0
    Bodies = []
    for log in sorted(glob.glob(data_location + '*_ComPos.log'), key=numericalSort):
        Bodies.append(numBodies)
        numBodies += 1

    # Build the data frames and fill them
    absolute_pos  = pd.DataFrame(columns=Bodies, dtype=np.float64)
    distFromWalls = pd.DataFrame(columns=Bodies, dtype=np.float64)
    MSD           = pd.DataFrame(columns=Bodies, dtype=np.float64)
    
    # Perform distributions checking in zones
    zones_vels          = []
    zones_distros       = []
    zones_MSD           = []
    zones_distFromWalls = []
    for z in range(zones_):
        zones_distros.append(np.array([], dtype=np.float64))
        zones_vels.append(np.array([], dtype=np.float64))
        zones_MSD.append(np.array([], dtype=np.float64))
        zones_distFromWalls.append(np.array([], dtype=np.float64))

    # Var that help us find the mean free path/ time (MFP/T) in comparison with the ground truth (gT, path from DNS)
    integrals_tau = []
    
    numBodies = 0
    for log in sorted(glob.glob(data_location + '*_ComPos.log'), key=numericalSort):

        df = pd.read_csv(log, delimiter=',', header=None, names=names_, usecols=usecols_, dtype={'t': np.float64, 'y': np.float64, 'z': np.float64})

        # Time in the original files interprets to how many DNS fluid time steps,
        # this is why we multiply here with DNS fluid time step to convert it into physical time in ms
        df = df.loc[df['t']*dt_f >= From_]
        df = df.loc[df['t']*dt_f <= To_]
        df = df.reset_index(drop=True)

        absolute_pos[numBodies] = df['y'].copy()

        if (do_what == 'MFP'):
            integrals_tau.append(np.trapz(df['y'], df['t']*dt_f))
        
        if ( (do_what == 'distros') or (do_what == 'MSD') or (do_what == 'distFromWalls') ):

            MSD[numBodies] =  pd.Series((df['y'] - df['y'].iloc[0]) * (df['y'] - df['y'].iloc[0]))

            distFromWalls[numBodies] = df['y'].apply(lambda y: (y - bottom_wall) if ( (y - bottom_wall) < (top_wall - y) ) else (top_wall - y))

            pos        = absolute_pos[numBodies].to_numpy()
            pos_rolled = np.roll(pos, 1)

            # velocity in um/ms
            vel = (pos - pos_rolled) / tau
            vel[0] = np.nan
            
            # Exclude erroneous jumps
            dp = np.absolute(pos-pos_rolled)
            inds = np.where(dp < ((top_wall - bottom_wall) - 5.0))
            pos = pos[inds]
            vel = vel[inds]
            
            inds = np.where((~np.isnan(vel)) & (~np.isinf(vel)))
            pos = pos[inds]
            vel = vel[inds]

            zones_tmp = np.linspace(bottom_CFL, top_CFL, zones_+1)
            for z in range(zones_):
                inds = np.where((pos >= zones_tmp[z]) & (pos < zones_tmp[z+1]))
                zones_distros[z] = np.append(zones_distros[z], vel[inds])
        
        numBodies += 1
    #######################################################################

    #######################################################################
    df_t = np.arange(From_, To_+tau, tau)
    #######################################################################

    #######################################################################
    # Compute MSD & distFromWalls per Zone
    if ( (do_what == 'distros') or (do_what == 'MSD') or (do_what == 'distFromWalls') ):

        zones_tmp = np.linspace(bottom_CFL, top_CFL, zones_+1)
        MSD_t_avg = []
        distFromWalls_t_avg = []
        for z in range(zones_):
            MSD_t_avg.append([])
            distFromWalls_t_avg.append([])

        for i, t_ in enumerate(df_t):
            for b_ in range(len(Bodies)):
                try:
                    pos = absolute_pos[b_].iloc[i]
                except:
                    continue

                for z in range(zones_):
                    if ( (pos >= zones_tmp[z]) and (pos < zones_tmp[z+1]) ):
                        MSD_t_avg[z].append(MSD[b_].iloc[i])
                        distFromWalls_t_avg[z].append(distFromWalls[b_].iloc[i])
            
            for z in range(zones_):
                # If no particles in the zone, then np.mean returns nan
                zones_MSD[z] = np.append(zones_MSD[z], np.mean(MSD_t_avg[z]))
                MSD_t_avg[z] = []

                zones_distFromWalls[z] = np.append(zones_distFromWalls[z], np.mean(distFromWalls_t_avg[z]))
                distFromWalls_t_avg[z] = []
        
        # Cleaning
        for z in range(zones_):
            if (np.where(np.isnan(zones_MSD[z]))[0].shape[0] != 0):
                zones_MSD[z]           = zones_MSD[z][:np.where(np.isnan(zones_MSD[z]))[0][0]]
            if (np.where(np.isnan(zones_distFromWalls[z]))[0].shape[0] != 0):
                zones_distFromWalls[z] = zones_distFromWalls[z][:np.where(np.isnan(zones_distFromWalls[z]))[0][0]]

        for z in range(zones_):
            
            from scipy import optimize
            # non_linear fitting
            def non_linear_(x, a, b):
                return a*np.power(x, b)
            # linear fitting
            def linear_(x, a, b):
                return a*x + b

            # MSD
            Y = zones_MSD[z]
            X = np.copy(df_t)[:Y.shape[0]]
            X -= From_

            best_vals_non_linear, _ = optimize.curve_fit(non_linear_, X, Y)
            #best_vals_linear    , _ = optimize.curve_fit(linear_    , X, Y)

            zones_MSD[z] = tuple(best_vals_non_linear)

            if (do_what == 'MSD'):
                # Dump data
                #np.savetxt(fName_ + '.csv', np.array(list(zip(X,Y))), delimiter=',')
                plt.plot(X,Y)
                plt.plot(X, non_linear_(X, *best_vals_non_linear), linestyle='--', label='non-linear (a*x^b), params as [a,b] : '+str(best_vals_non_linear))
                #plt.plot(X, linear_(X, *best_vals_linear)        , linestyle='--', label='linear (a*x + b), params as [a,b] : '+str(best_vals_linear))
                plt.legend()
                plt.show()


            # distFromWalls
            Y = zones_distFromWalls[z]
            X = np.copy(df_t)[:Y.shape[0]]
            X -= From_
            
            #best_vals_non_linear, _ = optimize.curve_fit(non_linear_, X, Y)
            best_vals_linear    , _ = optimize.curve_fit(linear_    , X, Y)
            
            zones_distFromWalls[z] = tuple(best_vals_linear)

            if (do_what == 'distFromWalls'):
                # Dump data
                #np.savetxt(fName_ + '.csv', np.array(list(zip(X,Y))), delimiter=',')
                plt.plot(X,Y)
                #plt.plot(X, non_linear_(X, *best_vals_non_linear), linestyle='--', label='non-linear (a*x^b), params as [a,b] : '+str(best_vals_non_linear))
                plt.plot(X, linear_(X, *best_vals_linear)        , linestyle='--', label='linear (a*x + b), params as [a,b] : '+str(best_vals_linear))
                plt.legend()
                plt.show()
    #######################################################################

    #######################################################################
    if (do_what == 'distros'):

        # significance level for p-values
        sign_lvl = 0.1

        # For the PLT random walk simulations
        distros_invECDF = []
        distros_tail    = []
        xmins           = []

        zones_tmp = np.linspace(bottom_CFL, top_CFL, zones_+1)
        for z in range(zones_):
            print("#######################################################################")
            print("Zone ", z)
            print("Limits: (", zones_tmp[z], ",", zones_tmp[z+1], ")")
            print('------------------------------------------------------------')

            data = np.absolute(zones_distros[z])
            print("Mean absolute velocity (current zone) [um/ms]                             : ", np.mean(data))
            print("Diffusion Coefficient (v^2*dt*0.5) [um^2/ms]                              : ", (np.mean(data)**2.)*tau*0.5)
            print("MSD non-linear fitting (a*x^b), params as (a,b) [um^2,ms]                 : ", zones_MSD[z])
            print("Avg Distance from Walls linear fitting (a*x + b), params as (a,b) [um,ms] : ", zones_distFromWalls[z])

            print('------------------------------------------------------------')
            print("Checking for sign.")
            data = zones_distros[z]

            sign_ = np.sign(data)
            positive_ = sign_[sign_ > 0.]
            negative_ = sign_[sign_ < 0.]

            print('Positive velocities (%) : ' , round(positive_.shape[0]/sign_.shape[0], 2) * 100.)
            print('Negative velocities (%) : ' , round(negative_.shape[0]/sign_.shape[0], 2) * 100.)

            print('------------------------------------------------------------')
            print("Checking for normality.")
            
            not_normal = 0
            normal     = 0

            # Shapiro-Wilk Test
            stat, p = stats.shapiro(data)
            if (p > sign_lvl):
                normal += 1
            else:
                not_normal += 1

            # D’Agostino’s K^2 Test
            stat, p = stats.normaltest(data)
            if (p > sign_lvl):
                normal += 1
            else:
                not_normal += 1

            # Anderson-Darling Test
            result = stats.anderson(data)
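            # result.significance_level lists the levels (15, 10, 5, 2.5, 1 %)
            # corresponding to each entry of result.critical_values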
            for i in range(len(result.critical_values)):
                if result.statistic < result.critical_values[i]:
                    normal += 1
                else:
                    not_normal += 1

            kurt = stats.kurtosis(data)
            print('kurtosis of dataset (whole range, i.e., body & tail) : ', kurt)
            print('Number of successful normality tests                 : ', normal)
            print('Number of failed normality tests                     : ', not_normal)

            print("End of Checking for normality.")

            print('------------------------------------------------------------')
            print("Analyze the tail of the distribution.")
            
            data = np.absolute(zones_distros[z])

            from statsmodels.distributions.empirical_distribution import ECDF, monotone_fn_inverter
            data.sort() # in-place sorting
            ecdf = ECDF(data)
            inv_ecdf = monotone_fn_inverter(ecdf, data)
            distros_invECDF.append({'inv_ecdf':inv_ecdf, 'lb':ecdf(np.min(data)), 'ub':ecdf(np.max(data))})
            
            #######################################################################
            tail_P = 0.90 # no need to search the whole domain for the lower bound (x_min). Search from the 90th percentile and above.
            print("Number of samples to do statistics (whole range, i.e., body & tail) : ", data.shape[0])
            print("Number of samples to do statistics (tail-only)                      : ", data[data >= inv_ecdf(tail_P)].shape[0])
            #######################################################################

            print('------------------------------------------------------------')
            # https://en.wikipedia.org/wiki/Heavy-tailed_distribution#Common_heavy-tailed_distributions
            # We focus on fat-tails and more specifically on power laws (see paper for more)
            # heavy-tails term: kept it for legacy reasons
            wikipedia_heavy_tailed_distros = [
                'halfcauchy',
                'burr12', 'burr',
                'pareto',
                'lognorm',
                'weibull_min',
                'fisk',
                'invweibull',
                'levy',
                'invgauss' # see Klaus_2011 (Statistical Analyses Support Power Law Distributions Found in Neuronal Avalanches)
            ]

            handpicked_distros = wikipedia_heavy_tailed_distros + ['expon', 'halfnorm']

            for dist_name in handpicked_distros:
                print(dist_name)
                distro = getattr(stats, dist_name)

                '''
                if (distro.numargs >= 2):
                    print('Skip distro.')
                    print('Avoid overfitting from distros with multiple parameters (numargs >= 2).')
                    print('------------------------------------------------------------')
                    continue
                '''

                # distro.a / distro.b are the support bounds of the scipy distribution;
                # keep only distributions supported on [0, +inf)
                if ( (distro.a < 0.) or (distro.b != np.inf) ):
                    print('Skip distro.')
                    print('Bounds not appropriate.')
                    print('------------------------------------------------------------')
                    continue

                #######################################################################
                # Optimal fitting
                # Computationally expensive part!
                if (dist_name != 'halfnorm'):
                    xmin_optimal = find_xminOpt_distro(data[data >= inv_ecdf(tail_P)], dist_name)
                else:
                    xmin_optimal = 0.
                #######################################################################

                #######################################################################
                # Relaxed fitting based on optimal one
                # When ecdf(xmin_opt) > 95%, it's a good idea to try a relaxed version
                # at ecdf(xmin_opt) ~ 90%
                if (dist_name != 'halfnorm'):
                    # round down
                    tail_i = 0.04
                    xmin_relaxed_lb = inv_ecdf( (int(ecdf(xmin_optimal)*10.) / 10.) - tail_i/2. )
                    xmin_relaxed_ub = inv_ecdf( (int(ecdf(xmin_optimal)*10.) / 10.) + tail_i/2. )
                    # More educated choice of xmin_relaxed
                    xmin_relaxed = find_xminOpt_distro(data[data >= xmin_relaxed_lb], dist_name, xmin_relaxed_ub)
                else:
                    xmin_relaxed = 0.
                #######################################################################

                data_optimal = data[data >= xmin_optimal]
                params_optimal = distro.fit(data_optimal)

                data_relaxed = data[data >= xmin_relaxed]
                params_relaxed = distro.fit(data_relaxed)

                #*** KS-test
                p_val_optimal = stats.kstest(data_optimal, dist_name, params_optimal)[1]
                p_val_relaxed = stats.kstest(data_relaxed, dist_name, params_relaxed)[1]
                #***

                strongly_rejected_opt = False
                negative_d = 'None'
                negative_p = 1.
                for dist_name_ in handpicked_distros:
                    if (dist_name_ == dist_name):
                        continue
                    # Check dist_name vs dist_name_
                    # Which model is better fit
                    LLR, p = LLR_test(data_optimal, dist_name, dist_name_)
                    if ( (LLR < 0.) and (p < negative_p) ):
                        negative_d = dist_name_
                        negative_p = p

                # significance lvl as in Klaus_2011 (Statistical Analyses Support Power Law Distributions Found in Neuronal Avalanches)
                if ( negative_p < 0.01 ):
                    strongly_rejected_opt = True

                print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.')
                print('Optimal fitting                             ')
                print('Number of samples xmin_optimal            : ', data_optimal.shape[0])
                print('params_optimal                            : ', params_optimal)
                print('xmin_optimal                              : ', xmin_optimal)
                print('ecdf(xmin_optimal)                        : ', round(ecdf(xmin_optimal)*100, 2), ' (%)')
                print('(p-val) kstest - tail only - xmin_optimal : ', round(p_val_optimal, 2))
                print(' . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .')
                print('strongly_rejected                         : ', 'True' if (strongly_rejected_opt) else 'False')
                print('As good as possible alternative (dist,p)  : ', (negative_d, round(negative_p,5)))

                print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.')
                print('Relaxed fitting                             ')
                print('Number of samples xmin_relaxed            : ', data_relaxed.shape[0])
                print('params_relaxed                            : ', params_relaxed)
                print('xmin_relaxed                              : ', xmin_relaxed)
                print('ecdf(xmin_relaxed)                        : ', round(ecdf(xmin_relaxed)*100, 2), ' (%)')
                print('(p-val) kstest - tail only - xmin_relaxed : ', round(p_val_relaxed, 2))

                print(' . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .')
                relaxed_accept = 0
                repeat_ = 2500 # See Clauset_2009 (Power-Law Distributions in Empirical Data)
                for _ in range(repeat_):
                    synthetic_data = inv_ecdf(np.random.uniform(ecdf(np.min(data)), ecdf(np.max(data)), size=data.shape[0]))
                    toCompare_with = inv_ecdf(np.random.uniform(ecdf(np.min(data)), ecdf(np.max(data)), size=data.shape[0]))

                    # 1. optimal model: simulates the observed data with the ecdf up to xmin_optimal and then with the selected distro
                    # 2. relaxed model: simulates the observed data with the ecdf up to xmin_relaxed and then with the selected distro
                    # The reference model is the optimal one.
                    
                    optimal_model = np.copy(synthetic_data)
                    inds = np.where(optimal_model >= xmin_optimal)
                    optimal_model[inds] = distro.rvs(*params_optimal, size=inds[0].shape[0])
                    optimal_model = optimal_model[ (~np.isnan(optimal_model)) & (~np.isinf(optimal_model)) ]
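                    # Discard unphysically large per-step displacements (same jump threshold as above, expressed in um/ms)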
                    optimal_model = optimal_model[ optimal_model < (((top_wall - bottom_wall) - 5.0) / tau) ]
                    D_opt = astats.kuiper_two(toCompare_with, optimal_model)[0]

                    relaxed_model = np.copy(synthetic_data)
                    inds = np.where(relaxed_model >= xmin_relaxed)
                    relaxed_model[inds] = distro.rvs(*params_relaxed, size=inds[0].shape[0])
                    relaxed_model = relaxed_model[ (~np.isnan(relaxed_model)) & (~np.isinf(relaxed_model)) ]
                    relaxed_model = relaxed_model[ relaxed_model < (((top_wall - bottom_wall) - 5.0) / tau) ]
                    D_rel = astats.kuiper_two(toCompare_with, relaxed_model)[0]

                    if (D_rel <= D_opt):
                        relaxed_accept += 1

                p_val_relaxed = round(relaxed_accept/repeat_, 2)
                print('p-val of relaxed model                    : ', p_val_relaxed)

                print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.')
                print('One-Zone Simulation for optimal model.')
                # int(4808*0.82): 4808 number of activated PLTs per ul (see Chopard_2017 - A physical description of the adhesion and aggregation of platelets). We deal with 0.82ul -> 4808*0.82
                # tau is the time step of the random walks in ms
                # 820um is the height of Impact-R PLT function analyser (and thus the *0.82)
                PLTs_ = PLTs(int(4808*0.82), tau, 820.0, [{'inv_ecdf':inv_ecdf, 'lb':ecdf(np.min(data)), 'ub':ecdf(np.max(data))}], [{'distro':distro, 'params':params_optimal}], [xmin_optimal])
                try:
                    PLTs_.advance(int(20000/tau))
                    depositedPLTs_opt = int(PLTs_.depositedPLTs()/0.82)
                    MSD_fitting_prms, distFromWalls_prms = PLTs_.meta_quantities()
                except:
                    depositedPLTs_opt = 0
                    MSD_fitting_prms, distFromWalls_prms  = (), ()
                print('deposited PLTs (per uL)                   : ', depositedPLTs_opt)
                print('MSD non-linear fitting [um^2,ms]          : ', MSD_fitting_prms)
                print("Avg Dist Walls linear fitting [um,ms]     : ", distFromWalls_prms)
                
                print('_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._.')
                print('One-Zone Simulation for relaxed model.')
                PLTs_ = PLTs(int(4808*0.82), tau, 820.0, [{'inv_ecdf':inv_ecdf, 'lb':ecdf(np.min(data)), 'ub':ecdf(np.max(data))}], [{'distro':distro, 'params':params_relaxed}], [xmin_relaxed])
                try:
                    PLTs_.advance(int(20000/tau))
                    depositedPLTs_rel = int(PLTs_.depositedPLTs()/0.82)
                    MSD_fitting_prms, distFromWalls_prms = PLTs_.meta_quantities()
                except:
                    depositedPLTs_rel = 0
                    MSD_fitting_prms, distFromWalls_prms  = (), ()
                print('deposited PLTs (per uL)                   : ', depositedPLTs_rel)
                print('MSD non-linear fitting [um^2,ms]          : ', MSD_fitting_prms)
                print("Avg Dist Walls linear fitting [um,ms]     : ", distFromWalls_prms)

                print('------------------------------------------------------------')
  
            print("#######################################################################")
    #######################################################################

    #######################################################################
    if (do_what == 'MFP'):
        avg_ = 0.
        for PLT in Bodies:
            avg_ += (abs(integrals_gT[PLT] - integrals_tau[PLT]) / abs(integrals_gT[PLT])) * 100.
        avg_ /= numBodies
        ground_truth_diff.append(avg_)
Example #46
0
st.probplot(mdf.resid, plot=ax)
plt.show()

fig = plt.figure(figsize=(16, 9))
ax = sns.distplot(mdf.resid,
                  hist=False,
                  kde_kws={
                      "shade": True,
                      "lw": 1
                  },
                  fit=st.norm)
ax.set_xlabel("Residuals")
plt.show()

labels = ["Statistic", "p-value"]
norm_res = st.shapiro(mdf.resid)
for key, val in dict(zip(labels, norm_res)).items():
    print(key, val)

fig = plt.figure(figsize=(16, 9))
ax = sns.scatterplot(y=mdf.resid, x=mdf.fittedvalues)
ax.set_xlabel("Fitted Values")
ax.set_ylabel("Residuals")
plt.show()

# White's test for heteroscedasticity: regresses the squared residuals on the model's exogenous variables
het_white_res = het_white(mdf.resid, mdf.model.exog)
labels = ["LM Statistic", "LM-Test p-value", "F-Statistic", "F-Test p-value"]
for key, val in dict(zip(labels, het_white_res)).items():
    print(key, val)

fig = plt.figure(figsize=(16, 9))
Example #47
0
def frequency_increment_test(time, values, clip=True):
    Y = frequency_increment_values(time, values, clip=clip)
    T, Tp = stats.ttest_1samp(Y, 0)
    W, Wp = stats.shapiro(Y)
    return {"T": T, "Tp": Tp, "W": W, "Wp": Wp}
for i, test_region in enumerate(test_regions):

    print(test_region)
    DATA_region = DATA_TS[(DATA_TS['Region'] == test_region)
                          & (DATA_TS['Year'] < 2011) &
                          (DATA_TS['Year'] > 1970)]

    climateData = np.array(DATA_region['Norm_ImpFix_2y_offset'])
    auto_corr = test_autocorrelation(climateData)

    if normClim is True:

        climateData = climateData / np.nanmax(climateData)

    t, shap_log = shapiro(np.log(climateData))

    t, shap_norm = shapiro(climateData)

    best_model, y, x, maxi, pearson_corr, best_loo, loos, combs = find_best_model(
        climateData, telecon)

    comb_df = pd.DataFrame(combs)
    comb_df = comb_df.T

    loo_df = pd.DataFrame(loos)
    loo_df = loo_df.T
    loo_df.columns = ['log', 'identity', 'inverse-power']
    loo_df['combination'] = comb_df.iloc[:, 0]
    # store LooCV out-of-sample-errors
    loo_df.to_csv(
# Analysis for Cluster 1:
for i in cluster1:
    #Maximum flow algorithm:
    flow_value,flow_dict = nx.maximum_flow(G, 0, i, capacity='weight')
    c1_values.append(flow_value)

    df=pd.DataFrame({'Cluster':[1],
                    'Flow_Value':flow_value})

    all_data=all_data.append(df)

mean=np.mean(c1_values)
std_dev=np.std(c1_values)

normality_test=stats.shapiro(c1_values)

print("Mean for Cluster 1:",mean)
print("Standard deviation for Cluster 1:",std_dev)
print("Normality test for Cluster 1:",normality_test,"\n")

#Histogram for cluster 1:
hist, bin_edges=np.histogram(c1_values,density=True)
first_edge, last_edge = np.min(c1_values),np.max(c1_values)

n_equal_bins = 15
bin_edges = np.linspace(start=first_edge, stop=last_edge,num=n_equal_bins + 1, endpoint=True)

plt.hist(c1_values,bins=bin_edges,rwidth=0.75)
plt.xlabel('Flow values')
plt.ylabel('Frequency')
Example #50
0
#
# The simplest transformation is Standard Scaling (or Z-score normalization):
#
# $$ \large z= \frac{x-\mu}{\sigma} $$
#
# Note that Standard Scaling does not make the distribution normal in the strict sense.

# In[ ]:

from sklearn.preprocessing import StandardScaler
from scipy.stats import beta
from scipy.stats import shapiro
import numpy as np

data = beta(1, 10).rvs(1000).reshape(-1, 1)
shapiro(data)

# In[ ]:

# Value of the statistic, p-value
shapiro(StandardScaler().fit_transform(data))

# With such p-value we'd have to reject the null hypothesis of normality of the data

# But, to some extent, it protects against outliers:

# In[ ]:

data = np.array([1, 1, 0, -1, 2, 1, 2, 3, -2, 4,
                 100]).reshape(-1, 1).astype(np.float64)
StandardScaler().fit_transform(data)
        # print(data.head())

        lm = ols(formula='percentual_k_unordered ~ algoritmo', data=data).fit()
        anova = sm.stats.anova_lm(lm, typ=2)  # Type 2 ANOVA DataFrame

        tit = ' ANOVA para Probabilidade = %s e Tamanho = %s' % (prob, tam)
        hr = '=' * 60  #len(tit)
        anov = anova.head(10)

        s = '%s\n%s\n%s\n%s\n\n' % (hr, tit, hr, anov)
        arq_destino.write(s)
        print(s)

        # write the normality test (Shapiro-Wilk) results
        s = '    * TESTE DE NORMALIDADE (SHAPIRO-WILK):\n'
        s += '      %s\n' % ('-' * (len(s) + 6))
        for alg in udata.ALGORTIMOS:
            d = data[data['algoritmo'] == alg]['percentual_k_unordered']
            W, p_value = stats.shapiro(d)
            s += '        - %s: W = %0.6f / p_value = %.6f \n' % (alg.ljust(9),
                                                                  W, p_value)
        s += '\n'
        arq_destino.write(s)
        print(s)

# close the output file
arq_destino.close()

# pr_f = anova['PR(>F)'].values[0]
# print( '%s  /  %.55f' % (pr_f, pr_f) )
Example #52
0
def runShapiroTest(data, alpha):
    stats, pValue = shapiro(data)
    print('Statistics: {} | pValue: {} | Is Parametric: {}'.format(stats, pValue, pValue > alpha))

    return pValue > alpha
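
# Hedged usage sketch (synthetic data, not from the original snippet); assumes the
# shapiro import used by runShapiroTest is already in scope:
import numpy as np
runShapiroTest(np.random.normal(loc=0.0, scale=1.0, size=200), alpha=0.05)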
print(data.Age.describe())

# Calculates the z score of each value
print(st.zscore([0.45, 23, 25, 28, 33, 60, 80]))

# convert a z score to a cumulative probability, and a probability back to a z score
print(st.norm.cdf(3.46))
print(st.norm.ppf(.95))
print(st.norm.cdf(1.64))

# Normality test
sm.qqplot(data.Age, line='45')
pylab.show()

data_no_missing = data.dropna()
stat, p = st.shapiro(data_no_missing.Age)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')

# Embarked crosstab
print(pd.crosstab(index=data["Embarked"], columns="Count"))
print(data.Embarked.isnull().sum())

# Embarked barchart
sns.countplot(x="Embarked", data=data)
plt.show()
Example #54
0
for A, B in zip(sampleA, sampleB):
    fdata.write('\n' + A + ' is sample A' + '\n' + B + ' is sample B' + '\n')
    fdata.write('RVL comparaison' + '\n')
    fdata.write(
        str(np.mean(RVLdata_[A])) + '+-' + str(np.std(RVLdata_[A])) + '\n')
    fdata.write(
        str(np.mean(RVLdata_[B])) + '+-' + str(np.std(RVLdata_[B])) + '\n')
    print 'means'

    ###T-test
    pops = []
    pops.append(RVLdata_[A])
    pops.append(RVLdata_[B])
    ###Shapiro's test for normality for sample A
    w, pnormA = stats.shapiro(np.array(RVLdata_[A]))
    if pnormA > 0.05:
        normA = True
        print 'A sample IS normally distributed'
        fdata.write('A sample IS normally distributed' + '\n')
    else:
        normA = False
        print 'A sample is NOT normally distributed'
        fdata.write('A sample is NOT normally distributed' + '\n')
    ###Shapiro's test for normality for sample B
    w, pnormB = stats.shapiro(np.array(RVLdata_[B]))
    if pnormB > 0.05:
        normB = True
        print 'B sample IS normally distributed'
        fdata.write('B sample IS normally distributed' + '\n')
    else:
import numpy as np
from scipy import stats
from scipy.stats import shapiro
import matplotlib.pyplot as plt
from pandas.core.frame import DataFrame

data = np.genfromtxt("data.csv", delimiter=",")
listp = []
listw = []
listf = []
listr = [
    "Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys", "Leu",
    "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val", "Trp", "Tyr"
]
for i in range(0, 20):
    aa = data[1:, i]
    fig = plt.figure()
    res = stats.probplot(aa, plot=plt)
    plt.show()
    w, p = shapiro(aa)
    listw.append(w)
    listp.append(p)
    if p >= 0.05:
        print("normal distribution")
        listf.append("normal distribution")
    else:
        print("abnormal distribution")
        listf.append("abnormal distribution")
    print("w:%f" % w, "p.value:%f" % p)
dic = {"residue index": listr, "W": listw, "P.value": listp, "F": listf}
output = DataFrame(dic)
print(output)
Example #56
0
 # for col in train_num.columns:
 #       train_num[col].plot.hist(title = col)
 #       s = train_num.describe()[col].to_string() + \
 #           "\nMissing Values: " + str(train_num.isnull().sum()[col]) + \
 #           "\nMissing Values %: " + str(round(train_num.isnull().sum()[col]/len(train_num),4))
 #       plt.figtext(1, 0.5, s)
 #       plt.show()
 
 droped_ttest_cols = []         
 # * Assess normality / skewness
 target = train_temp[label]
 t_sel = [0] * len(train_num.columns) # flags which variables may help predict the target
 t_ctr = 0 # counter
 for col in train_num.columns:
     # Shapiro-Wilk test
     stat, p = shapiro(train_num[col])
     #print('Statistics={:.3f}, p={:.3f}'.format(stat, p))
     
     if p > 0.05: # fail to reject H0 that these data follow an approximately Gaussian distribution
         # t-test
         # print(col)
         # split the data according to credit acceptance
         t0 = train_num[col][target == 0]
         t1 = train_num[col][target == 1]
         stat, p = ttest_ind(t0, t1, nan_policy = "omit", equal_var = False)
         # print('T-statistic={:.3f}, p={:.3f}'.format(stat, p))
         
         if p < 0.05: # reject H0 that the means of t0 and t1 do not differ significantly
             t_sel[t_ctr] = 1
         else:
             droped_ttest_cols.append(col)
Example #57
0
 plt.figure()
 plt.plot(x, gNorm, 'r-', label='Norm PDF')
 plt.plot(x, sNorm, 'g-', label='Skewed Norm PDF')
 #plt.plot(x, chiSq,'m-', label='Chi-Square PDF')
 plt.bar(bin_edges[:-1],
         hist,
         width=(max(bin_edges) - min(bin_edges)) / iters)
 plt.title('NRMSE distribution')
 plt.xlim(np.min(x), np.max(x))
 plt.legend()
 # Print Normality Test results (Variable and p-value)
 #   Kolmogorov-Smirnov
 print(stats.kstest(y, 'norm', args=(mean, var)))
 print(stats.kstest(y, 'skewnorm', args=(smean, svar, sk)))
 #   Shapiro-Wilk
 print(stats.shapiro(y))
 #   Chi-Square
 #print(stats.chisquare(hist, gNorm))
 #print(stats.chisquare(hist, sNorm))
 # Q-Q plots
 #f, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2,2)
 f, (ax1, ax2) = plt.subplots(1, 2)
 plt.title("Q-Q Plots")
 res = stats.probplot(y, dist=stats.norm(mean, var), plot=ax1)
 ax1.set_title("Normality Test (Non-Skewed)")
 resS = stats.probplot(y, dist=stats.skewnorm(smean, svar, sk), plot=ax2)
 ax2.set_title("Normality Test (Skewed)")
 #resX2 = stats.probplot(y, dist=stats.chi2(4), plot=ax3)
 #ax3.set_title("Chi-Square Test (k=4)")
 #resX2 = stats.probplot(y, dist=stats.chi2(10), plot=ax4)
 #ax4.set_title("Chi-Square Test (k=10)")
Example #58
0
#bins = 10
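# A hedged sketch (not in the original snippet) of how `bins` may have been derived
# from Sturges' rule for the assumed CV accuracy array `scores`:
# bins = int(np.ceil(1 + np.log2(len(scores))))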

print("Bins No (Sturge’s Rule): ", bins)
plt.hist(scores, bins=bins)
plt.ylabel('Probability')
plt.xlabel("Accuracy")
plt.title("Accuracy of " + model_label + " with CV=" + str(cv))
plt.show()
qqplot(scores, line='s')
plt.title("Accuracy of " + model_label + " with CV=" + str(cv))
plt.show()

alpha = 0.05

print("Shapiro-Wilk Test result:")
stat, p = shapiro(scores)
print('     Statistics=%.3f, p=%.3f, alpha=%.3f' % (stat, p, alpha))
if p > alpha:
    print('     Sample looks Gaussian (fail to reject H0)')
else:
    print('     Sample does not look Gaussian (reject H0)')

print("D’Agostino’s K^2 Test result:")
stat, p = normaltest(scores)
print('     Statistics=%.3f, p=%.3f, alpha=%.3f' % (stat, p, alpha))
if p > alpha:
    print('     Sample looks Gaussian (fail to reject H0)')
else:
    print('     Sample does not look Gaussian (reject H0)')

print("Anderson-Darling Test result:")
# Example of the Shapiro-Wilk Normality Test
from scipy.stats import shapiro
data = [
    13.83, 14.47, 14.03, 15.46, 15.61, 13.6, 15.26, 14.13, 14.41, 13.7, 14.23,
    14.49, 14.0, 13.73, 13.92, 13.82, 13.81, 13.88, 13.71, 14.08, 14.1, 13.38,
    13.69, 13.56, 13.57, 13.63, 13.59, 13.64, 13.97, 13.29, 13.72
]
stat, p = shapiro(data)
print('stat=%.3f, p=%.3f' % (stat, p))
f = open("NormalTestShapiro-Wilk.txt", "a")

if p > 0.05:
    print('Probably Gaussian')
    f.write('Probably Gaussian\n')
else:
    print('Probably not Gaussian')
    f.write('Probably not Gaussian\n')

f.write("Stat: {0}s and  p: {1}s\n".format(stat, p))

f.close()
Example #60
0
 def get_statistic_and_pvalue(self, y):
     return shapiro(y)