Example #1
def get_stats(df1, df2):
    """
    Decides whether a t-test or a Mann-Whitney U test should be applied and returns its result.
    Keyword arguments:
          * df1 -- pandas DataFrame1
          * df2 -- pandas DataFrame2
    """
    # try:
    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)
    if not df1.equals(df2):
        try:
            # perform the Anderson-Darling test to check whether the data come from a normal distribution
            sta1, crit1, sig1 = stats.anderson(df1)
            sta2, crit2, sig2 = stats.anderson(df2)

            if (sta1 <= crit1.item(4)) and (sta2 <= crit2.item(4)):
                # perform independent t-test for normally distributed data
                # (index 4 of the critical values is the 1% significance level)
                return stats.ttest_ind(df1, df2)
            else:
                try:
                    # perform Mann-Whitney U test for non-normally distributed data
                    return stats.mannwhitneyu(df1,
                                              df2,
                                              alternative='two-sided')
                except ValueError:
                    return np.nan, np.nan
        except FloatingPointError:
            return stats.mannwhitneyu(df1, df2, alternative='two-sided')
    else:
        return np.nan, np.nan
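A minimal usage sketch for the helper above (not part of the original project), with numpy, pandas and scipy.stats imported as the function expects; the two samples are invented for illustration:

import numpy as np
import pandas as pd
from scipy import stats

# two synthetic samples to compare (values are illustrative only)
rng = np.random.default_rng(0)
sample_a = pd.Series(rng.normal(10.0, 2.0, size=200))
sample_b = pd.Series(rng.normal(10.5, 2.0, size=200))

statistic, p_value = get_stats(sample_a, sample_b)
print(statistic, p_value)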
Example #2
def test_anderson(dfi:pd.Series, dist_type:"1 == Normal, 0 == Non-Normal", sample_sizes = [30,50,100,150,200,500]):
    results = {}
    for ss in sample_sizes:
        anderson_statistic_list = []
        results.update({f"anderson_critical_{ss}":anderson(dfi.sample(ss))[1][4]})
        for i in range(1,1000):
            anderson_statistic_list.append(anderson(dfi.sample(ss))[0])
        results.update({f"anderson_statistic_{ss}":anderson_statistic_list})
    for ss in sample_sizes:
        sns.distplot(results[f"anderson_statistic_{ss}"])
        plt.axvline(results[f"anderson_critical_{ss}"])
        plt.title(f"Anderson-Darling Test Results")
    plt.show()
    sns.distplot(dfi)
    plt.title("Original Data Distribution")
    for ss in sample_sizes:
        data = results[f"anderson_statistic_{ss}"]
        critical = results[f"anderson_critical_{ss}"]
        data = pd.DataFrame(data, columns=["test_statistic"])
        if dist_type == 1:
            pct_critical = len(data[data.test_statistic > critical]) / len(data)
            print(f"Sample size {ss}: {pct_critical*100}% of samples misclassified as non-normal")
        if dist_type == 0:
            pct_critical = len(data[data.test_statistic < critical]) / len(data)
            print(f"Sample size {ss}: {pct_critical*100}% of samples misclassified as normal")
Example #3
def normal_test_ad():
    abbot = pd.read_csv('ABBOTINDIA.NS.csv')
    mrf = pd.read_csv('MRF.NS.csv')
    shreecem = pd.read_csv('SHREECEM.NS.csv')

    # run the same Anderson-Darling normality check on each ticker's closing prices
    for name, df in [('Abbot', abbot), ('MRF', mrf), ('Shree Cements', shreecem)]:
        result = anderson(df["Close"])
        print('Statistic: %.3f' % result.statistic)
        for sl, cv in zip(result.significance_level, result.critical_values):
            if result.statistic < cv:
                print('%.3f: %.3f, data looks normal :: Anderson-Darling Test :: %s' % (sl, cv, name))
            else:
                print('%.3f: %.3f, data does not look normal :: Anderson-Darling Test :: %s' % (sl, cv, name))
    time.sleep(5)
Example #4
 def test_normal(self):
     rs = RandomState(1234567890)
     x1 = rs.standard_exponential(size=50)
     x2 = rs.standard_normal(size=50)
     A, crit, sig = stats.anderson(x1)
     assert_array_less(crit[:-1], A)
     A, crit, sig = stats.anderson(x2)
     assert_array_less(A, crit[-2:])
Example #5
 def test_normal(self):
     rs = RandomState(1234567890)
     x1 = rs.standard_exponential(size=50)
     x2 = rs.standard_normal(size=50)
     A,crit,sig = stats.anderson(x1)
     assert_array_less(crit[:-1], A)
     A,crit,sig = stats.anderson(x2)
     assert_array_less(A, crit[-2:])
Example #6
    def norm_cal(self, x):
        '''Calculate the normality of a single variable x. 
        
        Parameters:
        ----------
        x : numpy.ndarray
        
        Returns:
        -------
        x_res : dict
            'Statistic': statistic value calculated by the test
            'Pvalue':  p-value calculated by the test
            'Critical': critical value if Anderson-Darling is used
            'Test': name of the test used
            'Sample size': sample size of the variable 
            'Result': bool, True if the test does not reject normality, False otherwise
        
        Notes:
        -----
        More conservative cutoff numbers of 3500 and 50 are chosen
        based on the following test conventions:
        Jarque-Bera requires 2000+ samples;
        Shapiro-Wilk is accurate under 5000;
        and the common definition of a small sample size is 30'''

        x_res = {}
        if len(x) >= 3500:  # Use Jarque-Bera for samples larger than 3500
            x_res['Statistic'] = ss.jarque_bera(x)[0]
            x_res['Pvalue'] = ss.jarque_bera(x)[1]
            x_res['Test'] = 'Jarque Bera Test'
            x_res['Sample Size'] = x.shape
        elif len(x) >= 50:  # Use Shapiro-Wilk for samples in [50, 3500)
            x_res['Statistic'] = ss.shapiro(x)[0]
            x_res['Pvalue'] = ss.shapiro(x)[1]
            x_res['Test'] = 'Shapiro-Wilk Test'
            x_res['Sample Size'] = x.shape
        else:  # Use Anderson-Darling for samples with fewer than 50 points
            x_res['Statistic'] = ss.anderson(x)[0]
            x_res['Critical'] = ss.anderson(x)[1][2]
            x_res['Test'] = 'Anderson-Darling Test'
            x_res['Sample Size'] = x.shape

        if x_res['Test'] != 'Anderson-Darling Test':
            if x_res['Pvalue'] < .05:  # Fixed significance level
                x_res['Result'] = False
            else:
                x_res['Result'] = True
        else:  # Anderson-Darling result has to be specially handled
            if x_res['Critical'] < x_res['Statistic']:
                x_res['Result'] = False
            else:
                x_res['Result'] = True

        return x_res
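For reference, the Anderson-Darling branch above reduces to comparing the statistic with the 5% critical value; a standalone sketch of that check (assuming scipy.stats is imported as ss, as in the method):

import numpy as np
from scipy import stats as ss

x = np.random.normal(size=30)                         # small sample, so the A-D branch applies
result = ss.anderson(x)
print(result.statistic < result.critical_values[2])   # True -> normality not rejected at the 5% level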
Example #7
 def test_expon(self):
     rs = RandomState(1234567890)
     x1 = rs.standard_exponential(size=50)
     x2 = rs.standard_normal(size=50)
     A,crit,sig = stats.anderson(x1,'expon')
     assert_array_less(A, crit[-2:])
     olderr = np.seterr(all='ignore')
     try:
         A,crit,sig = stats.anderson(x2,'expon')
     finally:
         np.seterr(**olderr)
     assert_(A > crit[-1])
Example #8
 def test_expon(self):
     rs = RandomState(1234567890)
     x1 = rs.standard_exponential(size=50)
     x2 = rs.standard_normal(size=50)
     A, crit, sig = stats.anderson(x1, 'expon')
     assert_array_less(A, crit[-2:])
     olderr = np.seterr(all='ignore')
     try:
         A, crit, sig = stats.anderson(x2, 'expon')
     finally:
         np.seterr(**olderr)
     assert_(A > crit[-1])
Example #9
    def fit(self, X_train):
        """
        对数值列进行画像
        :param X_train: 待质量画像的数据,要求输入data全部为数值列
        :return: 数据质量画像,最大值、最小值、平均数、方差、中位数、四分位数、正太分布相似度
        """
        handling_list = self.continuous_columns_list + self.discrete_columns_list
        if not handling_list:
            warnings.warn("No feature supplied to fix outliers.")
            empty_df = DataFrame()
            self.model = empty_df
            return self

        numeric_list = X_train.select_dtypes(
            include=np.number).columns.tolist()
        if set(handling_list) > set(numeric_list):
            raise RuntimeError(
                'Features that to be handled must be numeric type.')

        describe_df = X_train[handling_list].describe().T

        # Process each numeric column in turn
        for column in describe_df.index:
            column_data = X_train[column]

            # Normality check via Anderson-Darling (at the 15% significance level)
            # References: https://blog.csdn.net/qq_20207459/article/details/102863982
            # https://www.zhihu.com/question/263864019
            anderson_result = stats.anderson(column_data, 'norm')
            describe_df.loc[
                column,
                'normal'] = anderson_result.statistic < anderson_result.critical_values[
                    0]

            describe_df.loc[column, 'lognormal'] = False
            # Normality check on the log-transformed data (at the 15% significance level)
            # Same references as above
            if describe_df.loc[column, 'min'] > 0:
                anderson_log_res = stats.anderson(np.log(column_data), 'norm')
                describe_df.loc[
                    column,
                    'lognormal'] = anderson_log_res.statistic < anderson_log_res.critical_values[
                        0]

        _ = describe_df.pop('count')
        describe_df['col_name'] = describe_df.index
        self.model = describe_df

        return self
Example #10
def check_cluster(cluster):
    n = len(cluster)
    if n < 2:
        return True, []

    # Run k_means on two centers
    children, labels, _ = k_means(cluster, 2)

    # Let v = c1 - c2 be a d-dimensional vector that connects the two centers. This is the direction that k-means
    # believes to be important for clustering.
    v = children[1]-children[0]

    # Then project X onto v: x'_i = <x_i, v> / ||v||^2. X' is a 1-dimensional
    # representation of the data projected onto v.
    x_prime = [np.dot(point, v) for point in cluster]

    # Transform X' so that it has mean 0 and variance 1.
    x_prime = zscore(x_prime)

    # Let z_i = F(x'_(i)). If A2*(Z) is in the range of non-critical values at confidence level alpha, then accept H0,
    # keep the original center, and discard {c1, c2}. Otherwise, reject H0 and keep {c1, c2} in place of the original
    # center.
    a2, critical, sig = anderson(x_prime)
    a2 *= (1+4.0/n-25.0/(n**2))

    return a2 < critical[0], children
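The k_means helper used above is not defined in the excerpt; a possible stand-in built on scikit-learn (the wrapper and its return shape are assumptions), together with a quick call, assuming numpy plus scipy.stats' zscore and anderson are imported at module level as the function requires:

import numpy as np
from sklearn.cluster import KMeans

def k_means(points, k):
    # return (centers, labels, inertia), the shape check_cluster unpacks
    km = KMeans(n_clusters=k, n_init=10).fit(points)
    return km.cluster_centers_, km.labels_, km.inertia_

rng = np.random.default_rng(0)
blob = rng.normal(size=(400, 2))     # a single Gaussian blob
keep, children = check_cluster(blob)
print(keep)                          # True -> the split is rejected and the cluster is kept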
Example #11
    def get_anderson(self):
        """
            Anderson-Darling test for data coming from a particular
            distribution.

            Returns:
                tuple: statistic value, critical values and significance values.

            Note:
                Need scipy.stats module to perform Anderson-Darling test.
        """

        try:
            from scipy import stats
        except ImportError:
            raise ImportError("Need 'scipy.stats' module to calculate "
                              "anderson-darling test.")

        error = (self.expected_targets - self.predicted_targets).flatten()

        # from matplotlib import pyplot as plt
        # import matplotlib.mlab as mlab
        #
        # plt.figure(figsize=(24.0, 12.0))
        # _, bins, _ = plt.hist(error, 50, normed=1)
        # _mu = np.mean(error)
        # _sigma = np.std(error)
        # plt.plot(bins, mlab.normpdf(bins, _mu, _sigma))
        # plt.show()
        # plt.close()

        # Calculate Anderson-Darling normality test index
        ad_statistic, ad_c, ad_s = stats.anderson(error, "norm")

        return ad_statistic, ad_c, ad_s
Example #12
def computeAD(data,mu,sd,seed):
    np.random.seed(seed)
    from skgof import ad_test
    from scipy.stats import norm, anderson
    res = ad_test(data, norm(loc=mu,scale=sd))
    res2 = anderson(data, 'norm')
    return [res.statistic, res.pvalue,res2.critical_values.tolist()]
Example #13
 def distribution(self,par,lar,lon):
     """ This method produces an histogram of the data serie and computes the value
     of the Anderson-Darling normality test (the threshold to determine the normality
     depends on the number of samples: see online documentation) """
     fig1 = plt.figure(1)
     fig1.canvas.manager.window.resizable(int(lar/2), int(lon/2))
     fig1.canvas.manager.window.wm_geometry("+0+0") 
     ax = fig1.add_subplot(111)
     lis=list(self.df[self.parameter].dropna())
     a = stats.anderson(lis)[0]
     anderson = round(a, 2)  # stats.anderson already returns the A^2 statistic
     mu = np.mean(lis) # Mean 
     sigma = np.std(lis) # Standard Deviation
     n, bins, patches = ax.hist(lis, facecolor='grey', alpha=0.5) # Histogram of the data
     y = mlab.normpdf(bins, mu, sigma) # Add a best fit line
     props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)        
     plt.text(0.05, 0.95, 'Anderson-Darling normality test ($A^{2}$): '+str(anderson)+'\nIf $A^{2}$>0.752 hypothesis of normality rejected at 95% of significance level', transform=ax.transAxes, fontsize=12, verticalalignment='top', bbox=props) # Print the result of the test        
     ax.ticklabel_format(useOffset=False)
     plt.plot(bins, y, 'r--')
     plt.xlabel('Parameter (values)',fontsize=15)
     plt.xticks(fontsize=15)
     plt.ylabel('Number of samples',fontsize=15)
     plt.yticks(fontsize=15)
     plt.title(str(par),fontsize=20)
     plt.subplots_adjust(left=0.15) # Tweak spacing to prevent clipping of ylabel
     fig1.show()
Example #14
def normality_check(data_A, data_B, name, alpha):

    if (name == "Shapiro-Wilk"):
        # Shapiro-Wilk: Perform the Shapiro-Wilk test for normality.
        shapiro_results = stats.shapiro(
            [a - b for a, b in zip(data_A, data_B)])
        return shapiro_results[1]

    elif (name == "Anderson-Darling"):
        # Anderson-Darling: Anderson-Darling test for data coming from a particular distribution
        anderson_results = stats.anderson(
            [a - b for a, b in zip(data_A, data_B)], 'norm')
        sig_level = 2
        if (float(alpha) <= 0.01):
            sig_level = 4
        elif (float(alpha) > 0.01 and float(alpha) <= 0.025):
            sig_level = 3
        elif (float(alpha) > 0.025 and float(alpha) <= 0.05):
            sig_level = 2
        elif (float(alpha) > 0.05 and float(alpha) <= 0.1):
            sig_level = 1
        else:
            sig_level = 0

        return anderson_results[1][sig_level]

    else:
        # Kolmogorov-Smirnov: Perform the Kolmogorov-Smirnov test for goodness of fit.
        ks_results = stats.kstest([a - b for a, b in zip(data_A, data_B)],
                                  'norm')
        return ks_results[1]
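A quick call of normality_check above (an illustrative sketch only), assuming scipy.stats is imported as stats in the same module; the data values are invented:

import numpy as np
from scipy import stats

data_A = list(np.random.normal(0.80, 0.02, size=40))
data_B = list(np.random.normal(0.78, 0.02, size=40))

print(normality_check(data_A, data_B, "Shapiro-Wilk", alpha=0.05))      # a p-value
# for Anderson-Darling the function returns the critical value matching alpha,
# not a p-value, so it has to be compared with the A-D statistic separately
print(normality_check(data_A, data_B, "Anderson-Darling", alpha=0.05))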
Example #15
def nuevo_regress():
    modelo = OLS(DATASET.puntaje_global, DATASET.puntaje_matematicas).fit()
    summary = modelo.summary()
    vals_residuales = modelo.resid
    print(summary)
    print(anderson(vals_residuales))
    grafica_qq(vals_residuales)
Example #16
def execute(init_param):
    """ execute is the function where each run is done. main sets parameters then calls execute"""
    
    training_data, cv_data, test_data = form_data(init_param)
    
    if init_param.output:
        output(training_data, cv_data)
    
    clf, regularization_parameter = learn(training_data, cv_data)
    
    
    """ do an Anderson Darling test on the data to determine if it is a normal fit"""
    A2, sig, crit = anderson(test_data.y, dist = 'norm')
    print("the value for A2 is ", A2)
    
    mn = np.mean(test_data.y)
    sd = np.std(test_data.y)
    print("The mean and standard deviation of the test data are ", mn, sd)
    
    
    predict_data = clf.predict(test_data.X)
    difference = predict_data - test_data.y
    mn = np.mean(difference)
    sd = np.std(difference)
    print("The mean and standard deviation of the difference are ", mn, sd)
    
    print("run finished")
Example #17
def cluster(r):

   d, n = shape(r)
   # If the dataset has at most one point, bail out to
   # indicate that we have over-fit this cluster
   if d <= 1:
       return -1, 0, None, None

   # Run k-means with k=2
   c = 2
   centroids, labels = kmeans2(r, c, minit='points')
   # Find the axis along which the two centroids lie
   v = centroids[1, :] - centroids[0, :]
   # Get the projection of each point in r along v
   x = empty((n))

   x = dot(r, v) / linalg.norm(v)

   # Mean subtraction and normalization
   mu = mean(x)
   y = (x - mu) / std(x)
   y = sort(y)

   KS_stat, KS_pval = kstest(y, 'norm')
   And_stat, _, __ = anderson(y, 'norm')

   return KS_pval, And_stat, centroids, labels
Example #18
def calc_basic_stats(t, mag, err):

    N = np.size(mag)

    # basic stats
    median = np.median(mag)
    w = err**-2
    wmean, wstd = calc_weighted_mean_std(mag, w)
    chi2red = np.sum((wmean - mag)**2 * w) / (N - 1)
    RoMS = np.sum(abs(mag - median) / err) / (N - 1)

    # deviation from median
    NormPeaktoPeakamp = calc_NormPeaktoPeakamp(mag, err)
    NormExcessVar = calc_NormExcessVar(mag, err, N, wmean)
    medianAbsDev = np.median(abs(mag - median))
    iqr = np.diff(np.percentile(mag, q=[25, 75]))[0]
    i60r = np.diff(np.percentile(mag, q=[20, 80]))[0]
    i70r = np.diff(np.percentile(mag, q=[15, 85]))[0]
    i80r = np.diff(np.percentile(mag, q=[10, 90]))[0]
    i90r = np.diff(np.percentile(mag, q=[5, 95]))[0]

    # other variability stats
    skew = 1. * N / (N - 1) / (N - 2) * np.sum(((mag - wmean) / err)**3)
    smallkurt = calc_smallkurt(mag, err, N, wmean)
    invNeumann = calc_invNeumann(t, mag, wstd)
    WelchI, StetsonJ, StetsonK = calc_Stetson(mag, err, N, wmean)
    AD = anderson(mag / err)[0]
    SW = shapiro(mag / err)[0]

    return np.r_[N, median, wmean, chi2red, RoMS, wstd, NormPeaktoPeakamp,
                 NormExcessVar, medianAbsDev, iqr, i60r, i70r, i80r, i90r,
                 skew, smallkurt, invNeumann, WelchI, StetsonJ, StetsonK, AD,
                 SW]
Example #19
def verify_normality(title, data):
    print("\nRESULTS\n")
    # Shapiro-Wilk test
    # Tests if sample came from a normal dist
    shapiroResult = stats.shapiro(data)
    print(shapiroResult)

    # Anderson-Darling test
    # Tests if data comes from a particular distribution (normal in this case)
    # Not sure how this works yet...
    andersonResult = stats.anderson(data, dist="norm")
    print(andersonResult)

    # Looks useful but haven't seen the documentation to know what it actually calculates
    print(str(stats.normaltest(data)))

    print("\nNull hypothesis : 'Data is normally distributed'.")
    if shapiroResult[1] < 0.05:
        print("Shapiro-Wilk Test: Null Rejected due to p-value < 0.05" + " (" + str(
            shapiroResult[1]) + ")" + ". Sample does NOT come from a normal distribution.")
        return False
    else:
        print(
            "Shapiro-Wilk Test: Null Accepted. Failed to reject null hypothesis. Sample possibly comes from a normal "
            "distribution.")
        return True
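A usage sketch for verify_normality above (not from the original source), assuming scipy.stats is imported as stats; the sample is invented:

import numpy as np
from scipy import stats

skewed = np.random.lognormal(mean=0.0, sigma=0.6, size=200)   # clearly non-normal
print(verify_normality("log-normal sample", skewed))          # expected: False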
Example #20
def calc_p_matrix():
    sheet_name_vec = ['Indoor', 'Outdoor']
    sub_num = 7
    is_normal_list = [True, True]
    p_matrix = np.ones((2, 6, 6))
    for i in range(2):
        dfs = pd.read_excel("classification accuracy.xlsx",
                            sheet_name=sheet_name_vec[i])
        data = dfs.values[:, 2:sub_num + 2].astype(float)
        data_list = []
        for m in range(6):
            data_m = data[5 * m:5 * (m + 1), :].reshape(-1)
            if is_normal_list[i]:
                statistic, critical_values, significance_level = stats.anderson(
                    data_m)
                if statistic > critical_values[2]:
                    is_normal_list[i] = False
            data_list.append(data_m)
        for r in range(6):
            for c in range(6):
                if is_normal_list[i]:
                    _, p_matrix[i, r,
                                c] = stats.ttest_ind(data_list[r],
                                                     data_list[c])
                else:
                    _, p_matrix[i, r,
                                c] = stats.ranksums(data_list[r], data_list[c])
    return p_matrix, is_normal_list
Example #21
def plot(data):
    n, bins, patches = plt.hist(np.array(data), 50)
    mu = np.mean(data)
    sigma = np.std(data)
    print("Mean: {}, std: {}".format(mu, sigma))
    # Shapiro test
    stat, p = shapiro(np.array(data))
    if p > 0.05:
        print("Shapiro: Data is normally distributed")
    else:
        print("Shapiro: Data is NOT normally distributed, p-value: {}".format(p))
    stat, p = normaltest(np.array(data))
    if p > 0.05:
        print("D'Agostino: Data is normally distributed")
    else:
        print("D'Agostino: Data is NOT normally distributed, p-value: {}".format(p))
    result = anderson(np.array(data))
    p = 0
    for i in range(len(result.critical_values)):
        sl, cv = result.significance_level[i], result.critical_values[i]
        if result.statistic < result.critical_values[i]:
            print('Anderson: %.3f: %.3f, data looks normal (fail to reject H0)' % (sl, cv))
        else:
            print('Anderson: %.3f: %.3f, data does not look normal (reject H0)' % (sl, cv))
    plt.plot(bins, norm.pdf(bins, mu, sigma))
    qqplot(np.array(data), line='s')
    plt.show()
Example #22
def EnKF_Step(forecast, ensemble, obs, obs_var, n, inflation=1.0, H=None, adthreshold=0.05):
    
    for i in range(n):
        #forecast ensemble
        ensemble[i] = forecast(ensemble[i])
    #check if prior is Gaussian with Anderson-Darling
    stat, crit_vals, sig_lvls = anderson(ensemble, 'norm')
    #5% is the third critical value
    if stat >= crit_vals[2]:
        print("Ensemble fails Anderson-Darling test (is rejected as Normal) with test statistic %f and critical value %f" % (stat, crit_vals[2]))
        failed=True
    else:
        failed=False
    if not inflation == 1.0:
        ensemble *= np.sqrt(inflation)
    #covariance
    amean = np.mean(ensemble) #analysis mean

    C = np.sum((ensemble - amean) ** 2) / (n-1)

    
    #update ensemble
    if H is None:

        K = C / (C + obs_var)

        ensemble += K * (obs + draw_ensemble(0.0, obs_var, n) - ensemble)

        C -= K * C

        amean = np.mean(ensemble) #analysis mean
        aspread = np.sqrt(C)


    return amean, aspread, ensemble, failed
Example #23
def main():
    # read the data
    data = pd.read_csv("CC GENERAL.csv")
    data.loc[(data['MINIMUM_PAYMENTS'].isnull() == True), 'MINIMUM_PAYMENTS'] = data['MINIMUM_PAYMENTS'].median()
    data.loc[(data['CREDIT_LIMIT'].isnull() == True), 'CREDIT_LIMIT'] = data['CREDIT_LIMIT'].median()
    data = data.drop(['CUST_ID'], axis=1)

    names = data.columns.tolist()

    # normalize the data
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)
    data_scaled = pd.DataFrame(data_scaled, columns=names)

    # apply Jarque-Bera Test
    print("Jarque-Bera Test:")
    for i in range(len(names)):
        X = data_scaled[names[i]]
        jb_value, p_value = jarque_bera(X)
        print("{} for {} feature, test value is {} and p-value is {}".format(i+1, names[i], jb_value, p_value))

    print("\n")

    # apply Anderson Test
    print("Anderson Test:")
    for i in range(len(names)):
        X = data_scaled[names[i]]
        a = anderson(X, dist='norm')
        print("for {} feature, test value is {}".format(names[i], a))
    print("\n")
Example #24
def normality_of_residuals_test(model):
    '''
    Function for drawing the normal QQ-plot of the residuals and running 4 statistical tests to 
    investigate the normality of residuals.
    
    Arg:
    * model - fitted OLS models from statsmodels
    '''

    sm.ProbPlot(model.resid).qqplot(line='s')
    plt.title('Q-Q Plot')

    jb = stats.jarque_bera(model.resid)
    sw = stats.shapiro(model.resid)
    ad = stats.anderson(model.resid, dist='norm')
    ks = stats.kstest(model.resid, 'norm')

    print(f'Jarque_Bera test ---- statistic: {jb[0]:.4f}, p-value: {jb[1]}')
    print(
        f'Shapiro_Wilk test ---- statistic: {sw[0]:.4f}, p-value: {sw[1]:.4f}')
    print(
        f'Kolmogorov_Smirnov test ---- statistic: {ks.statistic:.4f}, p-value: {ks.pvalue:.4f}'
    )
    print(
        f'Anderson_Darling test ---- statistic: {ad.statistic:.4f}, 5% critical value: {ad.critical_values[2]:.4f}'
    )
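The function above expects a fitted statsmodels OLS result; a minimal end-to-end sketch with synthetic data (assuming statsmodels.api as sm, matplotlib.pyplot as plt and scipy.stats as stats are imported in the same module, as the function requires):

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy import stats

rng = np.random.default_rng(42)
x = rng.normal(size=200)
y = 2.0 * x + rng.normal(scale=0.5, size=200)

model = sm.OLS(y, sm.add_constant(x)).fit()
normality_of_residuals_test(model)
plt.show()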
Example #25
    def normality_tests(self, data, test_type):
        # Tests whether a data sample has a Gaussian distribution.
        # H0: the sample has a Gaussian distribution.
        # H1: the sample does not have a Gaussian distribution
        if test_type == 'ShapiroWilk':
            stat, p = shapiro(data)
            if p > 0.05:
                print('Probably Gaussian')
            else:
                print('Probably not Gaussian')

        elif test_type == 'DAgostino':
            stat, p = normaltest(data)
            if p > 0.05:
                print('Probably Gaussian')
            else:
                print('Probably not Gaussian')

        elif test_type == 'AndersonDarling':
            result = anderson(data)
            for i in range(len(result.critical_values)):
                sl, cv = result.significance_level[i], result.critical_values[
                    i]
                if result.statistic < cv:
                    print('Probably Gaussian at the %.1f%% level' % (sl))
                else:
                    print('Probably not Gaussian at the %.1f%% level' % (sl))
Example #26
    def consistent_gaussian(self, radius = np.nan):
        """Establish whether self.image appears consistent with Gaussian noise.

        If a radius is provided, return True if either the whole image, or the region within
        radius arcsec of the centre, is consistent with Gaussian noise. The idea here is that
        checking only the central region reduces the likelihood of other sources influencing
        the result, but reducing the number of pixels also increases the significance of any
        bright pixels, so it's useful to check both regions."""

        if not np.isnan(radius):
            sky_separation = self._projected_sep_array([i/2 for i in self.image.shape])
            data = self.image[sky_separation < radius]

        else:
            data = self.image.flatten()

        #perform an Anderson-Darling test for normality
        result = anderson(data)

        #according to scipy documentation, [2] should correspond to the 5% level,
        #however this function is set up to account for possible future changes
        sig = result.significance_level[2]
        crit = result.critical_values[2]

        #return the significance level and whether the data are consistent with a Gaussian at that level
        if np.isnan(radius):
            return sig, (result.statistic < crit)
        else:
            return sig, (result.statistic < crit or self.consistent_gaussian(np.nan)[1])
Example #27
    def work(self, input_items, output_items):

        arquivo_detec = open('/home/joab/Projeto Sense/detec.txt', 'a')
        arquivo_sense = open('/home/joab/Projeto Sense/sense.txt', 'a')
        arquivo_teste = open('/home/joab/Projeto Sense/teste.txt', 'a')

        AD = anderson(input_items[0], dist='norm')

        for i in range(len(input_items[0])):
            #a=p * input_items[0][i]

            if AD[0] > input_items[0][i]:

                #vetor_dados_sense = [input_items[0][i], a, JB, i, freq]
                vetor_dados_detec = [i, input_items[0][i], AD[0], self.freq]
                #arquivo_sense.write('%s' %vetor_dados_sense)
                arquivo_detec.write('%s' % vetor_dados_detec)
                #arquivo_sense.write('\n')
                arquivo_detec.write('\n')

        arquivo_sense.close()
        arquivo_detec.close()
        arquivo_teste.close()

        return len(output_items[0])
Example #28
def estadistica_inferencial():
    muestra_periodo_25 = obtener_muestra(DATASET, 'año_semestre', 20182,
                                         0.25)  # Puntaje_global
    media_1, desviacion_estandar_1, longitud_muestra_1 = resumen(
        muestra_periodo_25, 'puntaje_global')
    print('Se rechaza H_0: {}'.format(
        Z_c(media_1, desviacion_estandar_1, 280, longitud_muestra_1) < -Z_alfa)
          )
    print('\n{}'.format('*' * 100))

    muestra_periodo_30 = obtener_muestra(DATASET, 'año_semestre', 20182,
                                         0.3)  # Puntaje_naturales
    media_naturales_20182 = obtener_media(
        filtro_periodo(20182)['puntaje_naturales'])
    media_2, desviacion_estandar_2, longitud_muestra_2 = resumen(
        muestra_periodo_30, 'puntaje_naturales')

    print('Instituciones debajo de la media: {}'.format(
        len(muestra_periodo_30[
            muestra_periodo_30['puntaje_naturales'] < media_naturales_20182])))
    print('Se rechaza H_0: {}'.format(
        Z_c2(media_2, 0.15, longitud_muestra_2) > Z_alfa))
    print('\n{}'.format('*' * 100))

    modelo = modelo_regresion()
    valores_residuales = modelo.resid
    print('Nivel de significancia Anderson Darling {}'.format(
        anderson(valores_residuales)[2]))
    print('\n{}'.format('*' * 100))
    print(modelo.summary())
    print('\n{}'.format('*' * 100))

    ecuaRecta(DATASET, 'puntaje_matematicas', DATASET, 'puntaje_global')
    grafica_qq(valores_residuales)
Example #29
    def get_anderson(self):
        """
            Anderson-Darling test for data coming from a particular
            distribution.

            Returns:
                tuple: statistic value, critical values and significance values.

            Note:
                Need scipy.stats module to perform Anderson-Darling test.
        """

        try:
            from scipy import stats
        except ImportError:
            raise ImportError("Need 'scipy.stats' module to calculate "
                              "anderson-darling test.")

        error = (self.expected_targets - self.predicted_targets).flatten()

        # from matplotlib import pyplot as plt
        # import matplotlib.mlab as mlab
        #
        # plt.figure(figsize=(24.0, 12.0))
        # _, bins, _ = plt.hist(error, 50, normed=1)
        # _mu = np.mean(error)
        # _sigma = np.std(error)
        # plt.plot(bins, mlab.normpdf(bins, _mu, _sigma))
        # plt.show()
        # plt.close()

        # Calculate Anderson-Darling normality test index
        ad_statistic, ad_c, ad_s = stats.anderson(error, "norm")

        return ad_statistic, ad_c, ad_s
Example #30
def se_distribuicao_normal(tx_values, len_min=48):
    """
    Verifica se uma dada distribuição de valores é normal.
    :param tx_values: Os valores contidos na distribuição.
    :param len_min: O tamanho mínimo para que seja possível checar a normalidade da amostra.
    :return: True, caso a distribuição seja normal, de acordo com os testes Anderson-Darling e Shapiro-Wilk, realizando
        este último apenas se o tamanho da amostra for menor que 5000, e False, caso o tamanho da distribuição seja
        menor que len_min, ou caso um dos dois testes citados (quando for possível empregar os dois testes) indique não
        normalidade.
    """
    len_amostra = len(tx_values)
    retorno = False

    if (len_amostra < len_min):
        print('Amostra muito pequena: tamanho = ', len_amostra)
    else:
        result = stats.anderson(tx_values)
        if result.statistic < result.critical_values[2]:
            # If the returned statistic is larger than these critical values then for the corresponding significance level,
            # the null hypothesis that the data come from the chosen distribution can be rejected.
            retorno = True  # normal

        if (retorno and len_amostra < 5000):
            stat, p_valor = stats.shapiro(tx_values)
            # Reject H0 at significance level α if W_calculated < W_α
            retorno = retorno and p_valor > 0.05

    return retorno
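A short check of se_distribuicao_normal above (a sketch), assuming scipy.stats is imported as stats; the sample is invented:

import numpy as np
from scipy import stats

normal_sample = np.random.normal(loc=5.0, scale=1.5, size=300)
print(se_distribuicao_normal(normal_sample))   # expected: True for a normal sample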
Example #31
def normality_of_residuals_test(model):
    '''
    Function for drawing the normal QQ-plot of the residuals and running 4 statistical tests to
    investigate the normality of residuals.

    Arg:
    * model - fitted OLS models from statsmodels
    '''
    sm.ProbPlot(model.resid).qqplot(line='s')
    plt.title('Q-Q plot')

    jb = stats.jarque_bera(model.resid)
    sw = stats.shapiro(model.resid)
    ad = stats.anderson(model.resid, dist='norm')
    ks = stats.kstest(model.resid, 'norm')

    print(f'Jarque-Bera test ---- statistic: {jb[0]:.4f}, p-value: {jb[1]}')
    print(
        f'Shapiro-Wilk test ---- statistic: {sw[0]:.4f}, p-value: {sw[1]:.4f}')
    print(
        f'Kolmogorov-Smirnov test ---- statistic: {ks.statistic:.4f}, p-value: {ks.pvalue:.4f}'
    )
    print(
        f'Anderson-Darling test ---- statistic: {ad.statistic:.4f}, 5% critical value: {ad.critical_values[2]:.4f}'
    )
    print(
        'If the returned AD statistic is larger than the critical value, then for the 5% significance level, the null hypothesis that the data come from the Normal distribution should be rejected. '
    )
Example #32
def residualanalysis(data_resid, label='Residuals'):
    print(label)
    #Augmented Dickey-Fuller unit root test
    result = teststationarity(data_resid.values, regression='c')

    fig = plt.figure()
    ax = fig.add_subplot(211)
    ax.plot(data_resid)
    plt.title(label)

    ax = fig.add_subplot(212)
    res = stats.probplot(data_resid.values, plot=ax)
    plt.show()

    #k2,p=stats.normaltest(data_resid)

    # Anderson darling test
    print('Anderson Darling test for normality')
    r = stats.anderson(data_resid)
    i = 2  # significance level of 0.05
    if r.statistic > r.critical_values[i]:
        print('Statistic: %.3f p<%.2f' %
              (r.statistic, r.significance_level[i] / 100))
        print('Can reject H0, data is not normal')
    else:
        print('Statistic: %.3f p>%.2f' %
              (r.statistic, r.significance_level[i] / 100))
        print('Fail to reject H0, data is normal')
Example #33
def anderson_darling_test(dataset: np.ndarray):
    """
    Refer to pages 228-229 of lecture-2-3_Ch1_MFIT5003_Fall2020-21_with_MJD.pdf

    anderson-darling test does not provide a concrete p value
    """
    assert np.isnan(dataset).any() == False
    assert len(dataset) > 7

    stat, critical_values, significance_levels = anderson(x=dataset,
                                                          dist='norm')
    print(
        'Results from Anderson-Darling test:\nstats: {}\ncritical_values: {}\nsignificance_levels: {}'
        .format(stat, critical_values, significance_levels))

    print('Results interpretation:')
    for i in range(len(critical_values)):
        if stat > critical_values[i]:
            print(
                'sample does NOT look Gaussian (reject H0) (p < alpha == {})'.
                format(significance_levels[i] / 100))
        else:
            print(
                'sample looks Gaussian (fail to reject H0) (p > alpha == {})'.
                format(significance_levels[i] / 100))
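A quick call of anderson_darling_test above (illustrative only), assuming numpy and scipy.stats.anderson are imported as in the snippet; the sample is invented:

import numpy as np
from scipy.stats import anderson

heavy_tailed = np.random.standard_t(df=3, size=500)   # heavy tails, usually rejected as Gaussian
anderson_darling_test(heavy_tailed)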
Example #34
def normaltest(x):
    # kstest (Kolmogorov-Smirnov test)
    K_s, K_p = stats.kstest(x, 'norm')
    print("kstest(K-S检验)", K_s, K_p)

    # Shapiro-Wilk test
    S_s, S_p = stats.shapiro(x)
    print('the shapiro test ', S_s, ',', S_p)

    # normaltest
    N_s, N_p = stats.normaltest(x)
    print('normaltest', N_s, N_p)

    # Anderson-Darling test
    A_s, A_c, A_sl = stats.anderson(x, dist='norm')
    print('Anderson-Darling test', A_s, A_c, A_sl)

    df = x.shape[0] - 1
    Norm_test = pd.DataFrame([['统计量', 'df', 'Sig'], [K_s, df, K_p],
                              [S_s, df, S_p], [None, None, None]],
                             index=["正态性检验", 'kstest', 'Shapiro-Wilk', None])
    s_l = list(map(lambda x: '显著水平 ' + str(x) + '%', A_sl))
    c_v = list(map(lambda x: '临界值 ' + str(x), A_c))

    And_D_test = pd.DataFrame([['统计量', s_l[0], s_l[1], s_l[2], s_l[3], s_l[4]],
                               [A_s, c_v[0], c_v[1], c_v[2], c_v[3], c_v[4]],
                               [None, None, None, None, None, None]],
                              index=["正态性检验", 'Anderson-Darling ', None])
    result = pd.concat([Norm_test, And_D_test])
    return result
Example #35
def print_market_information(benchmark):
    print("RETURN BENCHMARK STATISTICS")
    print("---------------------------------------------")
    print("Mean of Daily  Log Returns %9.6f" % np.mean(benchmark['returns']))
    print("Std  of Daily  Log Returns %9.6f" % np.std(benchmark['returns']))
    print("Mean of Annua. Log Returns %9.6f" %
          (np.mean(benchmark['returns']) * 252))
    print("Std  of Annua. Log Returns %9.6f" %
          (np.std(benchmark['returns']) * math.sqrt(252)))
    print("---------------------------------------------")
    print("Skew of Sample Log Returns %9.6f" % scs.skew(benchmark['returns']))
    print("Skew Normal Test p-value   %9.6f" %
          scs.skewtest(benchmark['returns'])[1])
    print("---------------------------------------------")
    print("Kurt of Sample Log Returns %9.6f" %
          scs.kurtosis(benchmark['returns']))
    print("Kurt Normal Test p-value   %9.6f" %
          scs.kurtosistest(benchmark['returns'])[1])
    print("---------------------------------------------")
    print("Normal Test p-value        %9.6f" %
          scs.normaltest(benchmark['returns'])[1])
    print("---------------------------------------------")
    print("Anderson Normality Test:		   ")
    print(stats.anderson(benchmark['returns']))
    return
Example #36
def test_normality_increase_lambert():
    # Generate random data and check that it is more normal after inference
    for i, y in enumerate([np.random.standard_cauchy(size=ns), experimental_data]):
        print "Distribution %d" % i
        print "Before"
        print ("anderson: %0.3f\tshapiro: %0.3f" % (anderson(y)[0], shapiro(y)[0])).expandtabs(30)
        stats.probplot(y, dist="norm", plot=pylab)
        pylab.savefig("%d_before.png" % i)
        pylab.clf()

        tau = g.igmm(y)
        x = g.w_t(y, tau)
        print "After"
        print ("anderson: %0.3f\tshapiro: %0.3f" % (anderson(x)[0], shapiro(x)[0])).expandtabs(30)
        stats.probplot(x, dist="norm", plot=pylab)
        pylab.savefig("%d_after.png" % i)
        pylab.clf()
Example #37
File: utils.py Project: farr/arfit
def residual_pvalues(logpost, chain, Nps=10):
    stats = []
    for p in permutation(chain)[:Nps,:]:
        r = logpost.standardised_residuals(p)
        stat, cv, sl = ss.anderson(r, 'norm')
        stats.append(stat)
    stats = np.array(stats)

    return stats, cv, sl
Example #38
def plot_coeff_hist(evals, a_fit, nmodes=None, ax_re=None, ax_im=None):
    """
    Plot the histogram of KL coefficients

    For a true gaussian random field, the KL coefficients should be drawn
    from a gaussian with std deviation given by the corresponding eigenvalue.
    We'll check if the data is consistent with that
    """
    if ax_re is None:
        ax_re = pylab.axes()
    if ax_im is None:
        ax_im = pylab.axes()

    i = np.where((evals > 1e-8) & (~np.isnan(evals)))
    evals = evals[i]
    a_fit = a_fit[i]

    a_prime = a_fit / np.sqrt(evals)

    if nmodes is None:
        nmodes = len(a_prime)
    else:
        a_prime = a_prime[:nmodes]

    from scipy.stats import anderson

    print "real"
    print anderson(a_prime.real)
    print "imag"
    print anderson(a_prime.imag)

    pylab.axes(ax_re)
    plot_hist_norm(a_prime.real, bins=30)
    pylab.title("real, n=%i" % nmodes)
    ax_re.set_xlabel(r"$a_i/\lambda_i$")
    ax_re.set_ylabel(r"$N(a_i/\lambda_i)$")
    ax_re.set_xlim(-3, 3)

    pylab.axes(ax_im)
    plot_hist_norm(a_prime.imag, bins=30)
    pylab.title("imag, n=%i" % nmodes)
    ax_im.set_xlabel(r"$a_i/\lambda_i$")
    ax_im.set_ylabel(r"$N(a_i/\lambda_i)$")
    ax_im.set_xlim(-3, 3)
Example #39
def noise(fname, x0 = 100, y0 = 100, maxrad = 30):
    from astroML.plotting import hist
    hdulist = pf.open(fname)
    im = hdulist[0].data
    #print np.mean(im), np.min(im), np.max(im)
    #print im[95:105, 95:105]
    # x0, y0 = 100, 100
    xi, yi = np.indices(im.shape)
    R = np.sqrt( (yi - int(y0))**2. + (xi - int(x0))**2. )
    phot_a = np.zeros(maxrad + 1)
    phot_a[0] = 0
    
    bmasked = im * ((R > maxrad) * (R < maxrad + 20.))
    bdata = bmasked.flatten()
    #print bdata[bdata != 0.]
    #print len(bdata[bdata != 0.])
    #print len(bdata)
    
    plt.subplot(3, 1, 1)
    hist(bdata[bdata != 0.], bins = 'blocks')
    plt.xlabel('Flux')
    plt.ylabel('(Bayesian Blocks)')
    plt.title('Noise')
    #plt.show()
    
    plt.subplot(3, 1, 2)
    hist(bdata[bdata != 0.], bins = 50)
    plt.xlabel('Flux')
    plt.ylabel('(50 bins)')
    #plt.title('Noise (50 bins)')
    #plt.show()
    
    plt.subplot(3, 1, 3)
    hist(bdata[bdata != 0.], bins = 'knuth')
    plt.xlabel('Flux')
    plt.ylabel('(Knuth\'s Rule)')
    #plt.title('Noise (Knuth\'s Rule)')
    plt.show()
    
    A2, crit, sig = anderson(bdata[bdata != 0.], dist = 'norm')
    print 'A-D Statistic:', A2
    print ' CVs \t  Sig.'
    print np.vstack((crit, sig)).T

    normality = normaltest(bdata[bdata != 0.])
    print 'Normality:', normality

    skewness = skewtest(bdata[bdata != 0.])
    print 'Skewness:', skewness

    kurtosis = kurtosistest(bdata[bdata != 0.])
    print 'Kurtosis:', kurtosis

    print 'Mean:', np.mean(bdata[bdata != 0.])
    print 'Median:', np.median(bdata[bdata != 0.])
Example #40
def freq_statistics(filtered_shell_log, user_info):
    student_input_list = filtered_shell_log.group_by(lambda x: x.user_name).filter_by(lambda x: x[0] in user_info)
    student_input_list = student_input_list.map(lambda x: (x[0], _generate_counter_list(x[1])))
    freq_list = student_input_list.map(lambda x: (x[0], _convert_freq(x[1])))
    feature_list = freq_list.map(lambda x: _generate_feature_vector(x[1]))
    for index in range(len(feature_list[0])):
        stat, crit_vals, sig_level = stats.anderson([item[index] for item in feature_list])
        print 'INPUT {}'.format(index)
        print '\t Statistics: {}'.format(stat)
        print '\t Critical values: {}'.format(crit_vals)
        print '\t Significant levels: {}'.format(sig_level)
Example #41
	def _gaussianCheck(self, vector):
		"""
		check whether a given input vector follows a gaussian distribution
		H0: vector is distributed gaussian
		H1: vector is not distributed gaussian
		"""
		output = anderson(vector)

		if output[0] <= output[1][self.strictness]:
			return True
		else:
			return False
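_gaussianCheck above is a method that relies on self.strictness as an index into the critical-value array; the same check written standalone (a sketch, with an illustrative strictness value):

import numpy as np
from scipy.stats import anderson

vector = np.random.normal(size=200)
strictness = 2                        # index 2 -> the 5% critical value for the normal case
output = anderson(vector)
print(output.statistic <= output.critical_values[strictness])   # True -> treated as Gaussian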
Example #42
def execute(training_data, cv_data, test_data):
    """
    execute is the function where each run is done. main sets parameters
    then calls execute
    """

    clf, regularization_parameter = learn(training_data, cv_data)

    # do an Anderson Darling test on the data to determine if it is a normal fit
    A2, crit, sig = anderson(test_data.y, dist="norm")

    test_mn = np.mean(test_data.y)
    test_sd = np.std(test_data.y)

    predict_data = clf.predict(test_data.X)
    difference = predict_data - test_data.y
    diff_mn = np.mean(difference)
    diff_sd = np.std(difference)

    print("the value for A2 is ", A2)
    print("The mean and standard deviation of the test data are ", test_mn, test_sd)
    print("The mean and standard deviation of the difference are ", diff_mn, diff_sd)

    # make plot
    # correlation coefficent between prediction and actual data
    coef, dummy = pearsonr(predict_data, test_data.y)

    # compare per stock
    plt.plot(predict_data, test_data.y, "ro")
    plt.title("Comparison per stock")
    plt.xlabel("Prediction")
    plt.ylabel("Actual")

    xmin, xmax, ymin, ymax = plt.axis()
    plt.text(
        xmin + (xmax - xmin) / 20.0, ymax - (ymax - ymin) / 20.0, "cor coef: {}".format(coef), verticalalignment="top"
    )

    # draw a line of perfect prediction
    if ymin > xmin:
        xmin = ymin
    if ymax < xmax:
        xmax = ymax
    xs = np.linspace(xmin, xmax)
    plt.plot(xs, xs, "g--")

    # draw the plot
    plt.tight_layout()
    plt.show()
Example #43
def norm_test(X,mu,sigma):
    # perform the Shapiro-Wilk and Anderson-Darling tests
    # inputs
    # X     : array of sample data
    # mu    : mean of assumed distribution
    # sigma : stand dev of assumed distribution
    # outputs

    Xs = (X - mu) / sigma

    # Anderson-Darling
    A2, crit, sig = stats.anderson(X,dist='norm')
    ad_pass = (A2 < crit)

    # Shapiro-Wilk
    W, p = stats.shapiro(X)
    ws_pass = (p > 0.05)  # fail to reject normality at the 5% level

    return ad_pass, sig, ws_pass
Example #44
 def get_tables(self):
     #count of procedures over time
     count_table = [("Period End",
                     "Number of Procedures",
                     "Procedures with Recorded Fluoro Time")] +\
                     zip(self.bin_edges[1:], self.counts, self.with_fluoro_counts)
     #break down by cpt code
     cpt_table = [("CPT Code Combination","Number of Procedures",
                   "Number of Procedures with Fluoro", "Anderson Value", "Critical Values", "P-values")]
     for cpt, sprocs in self.sprocs_by_cpt.iteritems():
         cpt_table += [['"'+cpt+'"', len(sprocs), len([p for p in sprocs if not p.fluoro is None])]]
         fluoros = [p.fluoro if not p.fluoro ==0 else .5 for p in sprocs if not p.fluoro is None] 
         try:
             fit_statistic = anderson([math.log(float(x)) for x in fluoros],
                                 dist='norm')
         except ZeroDivisionError:
             fit_statistic = None
         if fit_statistic is None:
             cpt_table[-1] += [None, None, None]
         else:
             cpt_table[-1] += [fit_statistic[0], list(fit_statistic[1]), list(fit_statistic[2])]
     return (count_table, cpt_table)
Example #45
    def validate(self, symbol, start_date, end_date):
        """
        :param start_date, end_date: YYYY-MM-DD
        """
        data = self.sd.fetch_pd_data(symbol, start_date, end_date)
        daily_changes = data['Adj Close'].astype('float').pct_change(periods=1).tolist()[1:]
        #daily_changes = np.random.normal(1, 0.5, 1000)
        bins = 20
        n, bins, patches = plt.hist(daily_changes, 50, normed=1, facecolor='green', alpha=0.75)

        mu = np.mean(daily_changes)
        sigma = np.std(daily_changes)

        t_stat, p_value = shapiro(daily_changes)
        print 'Shapiro Test'.center(110, '-')
        print 'Mean: %.2f' % mu
        print 'Std:  %.2f' % sigma
        print 't_stat: %.2f' % t_stat
        print 'p_value: %.2f' % p_value
        if p_value > 0.05:
            print 'At 0.05 significance level, Null hypothesis that points are from normal distribution can NOT be rejected!'
        else:
            print 'At 0.05 significance level, Reject Normal distribution!'


        t_stat, critical_v, sig = anderson(daily_changes, 'norm')
        print 'Anderson Test'.center(110, '-')
        print 't_stat: %s' % t_stat
        print 'critical_v: %s' % critical_v
        print 'sig: %s' % sig
        if t_stat > critical_v[2]:
            print 'At 0.05 significance level, Reject Normal distribution!'
        else:
            print 'At 0.05 significance level, Null hypothesis that points are from normal distribution can NOT be rejected!'

        y = mlab.normpdf(bins, mu, sigma)
        l = plt.plot(bins, y, 'r--', linewidth=1)

        plt.grid(True)
        plt.show()
Example #46
def apply_anderson_darling(dist_name, data_series, debug=False):
    """
    Applies the Anderson-Darling Test for Goodness-of-Fit

    H0: data_series are distributed as dist_name.
    Reject H0 if statistic > critical_value at significance_level

    :param dist_name: Distribution name.
    :param data_series: Data point.
    :return: None.
    """
    if dist_name in ["norm", "expon"]:
        # According to Ivezic Anderson-Darling is better for normal. Also, scipy says this works for exponential too.
        statistic, critical_values, significance_level = stats.anderson(data_series, dist_name)

        if debug:
            print "Anderson-Darling Test for ", dist_name, ": statistic ", statistic
            for critical_value, significance_level in zip(critical_values, significance_level):
                print "Critical Value: ", critical_value, " Significance Level: ", significance_level
    else:
        if debug:
            print "Anderson-Darling is not suitable for ", dist_name
Example #47
def DlossHistogram(atoms,index):
	# plot a histogram of the distribution of Dloss values within a structure
	# for a given dataset number. Dmetric specifies density metric

	sns.set_palette("deep", desat=.6)
	sns.set_context(rc={"figure.figsize": (10, 6)})
	fig = plt.figure()
	ax = plt.subplot(1,1,1)

	counter = -1 
	colors = ['red','blue','green']
	for boundType in ['unbound protein','bound protein','rna']:
		counter += 1
		colorType = colors[counter]

		if boundType == 'unbound protein':
			chains = list(string.ascii_lowercase.upper())[:11]
		elif boundType == 'bound protein':
			chains = list(string.ascii_lowercase.upper())[11:22]
		elif boundType == 'rna':
			chains = list(string.ascii_lowercase.upper())[22:23]

		datax = [atm.Bfactorchange for atm in atoms if atm.chaintype in chains]
		plt.hist(datax, 300, histtype="stepfilled", alpha=.7,color=colorType,label=boundType)

		print "Component '{}' length: {}".format(boundType,len(datax))
		# test for normality in data
		print 'Testing for normality in dataset: {}'.format(boundType)
		W,critVal,SigLevel = stats.anderson(datax,dist='norm')
		print 'Test statistic: {}'.format(W)
		print 'critical value: {}'.format(critVal)
		print 'SigLevel: {}'.format(SigLevel)

		plt.xlabel('Bfactor change value')
		plt.ylabel('Frequency of atoms')
		ax.legend(loc='best')
		plt.title('Histogram of Bfactor change per atom')
		fig.savefig('BfactorchangePerAtom_{}.png'.format(index))
Example #48
def gaussianClusterTest(r):

   d,n = shape(r)
   # If the dataset is only one point, return a large number to                                                                       
   # indicate that we have over-fit this cluster                                                                                    
   if d<=1:
       return -1,0,0

   # Run k-means with k=2                                                                                                           
   c=2
   centroids,labels = kmeans2(r,c,minit='points')
   # Find the axis along which the two centroids lie                                                                               
   v = centroids[1,:] - centroids[0,:]
   # Get the projection of each point in r along v                                                                                 
   x = empty((n))
   
   x = dot(r,v)/linalg.norm(v)

   AD_stat,AD_criticals,AD_percent = anderson(x,'norm')   
   # This is the 5% significance level
   AD_crit = AD_criticals[2]

   return AD_stat,AD_crit,labels
Example #49
ref_members = 1000
dist = linspace(-5,5,res)
gridks = zeros((res,len(dist)))
gridad = zeros((res,len(dist)))
m = 0
n = 0
ratio = linspace(ref_members,5*ref_members,res)
for k in dist:
    for r in ratio:
        x1 = randn(ref_members) - k/2
        x2 = randn(int(r)) + k/2
        x = append(x1,x2)
        x = (x-mean(x))/std(x)
        x = sort(x)
        ks = kstest(x,'norm')
        ad = anderson(x,'norm')
        if ks[1] > ks_thresh:
            gridks[n,m] = 1
        elif ks[1] <= ks_thresh:
            gridks[n,m] = 0
        if ad[0] < ad_thresh:
            gridad[n,m] = 1
        elif ad[0] >= ad_thresh:
            gridad[n,m] = 0
        n = n + 1
    
    n = 0
    m = m + 1

pylab.subplot(1,2,1)
pylab.imshow((gridks),origin='lower')
Example #50
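The good_sample and bad_sample arrays used in this example are not defined in the excerpt; one possible construction (a sketch, the sizes and parameters are assumptions), with scipy.stats imported as ss as the example expects:

import numpy as np
import scipy.stats as ss

good_sample = ss.gumbel_l.rvs(size=500, random_state=0)       # genuinely Gumbel (left-skewed) data
bad_sample = np.random.normal(loc=0.0, scale=1.0, size=500)   # clearly not Gumbel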
stats, pvalue = ss.kstest(rvs=bad_sample, cdf=ss.gumbel_l(*ss.gumbel_l.fit(bad_sample)).cdf)
print('The maximum distance between CDFs is %.2f. ' % stats, end='')
print('The sample is consistent with a Gumbel distribution (p-value %.2f)' % pvalue)

##
## Using Anderson-Darling test
## The assumption regarding the distribution of the sample is rejected if the output value
## is larger than the critical values for the required significance level.
## For gumbel distributions, the critical values and significance levels are:
##     [0.456, 0.612, 0.728, 0.843, 0.998]
##     [25.0, 10.0, 5.0, 2.5, 1.0]
## I.e, for a sample to be assumed Gumbel distributed with a significant level of 25%,
## the output values must be < 0.456.
##

stats, critical_values, sign_level = ss.anderson(good_sample, dist='gumbel')
if stats > max(critical_values):
    print('Sample is not Gumbel distributed')
else:
    for i, cv in enumerate(critical_values):
        if stats < cv:
            print('Sample is gumbel distributed for a significance level of %d%%' % sign_level[i])
            break

stats, critical_values, sign_level = ss.anderson(bad_sample, dist='gumbel')
if stats > max(critical_values):
    print('Sample is not Gumbel distributed')
else:
    for i, cv in enumerate(critical_values):
        if stats < cv:
            print('Sample is gumbel distributed for a significance level of %d%%' % sign_level[i])
Example #51
 def test_result_attributes(self):
     rs = RandomState(1234567890)
     x = rs.standard_exponential(size=50)
     res = stats.anderson(x)
     attributes = ('statistic', 'critical_values', 'significance_level')
     check_named_results(res, attributes)
Example #52
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
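# main() above relies on a columns_to_values() helper that is not part of this
# excerpt. A plausible implementation -- an assumption, not the original code --
# would split the tab-separated line and pull each 1-based column group out as
# floats, one list per sample, matching how *b_samples is used above:
def columns_to_values(sample_groups, line):
    cols = line.strip().split("\t")
    values = []
    for group in sample_groups:
        # group is an iterable of 1-based column indices for one sample
        values.append([float(cols[int(i) - 1]) for i in group])
    return values


# Hypothetical invocation (the script name is a placeholder; flags follow the
# parser defined above):
#   python stats_tool.py -i input.tsv -o output.tsv --test_id ttest_ind \
#       --sample_one_cols 1,2,3 --sample_two_cols 4,5,6 --equal_var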
Example #53
0
    
    ax.set_yticks([])
    ax.set_xlabel("Score")
    ax.set_ylim(0, .4)
    ax.annotate(genre, xy=(0, .2), ha='left', fontsize=12)

#Interesting Fact 1: The mean of the "score" column is 6.3 and the
#median is 6.6, suggesting a few things: 1) viewers believe the movie they
#are seeing is usually better than average (i.e. a 5), and 2) the difference
#between the mean and median suggests viewers are more likely to rate a movie
#very bad than very good
#Code is below:
data.describe()
data.score.median()

#Interesting Fact 2: An Anderson-Darling test of normality suggests that
#the scores column is not normally distributed
score_np_array = np.array(data.score, dtype=float)
score_array = score_np_array.tolist()
type(score_array)
sp.anderson(score_array, dist='norm')

#Plot 1: The dispersion of the score data appears to increase after each decade
data[data.year % 10 == 0].groupby('year').boxplot(column='score')

#Plot 2: There is a larger frequency of scores of 7.5+ during the emergence of the New Hollywood era, featuring
#classics from blockbuster directors Steven Spielberg and Stanley Kubrick, among others
data[data.year % 10 == 0].groupby('year').hist(by=data.score)
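#A hedged sketch of how the sp.anderson result above is typically read: the
#A^2 statistic is compared against each critical value, and normality is
#rejected at every significance level whose critical value it exceeds
#(score_array follows the snippet above):
result = sp.anderson(score_array, dist='norm')
print('A^2 statistic: %.3f' % result.statistic)
for cv, sl in zip(result.critical_values, result.significance_level):
    if result.statistic < cv:
        print('%.1f%%: critical value %.3f, cannot reject normality' % (sl, cv))
    else:
        print('%.1f%%: critical value %.3f, reject normality' % (sl, cv))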


Example #54
0
def plot_box_resids(fit_model, y_pred, subset=None):
    '''More than you ever wanted to know about your residuals'''
    s_resid = (fit_model.resid - np.mean(fit_model.resid)) /\
               np.std(fit_model.resid)
    if subset:
        s_resid = np.random.choice(s_resid,
                                  replace=False,
                                  size=math.floor(len(s_resid) * subset))
    df = pd.DataFrame(s_resid, columns=['resids'])
    temp_df = pd.DataFrame(y_pred, columns=['target'])
    df = df.join(temp_df)

    if min(y_pred) < -1:
        df['turnout_bucket'] = df['target']\
        .apply(lambda x: int(math.floor(10 * np.exp(x))))
        y = df['target'].apply(lambda x: np.exp(x))
    else:
        df['turnout_bucket'] = df['target']\
        .apply(lambda x: int(math.floor(10 * x)))
        y = df['target']

    posit = sorted(df['turnout_bucket'].unique())

    plt.scatter(y, s_resid, alpha=.2)
    slope, intercept = np.polyfit(y, s_resid, 1)
    plt.plot(y, np.poly1d(np.polyfit(y, s_resid, 1))(y))
    plt.title('Studentized Residuals vs Prediction')
    plt.xlabel('Predicted Value')
    plt.ylabel('Studentized Residual')
    print 'Slope of best fit line: %s' % slope
    plt.show()

    ax1 = df[['resids', 'turnout_bucket']]\
        .boxplot(by='turnout_bucket', positions=posit, widths=.5)
    plt.title('Residuals versus Turnout')
    plt.xlabel('Turnout Bucket')
    plt.ylabel('Studentized Residuals')
    plt.suptitle('')
    plt.show()

    fig = sm.qqplot(s_resid, line='s')
    plt.title('Q-Q Plot')
    plt.show()

    w, p_val = shapiro(s_resid)
    print 'Shapiro-Wilk P_val is %s, larger the better' % p_val

    k, p_val = normaltest(s_resid)
    print 'D’Agostino and Pearson’s P_val is %s, larger the better' % p_val

    k, p_val = kstest(s_resid, 'norm')
    print 'Kolmogorov–Smirnov P_val is %s, larger the better' % p_val

    A, critical, sig = anderson(s_resid)
    print 'Anderson-Darling A2 is %s, smaller the better' % A
    print critical
    print sig

    n, bins, patches = plt.hist(s_resid, 75, normed=1)
    mu = np.mean(s_resid)
    sigma = np.std(s_resid)
    plt.plot(bins, mlab.normpdf(bins, mu, sigma))
    plt.title('Residuals versus a Normal Dist')
    plt.show()

    df['turnout_bucket'].hist(bins=posit, align='left', color='b')
    plt.title('Histogram of Turnout Bucket')
    plt.ylabel('Count')
    plt.xlim(-.5, - .5 + len(posit))

    temp = df[['resids', 'turnout_bucket']].groupby('turnout_bucket').count()
    temp.columns = ['Count']
    plt.show()
    print temp
    for i in xrange(0, 4000, 500):
        fig = plt.figure(figsize=(5, 7))
        fig.subplots_adjust(left=0.13, right=0.95,
                            bottom=0.06, top=0.95,
                            hspace=0.1)
        ax = fig.add_subplot(1, 1, 1)

        avg = np.mean(qsos_m[:, i])
        std = np.std(qsos_m[:, i])
        data = (qsos_m[:, i] - avg) / std

        x = np.linspace(-5, 5, 1000)
        pdf = stats.norm(0, 1).pdf(x)

        A2, sig, crit = stats.anderson(data)
        D, pD = stats.kstest(data, "norm")
        W, pW = stats.shapiro(data)

        mu, sigma = mean_sigma(data, ddof=1)
        median, sigmaG = median_sigmaG(data)

        N = len(data)
        Z1 = 1.3 * abs(mu - median) / sigma * np.sqrt(N)
        Z2 = 1.1 * abs(sigma / sigmaG - 1) * np.sqrt(N)

        print 70 * '_'
        print "  Kolmogorov-Smirnov test: D = %.2g  p = %.2g" % (D, pD)
        print "  Anderson-Darling test: A^2 = %.2g" % A2
        print "    significance  | critical value "
        print "    --------------|----------------"
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Create distributions

# draw underlying points
np.random.seed(0)
Npts = int(1e6)
x = np.random.normal(loc=0, scale=1, size=Npts)

# add error for each point
e = 3 * np.random.random(Npts)
x += np.random.normal(0, e)

# compute anderson-darling test
A2, sig, crit = anderson(x)
print "anderson-darling A^2 = %.1f" % A2

# compute point statistics
mu_sample, sig_sample = mean_sigma(x, ddof=1)
med_sample, sigG_sample = median_sigmaG(x)

#------------------------------------------------------------
# plot the results
fig, ax = plt.subplots(figsize=(5, 3.75))
ax.hist(x, 100, histtype='stepfilled', alpha=0.2,
        color='k', normed=True)

# plot the fitting normal curves
x_sample = np.linspace(-15, 15, 1000)
ax.plot(x_sample, norm(mu_sample, sig_sample).pdf(x_sample),
Example #57
0
pdf = [normal_pdf, dual_pdf]
xlims = [(-4, 4), (-4, 10)]


#------------------------------------------------------------
# Compute the statistics and plot the results
fig = plt.figure(figsize=(5, 7))
fig.subplots_adjust(left=0.13, right=0.95,
                    bottom=0.06, top=0.95,
                    hspace=0.1)

for i in range(2):
    ax = fig.add_subplot(2, 1, 1 + i)  # 2 x 1 subplot

    # compute some statistics
    A2, sig, crit = stats.anderson(vals[i])
    D, pD = stats.kstest(vals[i], "norm")
    W, pW = stats.shapiro(vals[i])

    mu, sigma = mean_sigma(vals[i], ddof=1)
    median, sigmaG = median_sigmaG(vals[i])

    N = len(vals[i])
    Z1 = 1.3 * abs(mu - median) / sigma * np.sqrt(N)
    Z2 = 1.1 * abs(sigma / sigmaG - 1) * np.sqrt(N)

    print 70 * '_'
    print "  Kolmogorov-Smirnov test: D = %.2g  p = %.2g" % (D, pD)
    print "  Anderson-Darling test: A^2 = %.2g" % A2
    print "    significance  | critical value "
    print "    --------------|----------------"
Example #58
0
    def fit(self, data):

        magnitude = data[0]
        ander = stats.anderson(magnitude)[0]
        return 1 / (1.0 + np.exp(-10 * (ander - 0.3)))
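# The fit() method above squashes the Anderson-Darling statistic through a
# logistic curve, 1 / (1 + exp(-10 * (A2 - 0.3))), so clearly non-normal data
# map toward 1 and roughly normal data stay lower. A small stand-alone check
# of that behavior (a sketch; names and data are illustrative):
import numpy as np
from scipy import stats


def anderson_feature(magnitude):
    # same transformation as in fit() above
    ander = stats.anderson(magnitude)[0]
    return 1 / (1.0 + np.exp(-10 * (ander - 0.3)))


rng = np.random.default_rng(42)
print(anderson_feature(rng.normal(size=1000)))        # moderate value for normal data
print(anderson_feature(rng.exponential(size=1000)))   # close to 1 for skewed data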
Example #59
0
import numpy as np
from sklearn import preprocessing
from scipy.stats import anderson


rain = np.load('rain.npy')
rain = .1 * rain
rain[rain < 0] = .05/2
print "Rain mean", rain.mean()
print "Rain variance", rain.var()
print "Anderson rain", anderson(rain)

scaled = preprocessing.scale(rain)
print "Scaled mean", scaled.mean()
print "Scaled variance", scaled.var()
print "Anderson scaled", anderson(scaled)

print len(rain[rain < 0])
binarized = preprocessing.binarize(rain)
print np.unique(binarized), binarized.sum()

lb = preprocessing.LabelBinarizer()
lb.fit(rain.astype(int))
print lb.classes_
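# preprocessing.scale above is an affine transformation (subtract the mean,
# divide by the standard deviation), so it changes the first two moments but
# not the shape of the distribution -- which is why the Anderson-Darling
# statistic is essentially unchanged between the raw and scaled data. A quick
# sketch of that point (illustrative data, not the rain series used above):
import numpy as np
from scipy.stats import anderson
from sklearn import preprocessing

rng = np.random.default_rng(0)
skewed = rng.exponential(scale=2.0, size=1000)
scaled = preprocessing.scale(skewed)

print("A^2 before scaling:", anderson(skewed).statistic)
print("A^2 after scaling:", anderson(scaled).statistic)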