Beispiel #1
0
def statistic_tests(name_of_file):
    """Print the results of a series of statistical tests on two columns.

    The rows returned by ``read_file(name_of_file)`` are distributed over
    two lists by ``fill_tables``.  Every test is announced with a heading,
    then its raw result is printed.  Nothing is returned.

    Parameters
    ----------
    name_of_file : forwarded unchanged to ``read_file``.
    """
    column_1, column_2 = [], []
    rows = read_file(name_of_file)
    fill_tables(column_1, column_2, rows)

    # Single-sample tests are reported once per column; two-sample tests
    # receive both columns in one call.
    print('Rank-Sum')
    rank_1 = rank_sum(column_1)
    rank_2 = rank_sum(column_2)
    print('ranksum column 1:', rank_1, 'column 2:', rank_2)

    print('Kruskal')
    kruskal_result = kruskal(column_1, column_2)
    print(kruskal_result)

    print('ANOVA')
    anova_result = f_oneway(column_1, column_2)
    print(anova_result)

    print('Brunner')
    brunner_result = brunnermunzel(column_1, column_2)
    print(brunner_result)

    print('Whitney')
    whitney_result = mannwhitneyu(column_1, column_2)
    print(whitney_result)

    print('Barlet')
    bartlett_result = barlet_test(column_1, column_2)
    print(bartlett_result)

    print('Levene')
    levene_result = levene_test(column_1, column_2)
    print(levene_result)

    print('Shapiro')
    shapiro_1 = shapiro(column_1)
    shapiro_2 = shapiro(column_2)
    print('shapiro column 1:', shapiro_1, 'column 2:', shapiro_2)

    print('T-Student')
    ttest_result = ttest_ind(column_1, column_2)
    print(ttest_result)

    print('Lilliefors')
    lilliefors_1 = lilliefors(column_1)
    lilliefors_2 = lilliefors(column_2)
    print('liliefors', 'column 1:', lilliefors_1, 'column 2:', lilliefors_2)
    def test_normality(self):
        """Compare Lilliefors and Anderson-Darling results with R references.

        The reference dictionaries hold values produced in R; the commands
        that generated them are kept as ``#>`` comments.  Each test is run
        on the residuals, the squared residuals and the first 20 residuals
        of ``self.res``.
        """
        fitted = self.res

        #> library(nortest) #Lilliefors (Kolmogorov-Smirnov) normality test
        #> lt = lillie.test(residuals(fm))
        #> mkhtest(lt, "lilliefors", "-")
        expected_lf_full = {
            'statistic': 0.0723390908786589,
            'pvalue': 0.01204113540102896,
            'parameters': (),
            'distr': '-',
        }

        #> lt = lillie.test(residuals(fm)**2)
        #> mkhtest(lt, "lilliefors", "-")
        expected_lf_squared = {
            'statistic': 0.301311621898024,
            'pvalue': 1.004305736618051e-51,
            'parameters': (),
            'distr': '-',
        }

        #> lt = lillie.test(residuals(fm)[1:20])
        #> mkhtest(lt, "lilliefors", "-")
        expected_lf_head = {
            'statistic': 0.1333956004203103,
            'pvalue': 0.455683,
            'parameters': (),
            'distr': '-',
        }

        actual_lf_full = smsdia.lilliefors(fitted.resid)
        actual_lf_squared = smsdia.lilliefors(fitted.resid**2)
        actual_lf_head = smsdia.lilliefors(fitted.resid[:20])

        compare_t_est(actual_lf_full, expected_lf_full, decimal=(14, 14))
        # pvalue very small, so it is additionally checked with a relative
        # tolerance below
        compare_t_est(actual_lf_squared, expected_lf_squared,
                      decimal=(14, 14))
        assert_allclose(actual_lf_squared[1],
                        expected_lf_squared['pvalue'],
                        rtol=1e-10)
        # R uses different approximation for pvalue in last case, hence the
        # loose p-value precision
        compare_t_est(actual_lf_head, expected_lf_head, decimal=(14, 1))

        #> ad = ad.test(residuals(fm))
        #> mkhtest(ad, "ad3", "-")
        expected_ad_full = {
            'statistic': 1.602209621518313,
            'pvalue': 0.0003937979149362316,
            'parameters': (),
            'distr': '-',
        }

        #> ad = ad.test(residuals(fm)**2)
        #> mkhtest(ad, "ad3", "-")
        # Kept for reference only; the squared-residual case is checked with
        # np.isinf below rather than against this dictionary.
        expected_ad_squared = {
            'statistic': np.inf,
            'pvalue': np.nan,
            'parameters': (),
            'distr': '-',
        }

        #> ad = ad.test(residuals(fm)[1:20])
        #> mkhtest(ad, "ad3", "-")
        expected_ad_head = {
            'statistic': 0.3017073732210775,
            'pvalue': 0.5443499281265933,
            'parameters': (),
            'distr': '-',
        }

        actual_ad_full = smsdia.normal_ad(fitted.resid)
        compare_t_est(actual_ad_full, expected_ad_full, decimal=(11, 13))
        actual_ad_squared = smsdia.normal_ad(fitted.resid**2)
        assert_(np.isinf(actual_ad_squared[0]))
        actual_ad_head = smsdia.normal_ad(fitted.resid[:20])
        compare_t_est(actual_ad_head, expected_ad_head, decimal=(11, 12))
def check_normality():
    '''Draw a seeded normal sample and run four normality tests on it.

    The tests are run on the full sample and on its first 100 values.  A
    histogram and a probability plot are shown, both sets of p-values are
    printed, and the Kolmogorov-Smirnov p-value of the full sample is
    returned.
    '''

    # Sample size and distribution parameters
    sample_size = 1000
    mean, sd = 0, 3

    # Fixed seed so that repeated runs give the same values
    np.random.seed(1234)

    # Generate and show random data
    data = stats.norm.rvs(mean, sd, size=sample_size)
    head = data[:100]
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: points lying on a line indicate an approximately
    # normal distribution
    _ = stats.probplot(data, plot=plt)
    plt.show()

    def standardised_ks(values):
        # Original Kolmogorov-Smirnov test, applied to the sample after
        # centring and scaling with its own mean and (ddof=1) std
        scaled = (values - np.mean(values)) / np.std(values, ddof=1)
        return stats.kstest(scaled, 'norm')

    # Label -> test; each test yields a (statistic, p-value) pair.
    # 'Omnibus' is scipy's normaltest (D'Agostino and Pearson), which
    # combines skew and kurtosis.
    tests = (
        ('Omnibus', stats.normaltest),
        ('Shapiro-Wilk', stats.shapiro),
        ('Lilliefors', lilliefors),
        ('Kolmogorov-Smirnov', standardised_ks),
    )

    pvalues_all = pd.Series()
    pvalues_head = pd.Series()
    for label, run_test in tests:
        _, pvalues_all[label] = run_test(data)
        _, pvalues_head[label] = run_test(head)

    print('p-values for all {0} data points: ----------------'.format(
        len(data)))
    print(pvalues_all)
    print('p-values for the first 100 data points: ----------------')
    print(pvalues_head)

    if pvalues_all['Omnibus'] > 0.05:
        print('Data are normally distributed')
    # --- >>> STOP stats <<< ---

    return pvalues_all['Kolmogorov-Smirnov']
def check_normality(data: np.ndarray, show_flag: bool = True) -> pd.Series:
    """Check if the distribution is normal

    Four tests are run on ``data`` and on a thinned copy containing every
    10th value: the omnibus test (``stats.normaltest``), Shapiro-Wilk,
    Lilliefors, and the Kolmogorov-Smirnov test applied to the
    standardised sample.  Both sets of p-values are printed.

    Parameters
    ----------
    data : vector of data to be tested
    show_flag : controls the display of data (probability plot)

    Returns
    -------
    ps : pandas Series of p-values for the full sample, indexed by test
        name ('Omnibus', 'Shapiro-Wilk', 'Lilliefors',
        'Kolmogorov-Smirnov')
    """

    # Thinned sample: every 10th observation, not the first N values
    few_data = data[::10]

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    if show_flag:
        _ = stats.probplot(data, plot=plt)
        plt.show()

    # Explicit dtype: an empty Series without one has a version-dependent
    # default dtype
    pVals = pd.Series(dtype=float)
    pFewVals = pd.Series(dtype=float)

    # The scipy normaltest is based on D-Agostino and Pearsons test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['Omnibus'] = stats.normaltest(data)
    _, pFewVals['Omnibus'] = stats.normaltest(few_data)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk'] = stats.shapiro(data)
    _, pFewVals['Shapiro-Wilk'] = stats.shapiro(few_data)

    # Or you can check for normality with Lilliefors-test
    _, pVals['Lilliefors'] = lilliefors(data)
    _, pFewVals['Lilliefors'] = lilliefors(few_data)

    # Alternatively with original Kolmogorov-Smirnov test, after
    # standardising with the sample mean and (ddof=1) standard deviation
    _, pVals['Kolmogorov-Smirnov'] = stats.kstest(
        (data - np.mean(data)) / np.std(data, ddof=1), 'norm')
    _, pFewVals['Kolmogorov-Smirnov'] = stats.kstest(
        (few_data - np.mean(few_data)) / np.std(few_data, ddof=1), 'norm')

    print(f'p-values for all {len(data)} data points: ----------------')
    print(pVals)
    # The heading previously claimed "the first 100 data points", which is
    # not what few_data contains.
    print(f'p-values for every 10th data point ({len(few_data)} values): '
          '----------------')
    print(pFewVals)

    if pVals['Omnibus'] > 0.05:
        print('Data are normally distributed')
    # --- >>> STOP stats <<< ---

    return pVals
    def test_normality(self):
        """Verify normality diagnostics of ``self.res`` against R output.

        Lilliefors and Anderson-Darling tests are run on the residuals,
        their squares and the first 20 residuals; results are compared
        with reference numbers taken from R (commands kept as ``#>``
        comments).
        """
        def reference(statistic, pvalue):
            # All reference records share the same empty parameters/distr.
            return dict(statistic=statistic, pvalue=pvalue,
                        parameters=(), distr='-')

        model = self.res

        #> library(nortest) #Lilliefors (Kolmogorov-Smirnov) normality test
        #> lt = lillie.test(residuals(fm))
        #> mkhtest(lt, "lilliefors", "-")
        ref_lf_a = reference(0.0723390908786589, 0.01204113540102896)

        #> lt = lillie.test(residuals(fm)**2)
        #> mkhtest(lt, "lilliefors", "-")
        ref_lf_b = reference(0.301311621898024, 1.004305736618051e-51)

        #> lt = lillie.test(residuals(fm)[1:20])
        #> mkhtest(lt, "lilliefors", "-")
        ref_lf_c = reference(0.1333956004203103, 0.20)

        got_lf_a = smsdia.lilliefors(model.resid)
        got_lf_b = smsdia.lilliefors(model.resid**2)
        got_lf_c = smsdia.lilliefors(model.resid[:20])

        compare_t_est(got_lf_a, ref_lf_a, decimal=(14, 14))
        # pvalue very small: also compared with a relative tolerance
        compare_t_est(got_lf_b, ref_lf_b, decimal=(14, 14))
        assert_allclose(got_lf_b[1], ref_lf_b['pvalue'], rtol=1e-10)
        # R uses different approximation for pvalue in last case
        compare_t_est(got_lf_c, ref_lf_c, decimal=(14, 1))

        #> ad = ad.test(residuals(fm))
        #> mkhtest(ad, "ad3", "-")
        ref_ad_a = reference(1.602209621518313, 0.0003937979149362316)

        #> ad = ad.test(residuals(fm)**2)
        #> mkhtest(ad, "ad3", "-")
        # Not passed to compare_t_est; only the infinite statistic is
        # asserted below.
        ref_ad_b = reference(np.inf, np.nan)

        #> ad = ad.test(residuals(fm)[1:20])
        #> mkhtest(ad, "ad3", "-")
        ref_ad_c = reference(0.3017073732210775, 0.5443499281265933)

        got_ad_a = smsdia.normal_ad(model.resid)
        compare_t_est(got_ad_a, ref_ad_a, decimal=(11, 13))
        got_ad_b = smsdia.normal_ad(model.resid**2)
        assert_(np.isinf(got_ad_b[0]))
        got_ad_c = smsdia.normal_ad(model.resid[:20])
        compare_t_est(got_ad_c, ref_ad_c, decimal=(11, 12))
Beispiel #6
0
    def normTest(self, p=0.05):
        """Test ``self.data`` for normality with a size-dependent test.

        The test is chosen from ``len(self.data)``:

        * up to 20 values: Shapiro-Wilk
        * 21 to 50 values: D'Agostino-Pearson (``stats.normaltest``)
        * 51 to 300 values: Lilliefors
        * more than 300 values: Kolmogorov-Smirnov against ``'norm'``

        Parameters
        ----------
        p : significance level; a p-value below it rejects normality.

        Returns
        -------
        bool : ``False`` if normality is rejected, ``True`` otherwise.
        """
        size = len(self.data)
        if size <= 20:
            p_value = stats.shapiro(self.data)[1]
            name = 'shapiro'
        elif size <= 50:
            # D'Agostino-Pearson Test, sample size 20-50
            p_value = stats.normaltest(self.data)[1]
            name = 'normaltest (D Agostino-Pearson)'
        elif size <= 300:
            # Hubert Lilliefors.  lilliefors() returns (statistic, p-value);
            # only the p-value may be compared with ``p`` below.
            p_value = lilliefors(self.data)[1]
            name = 'lillifors'
        else:
            # NOTE(review): 'norm' without args is the standard normal, so
            # unstandardised data will be rejected - confirm this is wanted.
            p_value = stats.kstest(self.data, 'norm')[1]
            name = 'KStest'

        print('-' * 10, ' NORMAL TEST ', '-' * 10)
        print("USE: ", name)
        if p_value < p:
            print("Conclusion: data are not normally distributed")
            return False
        print("Conclusion: data are normally distributed")
        return True
def normal_test(sample, alpha=0.05, verbose=False):
    """Run five tests on ``sample`` and return the last p-value with alpha.

    Null hypothesis of each report line: the data is gaussian distributed.
    With ``verbose`` set, one line per test states whether its p-value
    exceeds ``alpha``.

    Returns
    -------
    (p, alpha) : ``p`` is the p-value of the final test (``normaltest``);
        the earlier p-values are only printed.
    """
    # Shapiro-Wilk
    _, p = shapiro(sample)
    if verbose:
        print('Shapiro this is Gaussian' if p > alpha
              else 'Shapiro this is NOT Gaussian', p)

    # chisquare
    _, p = chisquare(sample)
    if verbose:
        print('Chisquare this is Gaussian' if p > alpha
              else 'Chisquare this is NOT Gaussian', p)

    # lilliefors
    _, p = lilliefors(sample)
    if verbose:
        print('Lilliefors this is Gaussian' if p > alpha
              else 'Lilliefors this is NOT Gaussian', p)

    # kolmogorov
    _, p = kstest(sample, 'norm')
    if verbose:
        print('Kolmogorov this is Gaussian' if p > alpha
              else 'Kolmogorov this is NOT Gaussian', p)

    # Angostino
    _, p = normaltest(sample)
    if verbose:
        print('Angostino this is Gaussian' if p > alpha
              else 'Angostino this is NOT Gaussian', p)

    return p, alpha
def load_analysis(data, L, method, plot=True):
    """Collect per-day errors of ``method`` and optionally report them.

    ``data`` is grouped by day of month (``data.date.dt.day``); for each
    group ``calculate_daily_error`` is called on the ``L.column`` column.
    The stacked result is indexed as a 3-D array whose last axis is the
    step.

    Parameters
    ----------
    data : frame with a datetime ``date`` column and the ``L.column``
        column.
    L : object providing ``column`` and ``N``; passed on to the helpers.
    method : callable passed to ``calculate_daily_error``; its
        ``__name__`` labels the output.
    plot : when true, print statistics and the Lilliefors result and show
        the plots.

    Returns
    -------
    error_df : DataFrame with one column per step (named 1, 2, ...)
        holding the flattened errors of that step.
    """
    daily_errors = np.asarray([
        calculate_daily_error(L, day_df[L.column], method)
        for _, day_df in data.groupby(data.date.dt.day)
    ])

    error_df = pd.DataFrame()
    for step in range(daily_errors.shape[2]):
        # Columns are numbered from 1
        error_df[step + 1] = daily_errors[:, :, step].flatten()

    if not plot:
        return error_df

    print("*" * 100)
    print(
        "Started load analysis with N={}, signal = {}, method = {}".format(
            L.N, L.column, method.__name__))
    print("*" * 100 + "\n")
    print("****** Statistics ******")
    print(error_df.describe())
    print("\n" + "*" * 30 + " Lilliefors " + "*" * 30)
    # lilliefors() returns (statistic, p-value).  The old code printed
    # element [1] (the p-value) under the "test-statistic" label; both
    # values are now printed with correct labels.
    ks_stat, p_value = lilliefors(daily_errors.flatten(), dist="norm")
    print("Lilliefors test-statistic:", ks_stat, "p-value:", p_value)
    estimate_rmse(error_df)
    plot_predictions(L, method)
    plot_boxplot(error_df, method.__name__)
    plot_daily_errors(daily_errors, method.__name__)
    plot_error_hist(daily_errors, method.__name__)
    plt.show()

    return error_df
Beispiel #9
0
    def norm_test(self):
        """Test ``self.data`` for normality at the 0.05 level.

        The test is chosen from ``len(self.data)``:

        * up to 20 values: Shapiro-Wilk
        * 21 to 50 values: D'Agostino-Pearson (``stats.normaltest``)
        * 51 to 300 values: Lilliefors
        * more than 300 values: Kolmogorov-Smirnov against ``'norm'``

        Returns
        -------
        bool : ``False`` if the p-value is below 0.05, ``True`` otherwise.
        """
        size = len(self.data)
        if size <= 20:
            p_value = stats.shapiro(self.data)[1]
            name = 'shapiro'
        elif size <= 50:
            # D'Agostino-Pearson Test, sample size 20-50
            p_value = stats.normaltest(self.data)[1]
            name = 'normaltest'
        elif size <= 300:
            # Hubert Lilliefors.  lilliefors() returns (statistic, p-value);
            # keep only the p-value so the comparison below is valid.
            p_value = lilliefors(self.data)[1]
            name = 'lillifors'
        else:
            # NOTE(review): 'norm' without args is the standard normal, so
            # unstandardised data will be rejected - confirm this is wanted.
            p_value = stats.kstest(self.data, 'norm')[1]
            name = 'KSTEST'

        # print() calls replace the Python 2 print statements.
        print("USE ", name)
        if p_value < 0.05:
            print("DATA ARE NOT NORMALLY DISTRIBUTED")
            return False
        print("DATA ARE NORMALLY DISTRIBUTED")
        return True
Beispiel #10
0
    def normality_test(self, test_type='ks'):
        """
        Run one normality test on every variable in ``self._variables``.

        Missing values are dropped from each column of ``self._data``
        before testing.

        Parameters
        ----------
        test_type : str
            'ks' runs ``lilliefors`` with ``pvalmethod='approx'``;
            'sw' runs ``shapiro``.

        Returns
        -------
        DataFrame with one row per variable and the columns 'statistic'
        and 'p-value'.

        Raises
        ------
        ValueError
            If ``test_type`` is neither 'ks' nor 'sw'.
        """
        if test_type == 'ks':
            def run_test(values):
                return lilliefors(values, pvalmethod='approx')
        elif test_type == 'sw':
            def run_test(values):
                return shapiro(values)
        else:
            raise ValueError(
                "Unknown normality test type. Possible values: 'ks' (Kolmogorov-Smirnov) ans 'sw' (Shapiro-Wilk)"
            )

        per_variable = {}
        for name in self._variables:
            statistic, p_value = run_test(self._data[name].dropna())
            per_variable[name] = [statistic, p_value]

        # Variables become rows after the transpose.
        table = pd.DataFrame(per_variable, index=['statistic', 'p-value'])
        return table.T
Beispiel #11
0
def compute_and_print_lilliefors(all_data_raw, label, p_value):
    """Run the Lilliefors test on the last column and print the verdict.

    Parameters
    ----------
    all_data_raw : 2-D indexable; its last column (``[:, -1]``) is tested.
    label : text placed at the start of the printed line.
    p_value : threshold; a test p-value below it is reported as
        "IS NOT normally distributed".

    Returns
    -------
    The extracted last column.
    """
    # Last column holds the final fitness train values
    last_column = all_data_raw[:, -1]
    _, pval = lilliefors(last_column)
    verdict = (f'{label} IS NOT normally distributed (p={pval})'
               if pval < p_value
               else f'{label} IS normally distributed (p={pval})')
    print(verdict)
    return last_column
    def fit(self, x, dist='norm', pvalmethod='table'):
        """Perform the Lilliefors test and store its result on the instance.

        The statistic is stored in ``self._statistic`` and the p-value in
        ``self._p``; nothing is returned.

        Parameters
        ----------
        x : array_like, 1d
            Data to test.
        dist : str, optional
            Assumed distribution, forwarded to ``lilliefors``.
        pvalmethod : str, optional
            P-value computation method, forwarded to ``lilliefors``.
        """
        self._statistic, self._p = lilliefors(x, dist=dist, pvalmethod=pvalmethod)
Beispiel #13
0
def testy_norm(lista_gestosci, indeks):
    """Print four normality tests for one sample.

    Each result line starts with the label ``dane_10_lat[indeks][0]`` (a
    module-level table) followed by the raw result of the test.

    Parameters
    ----------
    lista_gestosci : sequence of samples.
    indeks : index of the sample to test and of its label.
    """
    sample = lista_gestosci[indeks]
    label = dane_10_lat[indeks][0]

    print('Shapiro-Wilk')
    print(label, stats.shapiro(sample))
    print('Lilliefors')
    print(label, lilliefors(sample))
    print('D’Agostino’s K^2 Test')
    print(label, stats.normaltest(sample))
    print('Anderson-Darling test')
    # 'norm' selects the distribution tested by anderson(); it used to be
    # passed to print() by mistake and appeared as stray output.
    print(label, stats.anderson(sample, 'norm'))
def check_normality(testData, alpha=0.05):
    """Test ``testData`` for normality with a size-dependent test.

    The test is chosen from ``len(testData)``:

    * 21 to 49 values: ``stats.normaltest``
    * up to 20 values: Shapiro-Wilk
    * 50 to 300 values: Lilliefors
    * more than 300 values: Kolmogorov-Smirnov against ``'norm'``

    The statistic and p-value are printed, followed by the name of the
    test and the conclusion.

    Parameters
    ----------
    testData : sample to test.
    alpha : significance level; a p-value below it rejects normality.

    Returns
    -------
    bool : ``False`` if normality is rejected, ``True`` otherwise.
    """
    size = len(testData)
    if 20 < size < 50:
        # 20 < sample size < 50: use normaltest
        statistic, p_value = stats.normaltest(testData)
        used = 'use normaltest'
    elif size < 50:
        # Remaining samples below 50 (at most 20 values): Shapiro-Wilk
        statistic, p_value = stats.shapiro(testData)
        used = "use shapiro:"
    elif size <= 300:
        statistic, p_value = lilliefors(testData)
        used = "use lillifors:"
    else:
        # Uses ``stats`` like the other branches; the previous
        # ``scipy.stats.kstest`` needed a separate ``import scipy``.
        # NOTE(review): 'norm' without args is the standard normal, so
        # unstandardised data will be rejected - confirm this is wanted.
        statistic, p_value = stats.kstest(testData, 'norm')
        used = "use kstest:"

    print(statistic, p_value)
    print(used)
    if p_value < alpha:
        print('data are not normal distributed')
        return False
    print('data are normal distributed')
    return True
Beispiel #15
0
    def secondtextchanged(self):
        """Summarise the numbers typed into ``plainTextEdit_11``.

        The text is split on whitespace and converted to floats; descriptive
        statistics and several normality tests are computed and the report
        is written to ``plainTextEdit_12``.  Any exception raised on the way
        is printed and shown in a warning box; on success an information
        box is shown.
        """
        try:

            # Parse whitespace-separated numbers
            xx = self.plainTextEdit_11.toPlainText()
            xx = xx.split()
            xa = [float(x) for x in xx]

            from scipy import stats
            from scipy.stats import shapiro
            stat1, p1 = shapiro(xa)
            #print('Statistics=%.3f, p=%.3f' % (stat, p))
            from scipy.stats import normaltest
            stat2, p2 = normaltest(xa)
            #print('Statistics=%.3f, p=%.3f' % (stat, p))
            # stat3/p3 are computed but not included in the report (the
            # corresponding report line is commented out below)
            from scipy.stats import chisquare
            stat3, p3 = chisquare(xa)
            from statsmodels.stats.diagnostic import lilliefors
            stat4, p4 = lilliefors(xa)
            from scipy.stats import jarque_bera
            stat5, p5 = jarque_bera(xa)
            # stat6/p6 are likewise computed but not reported
            from scipy.stats import kstest
            stat6, p6 = kstest(xa, "norm")
            from scipy.stats import skew
            val1 = round(skew(xa), 3)
            from scipy.stats import kurtosis
            val2 = round(kurtosis(xa), 3)
            import statistics
            mm = f'{round(statistics.mean(xa), 3)} ± {round(statistics.stdev(xa), 3)}'
            text = f"Count of data is {len(xa)}\n\n"

            # Assemble the report shown to the user
            text += f"Mean ± standard deviation: {mm}\n\n"
            text += f"skewness = {val1}\n"
            text += f"kurtosis = {val2}\n\n"
            text += f"Shapiro-Wilk Test:\nstat= {round(stat1, 4)}, p-value= {round(p1, 4)}\n\n"
            text += f"D’Agostino’s K-squared test:\nstat= {round(stat2, 4)}, p-value= {round(p2, 4)}\n\n"
            # text += f"Chi-Square Normality Test:\nstat= {round(stat3, 4)}, p-value= {round(p3, 4)}\n\n"
            text += f"Lilliefors Test for Normality:\nstat= {round(stat4, 4)}, p-value= {round(p4, 4)}\n\n"
            text += f"Jarque–Bera test for Normality:\nstat= {round(stat5, 4)}, p-value= {round(p5, 4)}\n\n"
            # text += f"Kolmogorov-Smirnov test for Normality:\nstat= {round(stat6, 4)}, p-value= {round(p6, 4)}\n\n"
            self.plainTextEdit_12.setPlainText(text)

        except Exception as e:
            # Any failure (parsing, a test raising, ...) is reported to the
            # user instead of propagating

            print(e)

            QMessageBox.warning(self, "Warning",
                                f"The output not obtained because {e}")
            return
        # Reached only when no exception occurred
        QMessageBox.information(self, "Information",
                                "The output data generated successfully")
Beispiel #16
0
 def preprocess(self, test_size=0.3):
     """Clean and resample ``self.raw_df``, then test and correct normality.

     The raw frame is de-duplicated, resampled (weekly when it has more
     than 365 rows, daily otherwise) and stored in ``self.df`` with the
     columns ``ds`` and ``y``.  A Lilliefors statistic is compared with a
     Kolmogorov-Smirnov critical value; if ``self.normalize`` is set, a
     Box-Cox transformation is fitted on the training part and applied to
     the whole series.

     Parameters
     ----------
     test_size : fraction of rows (from the end) reserved for testing;
         the resulting row count is stored in ``self.index``.
     """
     # cleaning the data
     df = self.raw_df.dropna().drop_duplicates()

     # More than 365 rows: aggregate per week, otherwise per day
     self.step = "W" if len(df) > 365 else "D"

     # aggregating sales per resampling step
     df = df.resample(self.step).apply(sum)

     self.df = df.reset_index()
     self.df.columns = ["ds", "y"]

     # Number of trailing rows held out for testing
     self.index = int(len(self.df)*test_size)

     # ---------- Test of normality ------------

     # Lilliefors Test (only the statistic is used)
     self.lilliefors_D, _ = lilliefors(self.df.y)

     # Kolmogorov-Smirnov goodness-of-fit critical value at the 0.05
     # significance level
     # NOTE(review): Lilliefors critical values are smaller than the plain
     # K-S ones, so this comparison is conservative - confirm intent.
     self.KS_stat_05 = 1.36 / len(self.df)**0.5

     if self.lilliefors_D > self.KS_stat_05:
         print("[ The H0 normality hypothesis at alpha = 0.05 is rejected ]")
         print("[ Lilliefors test statistic: {:.5f}, Kolmogorov-Smirnov ".format(self.lilliefors_D) +
               "critical value: {:.5f} ]".format(self.KS_stat_05))
         self.normalize = True

     # Box-Cox transformation
     # NOTE(review): self.normalize is only ever set to True here; it is
     # presumably initialised elsewhere - verify.
     if self.normalize:

         # Keep strictly positive values only
         self.df = self.df[self.df.y > 0]

         # Fit lambda on the training rows only.  The previous slice
         # ``[:-self.index]`` was empty when self.index == 0; an explicit
         # positional end keeps every row in that case.
         n_train = max(len(self.df) - self.index, 0)
         _, self.optimal_lambda = stats.boxcox(self.df.y.iloc[:n_train])
         print("[ Applying Box-Cox Transformation. Optimal lambda: {:.5f} ]".format(self.optimal_lambda))
         self.df.y = stats.boxcox(self.df.y, self.optimal_lambda)
def lilliefors_kolmogorov(data, dist='norm'):
    """
    Test an assumed normal or exponential distribution using Lilliefors' test.

    Lilliefors' test is a Kolmogorov-Smirnov test
    with estimated parameters.

    Parameters
    ----------
    data : Array of sample data.
    dist : Assumed distribution, forwarded to ``lilliefors``
        ('norm' by default, which matches the previous behaviour;
        'exp' for the exponential distribution).

    Returns
    -------
    p-value : The p-value of the test.
    """
    _, p_value = lilliefors(data, dist=dist)
    return p_value
def check_normality(data):
    """Return statistic, sample size and p-value of three normality tests.

    Runs ``kstest(data, 'norm')``, ``shapiro`` and ``lilliefors`` on
    ``data``.

    Returns
    -------
    dict keyed by 'Kolmogorov-Smirnov', 'Lilliefors' and 'Shapiro-Wilk';
    each value is a dict with 'statistic', 'df' (``len(data)``) and
    'pvalue'.
    """
    ks_result = kstest(data, 'norm')
    sw_result = shapiro(data)
    lf_result = lilliefors(data)
    sample_size = len(data)

    def entry(statistic, pvalue):
        return {'statistic': statistic, 'df': sample_size, 'pvalue': pvalue}

    report = {}
    report['Kolmogorov-Smirnov'] = entry(ks_result.statistic,
                                         ks_result.pvalue)
    # lilliefors() gives a plain (statistic, p-value) pair
    report['Lilliefors'] = entry(lf_result[0], lf_result[1])
    report['Shapiro-Wilk'] = entry(sw_result.statistic, sw_result.pvalue)
    return report
    def fit(self, x, dist='norm', pvalmethod='table'):
        """Run the Lilliefors test on ``x`` and keep the outcome.

        The statistic is stored in ``self._statistic`` and the p-value in
        ``self._p``; nothing is returned.

        Parameters
        ----------
        x : array_like, 1d
            Data to test.

        dist : {'norm', 'exp'}, optional
            The assumed distribution.

        pvalmethod : {'approx', 'table'}, optional
            How the p-value of the test statistic is computed.  'table'
            is generally preferred and relies on a very large simulation.
            'approx' is valid for normality only (with dist = 'exp' the
            table is always used); it applies the approximation formula
            of Dalal and Wilkinson, valid for pvalues < 0.1, and falls
            back to the table result for larger pvalues.

        """
        outcome = lilliefors(x, dist=dist, pvalmethod=pvalmethod)
        self._statistic, self._p = outcome
Beispiel #20
0
# Box plot of the mathematics score (NU_NOTA_MT) per value of TP_SEXO
sns.boxplot(x=enem['TP_SEXO'], y=enem['NU_NOTA_MT'])
plt.xlabel("")
plt.ylabel("Nota de Matemática")
plt.show()

from scipy import stats
from statsmodels.stats import diagnostic

# Split the scores into the "F" and "M" groups, dropping missing scores
sexo = enem[['TP_SEXO', 'NU_NOTA_MT']]
sexo_f = sexo.query('TP_SEXO == "F"').drop('TP_SEXO',axis=1).dropna()
sexo_m = sexo.query('TP_SEXO == "M"').drop('TP_SEXO',axis=1).dropna()
# Group sizes
print(sexo_f.shape[0])
print(sexo_m.shape[0])

# Lilliefors normality test per group
print('sexo_f:',diagnostic.lilliefors(sexo_f))
print('sexo_m:',diagnostic.lilliefors(sexo_m))

# Two-sided Mann-Whitney U test between the groups; the result is not
# stored or printed here (visible only when run interactively)
stats.mannwhitneyu(sexo_f, sexo_m, alternative='two-sided')

# Same kind of box plot, grouped by the answer to question Q025
sns.boxplot(x=enem['Q025'], y=enem['NU_NOTA_MT'])
plt.xlabel("Tem internet em casa?")
plt.ylabel("Nota de Matemática")
plt.show()

# Split the scores into the "Não" and "Sim" groups, dropping missing scores
internet = enem[['Q025', 'NU_NOTA_MT']]
internet_n = internet.query('Q025 == "Não"').drop('Q025',axis=1).dropna()
internet_s = internet.query('Q025 == "Sim"').drop('Q025',axis=1).dropna()
# Group sizes
print(internet_n.shape[0])
print(internet_s.shape[0])
                               columnData,
                               alternative='greater')

        if print_pval == 1:
            print('Statistic = %.5f, p=%.20f' % (stat, p))
            # p_total = p_total+p
            alpha = alpha_c  #/100 #apply bonferroni correction for 100 tests
            if p > alpha:
                print(
                    'Samples are not significantly different (fail to reject H0)'
                )
            else:
                print('Samples are significantly different  (reject H0)')

    #check if individual distributions are normally distributed with lilliefors
        stat_lillie, p_lillie = lilliefors(columnData, pvalmethod='table')
        # print('Statistics=%.3f, p=%.3f' % (stat, p))
        # interpret

        if p_lillie > 0.05:
            distribution = 'normal'  #print('world ' + layer + ' looks Gaussian (fail to reject H0)')
        else:
            distribution = 'not normal'  #print('world ' + layer + ' does not look Gaussian (reject H0)')

        #calculate medians of each random sample set
        medians = statistics.median(columnData)

        #store p values and statistic in a dataframe, with names of the compared layers
        if '139' in variable_geopark:
            layer = variable_geopark.replace('_stats_139', '')
        else:
Beispiel #22
0
# statistics: fit a normal distribution to lwt (returns mean and std)
mu, std = scs.norm.fit(lwt)

# Plot the PDF of the fitted normal over the current x-axis range.
xmin, xmax = pl.xlim()
x = np.linspace(xmin, xmax, 100)
p = scs.norm.pdf(x, mu, std)
pl.plot(x, p, 'r--', linewidth=2)

# Normality hypothesis test at the 5% significance level:
# H0: the sample comes from a normal population
# H1: the sample does not come from a normal distribution

# Shapiro and Lilliefors tests:
s   = scs.shapiro(lwt)
lil = lilliefors(lwt)

# Annotate the plot with both p-values (element [1] of each result)
pl.text(225, 0.018, 'Shapiro: '+str(round(s[1], 5) )+'\nLilliefors: '+str(round(lil[1], 5)), bbox=dict(facecolor='red', alpha=0.4), zorder=4 )

pl.savefig("imgs/lowbw/ajuste_normal_lwt.pdf")
pl.show()



# Density histogram of bwt
# NOTE(review): the x label below is the lwt description (mother's weight)
# although bwt is plotted - looks like a copy-paste leftover; confirm.
bwt.hist(histtype='bar', density=True, ec='black', zorder=2)
pl.xticks(range(709, 5010, 428))
pl.xlabel("Peso da mãe na época da última menstruação (lb)")
pl.ylabel("Frequência")
pl.title("Ajuste de modelo normal a variável bwt")
pl.grid(axis='x')
# pl.xticks(range(10,101,10))
Beispiel #23
0
def eval_lilliefors(data, name: str = ""):
    """Run the Lilliefors normality test and hand the result on.

    The statistic and p-value are passed to ``print_hypothesis`` together
    with the module-level ``alpha`` and the given ``name``.
    """
    outcome = lilliefors(data, dist="norm")
    ksstat, pvalue = outcome
    print_hypothesis(alpha, name, ksstat, pvalue)
Beispiel #24
0
def plot_waveforms_wavelet_tranform(waveforms,
                                    cluster_ids=None,
                                    save_file=None,
                                    n_pc=4):
    """Scatter-plot the least normally distributed wavelet coefficients.

    All waveforms are stacked, decomposed with a Haar wavelet and the
    ``n_pc`` coefficients with the lowest Lilliefors p-values are kept.
    Every pair of those coefficients is drawn in its own subplot, one
    colour per entry of ``waveforms``.

    Parameters
    ----------
    waveforms : sequence of 2-D arrays; each is stacked row-wise
        (presumably one waveform per row - confirm with caller).
    cluster_ids : legend labels, one per entry of ``waveforms``;
        defaults to 0..len(waveforms)-1.
    save_file : if given, the figure is saved there.
    n_pc : number of coefficients to keep.

    Returns
    -------
    ``(None, None)`` when ``save_file`` is given, otherwise ``(fig, ax)``
    with ``ax`` reshaped to ``(n_rows, n_cols)``.
    """
    all_waves = np.vstack(waveforms)
    # Haar decomposition along axis 1; the coefficient arrays of all
    # levels are joined column-wise
    coeffs = pywt.wavedec(all_waves, 'haar', axis=1)
    all_coeffs = np.column_stack(coeffs)
    # Defaults (statistic 0, p-value 1) remain for skipped coefficients,
    # which therefore sort last below
    k_stats = np.zeros((all_coeffs.shape[1], ))
    p_vals = np.ones((all_coeffs.shape[1], ))
    for i, coef in enumerate(all_coeffs.T):
        if len(np.unique(coef)) == 1:  # to avoid nans
            continue

        try:
            k_stats[i], p_vals[i] = lilliefors(coef, dist='norm')
        except ValueError:
            # Coefficient could not be tested; keep its defaults
            continue

    # pick best coefficients as ones that are least normally distributed
    # that is lowest p-values from Lilliefors K-S test
    idx = np.argsort(p_vals)
    best_coeffs = all_coeffs[:, idx[:n_pc]]
    # Split the stacked rows back into one chunk per input, in order
    data = []
    for i, w in enumerate(waveforms):
        tmp = best_coeffs[:w.shape[0]]
        best_coeffs = best_coeffs[w.shape[0]:]
        data.append(tmp)

    if cluster_ids is None:
        cluster_ids = list(range(len(waveforms)))

    colors = [plt.cm.jet(x) for x in np.linspace(0, 1, len(waveforms))]
    # One subplot per unordered pair of kept coefficients
    pairs = list(it.combinations(range(n_pc), 2))
    # Smallest n_cols with n_cols**2 >= number of pairs
    n_cols = 1
    while np.power(n_cols, 2) < len(pairs):
        n_cols += 1

    n_rows = int(np.ceil(len(pairs) / n_cols))
    fig, ax = plt.subplots(nrows=n_rows,
                           ncols=n_cols,
                           figsize=(5 * (n_cols + 1), 5 * n_rows))
    # NOTE(review): assumes plt.subplots returned an array of axes; for a
    # 1x1 grid (e.g. n_pc=2) it presumably returns a single Axes - confirm.
    ax = ax.reshape(ax.size)
    for i, p in enumerate(pairs):
        for x, y, z in zip(data, cluster_ids, colors):
            ax[i].scatter(x[:, p[0]],
                          x[:, p[1]],
                          s=3,
                          alpha=0.5,
                          color=z,
                          label=y,
                          marker='o')

        ax[i].set_xlabel('Coefficient %i' % p[0])
        ax[i].set_ylabel('Coefficient %i' % p[1])

    handles, labels = ax[0].get_legend_handles_labels()
    if n_rows * n_cols > len(pairs):
        # The grid has more cells than pairs: the last cell is switched
        # off and used for the legend
        ax[-1].set_axis_off()
        ax[-1].legend(handles, labels, loc='center', shadow=True)
    else:
        # Grid is full: attach the legend below an axis of the bottom row
        idx = int(((n_cols * (n_rows - 1)) - 1) + np.ceil(n_cols / 2))
        ax[idx].legend(handles,
                       labels,
                       ncol=len(pairs),
                       loc='upper center',
                       bbox_to_anchor=(0.5, -0.05),
                       shadow=True)

    fig.suptitle('Wavelet transform coefficients')
    if save_file:
        fig.savefig(save_file)
        return None, None
    else:
        return fig, ax.reshape((n_rows, n_cols))
Beispiel #25
0
from scipy.stats import norm
from statsmodels.stats.diagnostic import lilliefors
# Draw 500 variates from scipy's default normal distribution
my_data = norm.rvs(size=500)
# Lilliefors test on the sample; the result is not stored or printed, so
# it is visible only when run interactively
lilliefors(my_data)
def distribution_hist_outlier_trat(X, trat=False, val_trat='', folder_name=''):
    '''
    Plot a histogram with a fitted normal curve for every column of ``X``.

    Each figure is annotated with the Shapiro and Lilliefors p-values and
    saved under ``imgs/<folder_name>/hists/``.  A summary table
    (``describe()`` plus skewness and kurtosis per column) is written to
    ``descricao_resumo<suffix>.csv`` in the same folder.

    Parameters
    ----------
    X : DataFrame whose columns are plotted one by one.
    trat : if true, outlier treatment is enabled: only values strictly
        between ``val_trat[column][0]`` and ``val_trat[column][1]`` are
        plotted, and the values outside those limits are written to a CSV
        file under ``hists/csv_trat``.
    val_trat : mapping from column name to a (lower, upper) pair of
        limits; used only when ``trat`` is true.
    folder_name : sub-folder of ``./imgs`` receiving the output.
    '''

    if folder_name != '':
        # Creates ./imgs/<folder_name>/hists/csv_trat including all missing
        # parents; existing directories are not an error.  This replaces
        # three os.mkdir calls whose failures were silently ignored.
        os.makedirs('./imgs/' + folder_name + '/hists/csv_trat',
                    exist_ok=True)

    # Summary table, created from the first column and extended afterwards
    D = None
    trat_tex = '_trat' if trat else ''

    for atr in X.columns:

        # Column name without whitespace, used in file names
        atr_name = ''.join(atr.split())

        pl.figure()

        if trat:
            try:
                Y = X[atr][(X[atr] > val_trat[atr][0])
                           & (X[atr] < val_trat[atr][1])]

                aux = X[atr][(X[atr] < val_trat[atr][0]) |
                             (X[atr] > val_trat[atr][1])]
                aux.to_csv("./imgs/" + folder_name +
                           "/hists/csv_trat/outliers_" + atr_name + "_" +
                           str(val_trat[atr]) + ".csv",
                           header=False)
            except Exception:
                # Best effort: without usable limits for this column (or
                # if the outlier CSV cannot be written) the untreated
                # column is plotted.
                Y = X[atr]
        else:
            Y = X[atr]

        Y.hist(histtype='bar', density=True, ec='black', zorder=2)

        # Integer tick positions spanning the data in about ten steps
        min_ = int(round(Y.min() - 0.5))
        max_ = int(round(Y.max() + 0.5))
        step = round((max_ - min_) / 10 + 0.5)

        pl.xticks(range(min_, max_, max(1, step)))

        pl.xlabel(atr)
        pl.ylabel("Frequência")

        pl.title("Histograma " + atr)
        pl.grid(axis='x')

        # statistics: fit a normal distribution (mean and std)
        mu, std = scs.norm.fit(Y)

        # Plot the PDF.
        xmin, xmax = pl.xlim()
        x = np.linspace(xmin, xmax, 100)
        p = scs.norm.pdf(x, mu, std)
        pl.plot(x, p, 'r--', linewidth=2)

        # Normality hypothesis test at the 5% significance level:
        # H0: the sample comes from a normal population
        # H1: the sample does not come from a normal distribution

        # Shapiro and Lilliefors tests:
        s = scs.shapiro(Y)
        lil = lilliefors(Y)

        ymin, ymax = pl.ylim()
        pl.text(xmin + xmin * 0.01,
                ymax - ymax * 0.12,
                'Shapiro: ' + str(round(s[1], 5)) + '\nLilliefors: ' +
                str(round(lil[1], 5)),
                bbox=dict(facecolor='red', alpha=0.4),
                zorder=4)

        if D is None:
            D = pd.DataFrame(Y.describe())
        else:
            D.loc[list(Y.describe().index), atr] = Y.describe()

        D.loc['skewness', atr] = scs.skew(Y)
        D.loc['kurtosis', atr] = scs.kurtosis(Y, fisher=False)

        pl.tight_layout()
        pl.savefig("imgs/" + folder_name + "/hists/" + atr_name + "_" +
                   trat_tex + ".png")

        pl.close()

    if D is None:
        # X has no columns: nothing was plotted and there is no summary
        return

    D.to_csv('imgs/' + folder_name + '/hists/descricao_resumo' + trat_tex +
             '.csv')
Beispiel #27
0
    # Plot the PDF.
    xmin, xmax = pl.xlim()
    x = np.linspace(xmin, xmax, 100)
    p = scs.norm.pdf(x, mu, std)
    pl.plot(x, p, 'r--', linewidth=2)

    print(mu, std)
    print(x)

    # Teste de hipotese de normalidade com 5% de significancia:
    # H0: A amostra provem de uma população normal
    # H1: A amostra nao provem de uma distribuicao normal

    # Testes de shapiro e lillefors:
    s = scs.shapiro(Y)
    lil = lilliefors(Y)

    ymin, ymax = pl.ylim()
    pl.text(xmin + xmin * 0.01,
            ymax - ymax * 0.12,
            'Shapiro: ' + str(round(s[1], 5)) + '\nLilliefors: ' +
            str(round(lil[1], 5)),
            bbox=dict(facecolor='red', alpha=0.4),
            zorder=4)
    pl.tight_layout()
    pl.savefig('teste_hipotese_' + Y.name + '.png')
    pl.show()
    pl.close()

pl.figure(figsize=(10, 8))
Y = dados.iloc[:, -1]
Beispiel #28
0
import numpy as np
from scipy import stats
from statsmodels.stats.diagnostic import lilliefors
import matplotlib.pyplot as plt

# One sample of 100 normal variates (mean 0, sd 10): K-S test with the
# mean and standard deviation estimated from the same sample, next to the
# Lilliefors test
x = stats.norm.rvs(0, 10, size=100)
print(stats.kstest(x, 'norm', args=(np.mean(x), np.sqrt(np.var(x)))))
print(lilliefors(x, dist='norm'))

# Repeat on 1000 fresh samples and collect the p-value of each test
p_ks, p_l = [], []
for i in range(1000):
    x = stats.norm.rvs(0, 10, size=100)
    p_ks.append(
        stats.kstest(x, 'norm', args=(np.mean(x), np.sqrt(np.var(x))))[1])
    p_l.append(lilliefors(x, dist='norm')[1])

# Plot the sorted p-values of both tests against an even grid on [0, 1]
X = np.linspace(0, 1, 1000)
p_ks, p_l = sorted(p_ks), sorted(p_l)
lab = ['ks', 'lilliefors']
# NOTE: plt.subplots returns a (figure, axes) pair, so `fig` holds that
# pair rather than the figure alone; it is not used below.
fig = plt.subplots(figsize=(18, 12), dpi=400)
plt.plot(X, p_ks)
plt.plot(X, p_l)
plt.legend(lab)
Beispiel #29
0
# Read whitespace-separated rows from `arq` until a line yields no fields
# (a blank line or end of file)
while True:
    r = arq.readline().split()
    if r != []:
        husbands.append(r)
    else:
        break

# The first collected row supplies the column names; all values are
# converted to int
hus = pd.DataFrame(husbands[1:], columns=husbands[0])
hus = hus.astype(int)

# In[]
pl.figure()

# Husband's age: keep positive values only
par = hus.ageh[hus.ageh > 0]
lil = lilliefors(par)
par.hist(density=True, histtype='bar', ec='black', color='w')
#pl.text(19, 27, 'Shapiro: '+str(round(scs.shapiro(par)[1], 5) ) +'\nKS:'+str(round(scs.kstest(par, 'norm')[1], 5) ), bbox=dict(facecolor='red', alpha=0.1) )
pl.grid()
# Annotate the histogram with the Shapiro and Lilliefors p-values
pl.text(
    17,
    0.031,
    'Shapiro: ' +
    str(round(scs.shapiro(par)[1],
              4))  #+'\nKS: '+str(round(scs.kstest(par, 'norm')[1], 4))
    + '\nLillie: ' + str(round(lil[1], 4)),
    bbox=dict(facecolor='red', alpha=0.1))
pl.title('Idade dos Maridos')
pl.xlabel('Idade')
pl.ylabel('Probabilidade')
def L(x, alfa=0.05):
    """Return 0 if the Lilliefors test rejects normality of ``x``, else 1.

    Normality is rejected when the p-value is below ``alfa``.
    """
    _, p_value = lilliefors(x, 'norm')
    return 0 if p_value < alfa else 1
Beispiel #31
0
# Bare expression: shows element 1 of an earlier result `y` when run
# interactively, and has no effect otherwise
y[1]

# Rows with Age above 35 (the frame is named df40 nonetheless)
df40 = df[df.Age > 35]
df40.PercentSalaryHike.sample(50).hist()
# Anderson-Darling test on a random sample of 30 values
y = anderson(df40.PercentSalaryHike.sample(30))

print(y[0])
print(y[1])
print(y[2] / 100)

# Shapiro tests on the full columns and on a random sample; the results
# are not stored or printed (visible only when run interactively)
shapiro(df.Age)
shapiro(df.Age.sample(50))
shapiro(df40.PercentSalaryHike)

# Lilliefors tests on the same data; results likewise discarded
lilliefors(df.Age)
lilliefors(df.Age.sample(50))
lilliefors(df40.PercentSalaryHike)
lilliefors(df40.PercentSalaryHike.sample(50))

#Min/Max value testing on skewed dist - sampled
import seaborn as sns
from scipy.stats import probplot
import matplotlib.pyplot as plt
from scipy.stats import kurtosis
# Collect element [0] of the Anderson-Darling result for 999 random
# samples of size 30
anderson_statistic_30 = []
for i in range(1, 1000):
    anderson_statistic_30.append(
        anderson(df40.PercentSalaryHike.sample(30))[0])

# Distribution of the collected values
sns.distplot(anderson_statistic_30)