Example #1
def _get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0):
    '''
    A function which takes two lists of equal length with corresponding entries and returns a dict containing
    a variety of statistical metrics.
    :param x_values: A list of values for the X-axis (experimental values).
    :param y_values: A list of values for the Y-axis (predicted values).
    :param fcorrect_x_cutoff: See get_xy_dataset_statistics.
    :param fcorrect_y_cutoff: See get_xy_dataset_statistics.
    :param x_fuzzy_range: See get_xy_dataset_statistics.
    :param y_scalar: See get_xy_dataset_statistics.
    :return: A dict of statistics.
    '''
    from scipy.stats import pearsonr, spearmanr, normaltest, ks_2samp, kstest, norm
    assert(len(x_values) == len(y_values))
    return dict(
        pearsonr = pearsonr(x_values, y_values),
        spearmanr = spearmanr(x_values, y_values),
        gamma_CC = gamma_CC(x_values, y_values),
        MAE = mae(x_values, y_values),
        normaltestx = normaltest(x_values),
        normaltesty = normaltest(y_values),
        kstestx = kstest(x_values, 'norm'),
        kstesty = kstest(y_values, 'norm'),
        ks_2samp = ks_2samp(x_values, y_values),
        fraction_correct = fraction_correct(x_values, y_values, x_cutoff = fcorrect_x_cutoff, y_cutoff = fcorrect_y_cutoff),
        fraction_correct_fuzzy_linear = fraction_correct_fuzzy_linear(x_values, y_values, x_cutoff = fcorrect_x_cutoff, x_fuzzy_range = x_fuzzy_range, y_scalar = y_scalar),
    )
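A hypothetical usage sketch of the function above, assuming the helper functions it references (gamma_CC, mae, fraction_correct, fraction_correct_fuzzy_linear) are defined in the same module; the data values are made up for illustration:

# Hypothetical experimental (x) and predicted (y) values; at least 8 points so
# that scipy's skewtest inside normaltest does not raise.
x = [1.2, -0.4, 2.5, 0.8, -1.1, 0.3, 1.9, -2.0]
y = [1.0, -0.2, 2.9, 0.5, -1.3, 0.1, 2.2, -1.7]
table = _get_xy_dataset_statistics(x, y)
print(table['pearsonr'])      # (correlation coefficient, p-value)
print(table['normaltestx'])   # D'Agostino-Pearson omnibus test on the x values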
def motifStats(data,motifSize,degree, usetotal=False):
	'''For each correlation type, write a table of KS-test p-values (MCI vs NL, AD vs NL) and normality-test p-values for every motif found in the NL, MCI and AD groups.'''
	for corr in ('corr','lcorr','lacorr'):
		motifsNL = findMotifs(data,('NL',corr), motifSize, degree, usetotal)
		motifsMCI = findMotifs(data,('MCI',corr), motifSize, degree, usetotal)
		motifsAD = findMotifs(data,('AD',corr), motifSize, degree, usetotal)
		
		allMotifs = list(set(motifsNL.keys()) | set(motifsAD.keys()) | set(motifsMCI.keys()))
		
		datatype = "Total" if usetotal else "Percent"
		filename = "result2/{}_ks-stats_size-{}_deg-{}.txt".format(corr+datatype,motifSize,degree)
		with open(filename,'w') as f:
			f.write("{0:>10}{1:>15}{2:>15}{3:>15}{4:>15}{5:>15}\n".format('ID','MCI','AD','NORM NL','NORM MCI','NORM AD'))
			for key in allMotifs:
				NLdata = motifsNL.get(key,np.zeros(88))
				MCIdata = motifsMCI.get(key,np.zeros(88))
				ADdata = motifsAD.get(key,np.zeros(88))
				KSstatistic, MCIpvalue = stats.ks_2samp(MCIdata,NLdata)
				KSstatistic, ADpvalue = stats.ks_2samp(ADdata,NLdata)
				k2,NLnorm = stats.normaltest(NLdata)
				k2,MCInorm = stats.normaltest(MCIdata)
				k2,ADnorm = stats.normaltest(ADdata)
				if MCIpvalue<0.01 or ADpvalue<0.01:
					line = "*{0:>9}{1:15.3}{2:15.3}{3:15.3}{4:15.3}{5:15.3}\n"
				else:
					line = "{0:>10}{1:15.3}{2:15.3}{3:15.3}{4:15.3}{5:15.3}\n"
				f.write(line.format(str(int(key)),MCIpvalue,ADpvalue,NLnorm,MCInorm,ADnorm))
Example #3
def check_normality():
    '''Check if the distribution is normal.'''
    # Generate and show a distribution
    numData = 100
    
    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)   
    
    data = stats.norm.rvs(myMean, mySD, size=numData)
    plt.hist(data)
    plt.show()

    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    # The scipy normaltest is based on the D'Agostino and Pearson test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    stats.normaltest(data)

    # Or you can check for normality with Kolmogorov-Smirnov test
    _,pVal = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
    if pVal > 0.05:
        print('Data are normally distributed')
    
    return pVal
def check_normality():
    '''Check if the distribution is normal.'''
    # Are the data normally distributed?
    numData = 100
    data = stats.norm.rvs(myMean, mySD, size=numData)
    stats.normaltest(data)
    _ = stats.probplot(data, plot=plt)
    show()
	def TestNormality(self,array):
		array = np.array(array)
		statistic, pvalue = stats.normaltest(array)
		print stats.normaltest(array)
		if pvalue < 0.2:
			print "Unlikely to be normally distributed"
			return False
		else:
			print "The dataset is likely to be normally distributed"
			return True
Example #6
def test_convert_uniform_column_to_normal(miner_df):
    logcf = lambda row, x: norm.logpdf(x[0], 0, 1)
    miner = MInER(miner_df, logcf, ['x_2'], n_models=2, use_mp=False)
    miner.init_models()
    miner.fit(20, 10)

    assert(not np.any(np.isnan(miner._df['x_2'].values)))
    assert(not np.any(np.isnan(miner._df['x_3'].values)))

    assert(normaltest(miner._df['x_2'])[1] > .05)
    assert(normaltest(miner._df['x_3'])[1] < .05)
Example #7
def determine_significance(mesa1, mesa2):
    """ Determines if two sets of values are statistically significant.

    In the best case we can establish normality and equal variance. If the data
    are normal and the variances are equal, the independent t-test is used; if
    the data are normal but the variances are unequal, Welch's t-test is used.
    http://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test
    http://en.wikipedia.org/wiki/Student%27s_t-test#Equal_or_unequal_sample_sizes.2C_unequal_variances

    When we cannot establish normality, the Mann-Whitney U test is preferred,
    but it is only effective when there are more than 20 samples.
    http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
    """
    # FIXME: Is it possible to determine these things with fewer samples?
    Distribution = Enum('Distribution', 'Normal, Non_normal Unknown')
    normality = Distribution.Normal
    try:
        k2, normal = stats.normaltest(mesa1)
        # FIXME: Unhardcode
        if (normal < NORMAL_CI):
            normality = Distribution.Non_normal

        k2, normal = stats.normaltest(mesa2)
        if (normal < NORMAL_CI):
            normality = Distribution.Non_normal
    except ValueError:
        normality = Distribution.Unknown

    equal_variance = is_equal_variance(mesa1, mesa2)

    if args.ttest:
        t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        return (p, normality == Distribution.Normal,
                "t-test" if equal_variance else "Welch's")
    elif args.mannwhitney:
        u, p = stats.mannwhitneyu(mesa1, mesa2)
        p *= 2  # We want a 2-tailed p-value
        return (p, len(mesa1) < 20 or len(mesa2) < 20, "Mann-Whitney")

    if normality == Distribution.Normal:
        error_handler='raise'
        if np.var(mesa1) == 0 and equal_variance:
            error_handler='ignore'
        with np.errstate(divide=error_handler):
            t, p = stats.ttest_ind(mesa1, mesa2, equal_var=equal_variance)
        return (p, False, "t-test" if equal_variance else "Welch's")
    else:
        u, p = stats.mannwhitneyu(mesa1, mesa2)
        p *= 2  # We want a 2-tailed p-value
        flawed = len(mesa1) < 20 or len(mesa2) < 20
        return (p, flawed, "Mann-Whitney")
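A self-contained sketch of the selection logic described in the docstring above, with the module-level pieces (NORMAL_CI, is_equal_variance, args) replaced by hypothetical stand-ins; it illustrates the idea rather than reproducing the original module:

from scipy import stats

NORMAL_CI = 0.05   # hypothetical stand-in for the module-level constant

def significance_sketch(a, b):
    # Choose between t-test, Welch's t-test, and Mann-Whitney U from normality
    # and equal-variance checks, mirroring the decision flow above.
    normal = True
    try:
        for sample in (a, b):
            _, p = stats.normaltest(sample)
            if p < NORMAL_CI:
                normal = False
    except ValueError:                          # too few observations for the omnibus test
        normal = False
    equal_var = stats.levene(a, b)[1] > 0.05    # stand-in for is_equal_variance()
    if normal:
        _, p = stats.ttest_ind(a, b, equal_var=equal_var)
        return p, "t-test" if equal_var else "Welch's"
    _, p = stats.mannwhitneyu(a, b, alternative='two-sided')
    return p, "Mann-Whitney"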
def check_normality():
    '''Check if the distribution is normal.'''
    
    # Set the parameters
    numData = 1000
    myMean = 0
    mySD = 3
    
    # To get reproducible values, I provide a seed value
    np.random.seed(1234)   
    
    # Generate and show random data
    data = stats.norm.rvs(myMean, mySD, size=numData)
    fewData = data[:100]
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    pVals = pd.Series()
    pFewVals = pd.Series()
    # The scipy normaltest is based on the D'Agostino and Pearson test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['Omnibus']    = stats.normaltest(data)
    _, pFewVals['Omnibus'] = stats.normaltest(fewData)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk']    = stats.shapiro(data)
    _, pFewVals['Shapiro-Wilk'] = stats.shapiro(fewData)
    
    # Or you can check for normality with Lilliefors-test
    _, pVals['Lilliefors']    = lillifors(data)
    _, pFewVals['Lilliefors'] = lillifors(fewData)
    
    # Alternatively with original Kolmogorov-Smirnov test
    _, pVals['Kolmogorov-Smirnov']    = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
    _, pFewVals['Kolmogorov-Smirnov'] = stats.kstest((fewData-np.mean(fewData))/np.std(fewData,ddof=1), 'norm')
    
    print('p-values for all {0} data points: ----------------'.format(len(data)))
    print(pVals)
    print('p-values for the first 100 data points: ----------------')
    print(pFewVals)
    
    if pVals['Omnibus'] > 0.05:
        print('Data are normally distributed')
    # --- >>> STOP stats <<< ---
    
    return pVals['Kolmogorov-Smirnov']
Example #9
def normalityTests(data):
    arr = N.zeros((data.shape[1]+1,data.shape[0]+1),N.object)
    mergeCTOA = concatenateRT(data.copy(), axis=0)
    mergeCTD  = concatenateRT(data.copy(), axis=1)
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            arr[j,i] = normaltest(data[i,j])
    for i, grp in enumerate(mergeCTOA):
        arr[-1,i] = normaltest(grp)
    for i, grp in enumerate(mergeCTD):
        arr[i,-1] = normaltest(grp)
    arr[-1,-1] = normaltest(N.hstack(data.flatten()))
    return arr
Example #10
def arima_handler(dta, start, end):
    #dta, x = data.dataHandler('./tmpfile00431',0.5)
    dta = pd.Series(dta)
    #dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700','2060'))
    dta.index = pd.Index(sm.tsa.datetools.dates_from_range(start,end))
    dta.plot(figsize=(12,8))

    fig = plt.figure(figsize=(12,8))
    ax1 = fig.add_subplot(211)
    fig = sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40, ax=ax1)
    ax2 = fig.add_subplot(212)
    fig = sm.graphics.tsa.plot_pacf(dta, lags=40, ax=ax2)

    arma_mod20 = sm.tsa.ARMA(dta, (2,0)).fit()
    #print(arma_mod20)

    arma_mod30 = sm.tsa.ARMA(dta, (3,0)).fit()
    #print(arma_mod30)

    print(arma_mod20.aic, arma_mod20.bic, arma_mod20.hqic)
    print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)

    if arma_mod20.aic < arma_mod30.aic:
        sm.stats.durbin_watson(arma_mod20.resid.values)
        fig = plt.figure(figsize=(12,8))
        ax = fig.add_subplot(111)
        ax = arma_mod20.resid.plot(ax=ax);

        resid = arma_mod20.resid
        stats.normaltest(resid)

        fig = plt.figure(figsize=(12,8))
        ax = fig.add_subplot(111)
        fig = qqplot(resid, line='q', ax=ax, fit=True)

        fig = plt.figure(figsize=(12,8))
        ax1 = fig.add_subplot(211)
        fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
        ax2 = fig.add_subplot(212)
        fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)

        r,q,p = sm.tsa.acf(resid.values.squeeze(), qstat=True)
        data = np.c_[range(1,41), r[1:], q, p]
        #table = pandas.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
        table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
        #print(table.set_index('lag'))

        predict_sunspots = arma_mod20.predict(str(int(start)+360),str(int(end)+5), dynamic=True)
        #print(predict_sunspots)
        return predict_sunspots
def calc_correlation(fname):    
    reader = csv.reader(open(fname,"r"),delimiter='\t')
    next(reader)
    x = list(reader)
    data = np.array(x).astype('float')
    
    normal_a = stats.normaltest(data[:,0])[1]
    normal_b = stats.normaltest(data[:,1])[1]
    
    if (normal_a >= 0.05) & (normal_b >= 0.05):
        # both series are normally distributed
        return stats.pearsonr(data[:,0], data[:,1])[0]
    else:
        # not normally distributed
        return stats.spearmanr(data[:,0], data[:,1])[0]
Example #12
	def inspect_output_by_filter(self,rez,dat,doplot=False,test=False,
	                             sig_clips=[5, 3, 2], sig_test=[False,False,True]):
		p = rez.values()[0][1]
		myoutput = rez.values()[0][0]
		new  = rez.values()[0][2]
		filt = rez.keys()[0]

		ret = {}
		ret.update({"all": self._extract_info(p,myoutput.sd_beta,myoutput)})
		err = dat[2]
		tmp = (dat[1] - self.modelfunc_small_te(p,dat[0]))/err
		dof = tmp.shape[0] -  myoutput.beta.shape[0]
		chisq = (tmp**2).sum()
		ret['all'].update({"ndata": dat[0].shape[0], \
		                    "chisq": chisq, "dof": dof, "p_chi": chisqprob(chisq,dof),
		                    "normalcy_prob": normaltest(tmp)[1]})

		for s in enumerate(sig_clips):
			if sig_test[s[0]] and not test:
				continue
			sig = s[1]
			# get the indices of those inside and out of the clip area
			tmpisig = (abs(tmp) < sig).nonzero()[0]
			tmpisige = (abs(tmp) > sig).nonzero()[0]
			frac_less_than_sig =  float(tmpisig.shape[0])/dat[0].shape[0]
			# print frac_less_than_sig
			if frac_less_than_sig < 1.0:
				out = self._filt_run([dat[0][tmpisig],dat[1][tmpisig],err[tmpisig]],\
 			 					   	  filt,do_sim=False,vplot=False)
				p        = out[1]
				myoutput = out[0]
				t = "-test" if sig_test[s[0]] else ""
					
				ret.update({"sig" + str(sig) + t: self._extract_info(p,myoutput.sd_beta,myoutput)})
				tmp = (dat[1][tmpisig] - self.modelfunc_small_te(p,dat[0][tmpisig]))/err[tmpisig]
				dof = tmp.shape[0] - myoutput.beta.shape[0]
				chisq = (tmp**2).sum()
				try:
					ntest =  normaltest(tmp)[1]
				except:
					ntest = 0.0
				ret["sig" + str(sig) + t].update({"ndata": dat[0][tmpisig].shape[0], \
				                    "chisq": chisq, "dof": dof, "p_chi": chisqprob(chisq,dof),
				                    "normalcy_prob": ntest, "frac_data_remaining": frac_less_than_sig })
				if doplot:
					plot(dat[0][tmpisige],dat[1][tmpisige],".")
			
		return ret
Example #13
 def test_maskedarray_input(self):
     # Add some masked values, test result doesn't change
     x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2
     xm = np.ma.array(np.r_[np.inf, x, 10], mask=np.r_[True, [False] * x.size, True])
     assert_allclose(mstats.normaltest(xm), stats.normaltest(x))
     assert_allclose(mstats.skewtest(xm), stats.skewtest(x))
     assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
Example #14
def replace_outs(df, numOuts, df_outs_ind):
    """
    This has been replaced with "replace_outs2"
    """
    df_out = df.copy()
    out_row_inds, out_col_inds = np.random.randint(0, len(df_outs_ind.index), numOuts), \
                                 np.random.randint(0, len(df_outs_ind.columns), numOuts)

    for row, col in zip(out_row_inds, out_col_inds):
        array_col = df.iloc[:, col].dropna()
        k2, p_val = stats.normaltest(array_col)

        if p_val > 0.05:  # p > 0.05: cannot reject normality, so treat the column as normal
            eps = 0.002 * np.random.random_sample(1) - 0.001  # epsilon is a random float in [-0.001, 0.001]
            # *** this threshold should be set in experiments
            df_out.iloc[row, col] = 3 * df.iloc[:, col].std() + eps
            # print("for row {0} and column {1} we have {2} and real val is {3}".format(row, col, df_out.iloc[row, col], df_in.iloc[row, col]))
            df_outs_ind.iloc[row, col] = 1

        else:
            q1, q3, iqr = tukey_vals(array_col)
            tukeyHL = [array_col.mean() + q3 + (3 * iqr), array_col.mean() - q1 - (3 * iqr)]
            df_out.iloc[row, col] = rnd.choice(tukeyHL)
            df_outs_ind.iloc[row, col] = 1

    return df_out, df_outs_ind
Example #15
def check_normality():
    '''Check if the distribution is normal.'''
    # Generate and show a distribution
    numData = 100
    
    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)   
    
    data = stats.norm.rvs(myMean, mySD, size=numData)
    plt.hist(data)
    plt.show()

    # --- >>> START stats <<< ---
    # Graphical test: if the data lie on a line, they are pretty much
    # normally distributed
    _ = stats.probplot(data, plot=plt)
    plt.show()

    pVals = pd.Series()
    # The scipy normaltest is based on the D'Agostino and Pearson test that
    # combines skew and kurtosis to produce an omnibus test of normality.
    _, pVals['omnibus'] = stats.normaltest(data)

    # Shapiro-Wilk test
    _, pVals['Shapiro-Wilk'] = stats.shapiro(data)
    
    # Or you can check for normality with Lilliefors-test
    ksStats, pVals['Lilliefors'] = kstest_normal(data)
    
    # Alternatively with original Kolmogorov-Smirnov test
    _, pVals['KS'] = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
    
    print(pVals)
    if pVals['omnibus'] > 0.05:
        print('Data are normally distributed')
Example #16
    def gStats(self, missingValue=0.0):
        """dict of {geneID: (min,max,mean,median,std,stderr,
        Shapiro-Wilk(w,p),normaltest_chisq (D'Agostino and Pearson),...}
        """
        import scipy as S
        import scipy.stats as SS

        rv = {}
        for k, v in self.items():
            # print k,v
            va = S.array(self.gValues(k, missingValue))

            try:
                normaltest = SS.normaltest(va)
            except:
                normaltest = None
            try:
                shapiro = SS.shapiro(va)
            except:
                shapiro = None

            try:
                rv[k] = (va.min(), va.max(), va.mean(), SS.median(va), SS.std(va), SS.stderr(va), normaltest, shapiro)
            except:
                print k, va
                raise
        return rv
def get_data(column, np_values, alpha):

    mvs = bayes_mvs(np_values, alpha)

    #report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", np_values.min()),
        present("Max", np_values.max()),
        present("Mid-Range", (np_values.max() - np_values.min())/2),
        present("Range", np_values.max() - np_values.min()),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode", stats.mode(np_values)[0][0]),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        present("Minhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value", normaltest(np_values)[1])
        ]
    return output
Example #18
def main(argv=sys.argv):
    route = Route()
    route.trace(argv[1])

    drtts = []

    # Normal Test #
    for ttl, hop in route.hops.items():
        drtts.append(hop.deltaRTTi)

    normal = stats.normaltest(drtts)
    print("** NormalTest **")
    print("k2: ", normal[0], " p-valor: ", normal[1])

    # Grubbs Test #
    zscores = calculateZScore(drtts)

    N = len(drtts)
    sampleMean = calculateAverage(drtts)
    standarDeviation = calculateStandardDeviation(drtts)

    # Statistic
    G = (max(drtts) - sampleMean) / standarDeviation

    criticalValue = tDistribution[N]

    print("** GrubbsTest **")
    print("N: ", N)
    print("G: ", G)
    print("CriticalValue: ", criticalValue)

    if criticalValue is not None and G > criticalValue:
        print("The DeltaRTT ", max(drtts), " is the transatlantic link")
def oneGroup():
    '''Test of mean value of a single set of data'''
    
    print('Single group of data =========================================')
    
    # First get the data
    data = np.array([5260, 5470, 5640, 6180, 6390, 6515, 6805, 7515, 7515, 8230, 8770], dtype=float)
    checkValue = 7725   # value to compare the data to
    
    # 4.1.1. Normality test
    # We don't need the first parameter, so we just assign the output to the dummy variable "_"
    (_, p) = stats.normaltest(data)
    if p > 0.05:
        print('Data are distributed normally, p = {0}'.format(p))
        
    # 4.1.2. Do the onesample t-test
    t, prob = stats.ttest_1samp(data, checkValue)
    if prob < 0.05:
        print('With the one-sample t-test, {0:4.2f} is significantly different from the mean (p={1:5.3f}).'.\
        format(checkValue, prob))
    else:
        print('No difference from reference value with onesample t-test.')
    
    # 4.1.3. This implementation of the Wilcoxon test checks for the "difference" of one vector of data from zero
    (_,p) = stats.wilcoxon(data-checkValue)
    if p < 0.05:
        print('With the Wilcoxon test, {0:4.2f} is significantly different from the mean (p={1:5.3f}).'.\
        format(checkValue, p))
    else:
        print('No difference from reference value with Wilcoxon rank sum test.')
Example #20
def omni_normtest(resids, axis=0):
    """
    Omnibus test for normality

    Parameters
    ----------
    resids : array-like
    axis : int, optional
        Default is 0

    Returns
    -------
    Chi^2 score, two-tail probability
    """
    # TODO: change to exception in summary branch and catch in summary()
    #   behavior changed between scipy 0.9 and 0.10
    resids = np.asarray(resids)
    n = resids.shape[axis]
    if n < 8:
        from warnings import warn
        warn("omni_normtest is not valid with less than 8 observations; %i "
             "samples were given." % int(n), ValueWarning)
        return np.nan, np.nan

    return stats.normaltest(resids, axis=axis)
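A small usage sketch for the function above (hypothetical data; numpy is assumed imported as np, as in the surrounding module):

rng = np.random.RandomState(0)
resid = rng.normal(size=200)
k2, pval = omni_normtest(resid)   # delegates to stats.normaltest for n >= 8
print(k2, pval)
# With fewer than 8 observations the function warns and returns (nan, nan).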
Example #21
def test_disk_distribution(diskclass, diskpar, n_expected):
    '''This is a separate test from test_disk_radius, because it's simpler
    to write if we don't have to worry about the inner hole.

    For the test itself: The results should be Poisson distributed (or, for large
    numbers this will be almost normal).
    That makes testing it a little awkward in a short run time, thus the limits are
    fairly loose.

    This test is run for several extended sources, incl. Gaussian. Strictly speaking
    it should fail for a Gaussian distribution, but if the sigma is large enough it
    will pass a loose test (and still fail if things go catastrophically wrong,
    e.g. some test circles are outside the source).
    '''

    s = diskclass(coords=SkyCoord(213., -10., unit=u.deg), **diskpar)
    photons = s.generate_photons(1e5)

    n = np.empty(20)
    for i in range(len(n)):
        circ = SkyCoord((213. +  np.random.uniform(-0.1, .1)) * u.degree,
                       (- 10. + np.random.uniform(-0.1, .1)) * u.degree)
        d = circ.separation(SkyCoord(photons['ra'], photons['dec'], unit='deg'))
        n[i] = (d < 5. * u.arcmin).sum()
    s, p = normaltest(n)
    # assert a p value here that is so small that it's never going to be hit
    # by chance.
    assert p > .05
    # better: Test number of expected photons matches
    # Allow large variation so that this is not triggered by chance
    assert np.isclose(n.mean(), n_expected, rtol=.2)
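A small illustration of the reasoning in the docstring above (not part of the test suite): counts drawn from a Poisson distribution with a large mean are close enough to normal that normaltest rarely rejects them.

import numpy as np
from scipy.stats import normaltest, poisson

counts = poisson.rvs(mu=5000, size=200, random_state=42)
stat, p = normaltest(counts)
print(p)   # typically well above 0.05 for a mean this large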
Example #22
    def gof(self, x, y, ye):
        '''
        Computes GoF test statistics and other diagnostical tests

        Returns:
        --------
        - GoF test: Chi^2, p-value, and ddof
        - Normality of residuals: K^2 and p-value
        '''
        res = {}
        resid = y - self(x)
        chisq = np.sum(((resid) / ye) ** 2)
        ddof = len(x) - len(list(filter(None, self.errors()))) # number of estimated parameters
        chisq_pvalue = chisqprob(chisq, ddof)
        gof = (chisq, chisq_pvalue, ddof)
        resid = normaltest(resid)
        ym = y.mean()
        SStot = np.sum((y - ym) ** 2)
        SSerr = np.sum((y - self(x)) ** 2)
        Rsquared = 1.0 - SSerr / SStot
# Besides being buggy, this test for homoscedasticity is supposed to work only
# for linear regressions, hence is not suited for our case, but I'll keep it
# here until I figure out an alternative. Remember to uncomment the import for
# OLS on top.
#        regresults = OLS(resid ** 2, np.c_[x, x**2]).fit()
#        LM =regresults.rsquared 
#        LM_pvalue = chisqprob(LM, len(x) - ddof)
#        white = (LM, LM_pvalue)
#        return gof, resid, white 
        return gof, resid, Rsquared
Example #23
def pairedt(pairs, numSamples):
    results = dict()
    t,v = pairs.items()
    diffs = [t[1][x] - v[1][x] for x in range(len(t[1]))]
    plotDiffs(diffs)
    sampleSize = int(len(diffs)/numSamples)
    indices = list(range(len(diffs)))
    random.shuffle(indices)
    mean_diffs = []
    i = 0
    for sample in range(numSamples):
        total_diff = 0
        for x in range(sampleSize):
            index = indices[i]
            total_diff += diffs[index]
            i+=1
        sample_avg = total_diff/float(sampleSize)
        mean_diffs.append(sample_avg)

    #normality check
    nt = stats.normaltest(mean_diffs)
    results['normal_p'] =  format(round(nt[1],4))

    #ttest
    t_prob = stats.ttest_1samp(mean_diffs, 0)
    results['ttest_t'] =  format(round(t_prob[0],4))
    results['ttest_p'] =  format(round(t_prob[1],4))

    #other stats
    results['avg_diff'] =  format(round(np.mean(diffs),4))
    results['numSamples'] = numSamples
    results['sampleSize'] = sampleSize
    results['num_pairs'] = len(pairs['tor'])

    return results
Example #24
def pearson_or_shapiro(data):
    """pearson_or_shapiro

    Use D'Agostino/Pearson if possible (n >= 20), else Shapiro-Wilk
    :param data:
    """
    return stats.normaltest(data) if len(data) >= 20 else stats.shapiro(data)
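A hypothetical usage sketch (assumes scipy.stats is imported as stats, as in the snippet above): with 20 or more points the D'Agostino-Pearson omnibus test runs, otherwise Shapiro-Wilk is used.

small = stats.norm.rvs(size=10, random_state=0)    # fewer than 20 -> stats.shapiro
large = stats.norm.rvs(size=50, random_state=0)    # 20 or more    -> stats.normaltest
print(pearson_or_shapiro(small))
print(pearson_or_shapiro(large))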
Example #25
def normality_check(feature_group,output_path):

	if feature_group.isEmpty():
		return False

	
	normal_flag = True
	sk_test = stats.skewtest(feature_group.get_scores())
	kr_test = stats.kurtosistest(feature_group.get_scores()) 
	normaltest = stats.normaltest(feature_group.get_scores())

	temp = '''

			Normality Test P-Values
		------------------------------------
		 Kurtosis   |  {0}
		 Skewness   |  {1}
		 NormalTest |  {2}


	'''

	result = temp.format(kr_test[1],sk_test[1],normaltest[1])

	print result


	tests = (sk_test[1] > 0.05 ,kr_test[1] > 0.05 ,normaltest[1] > 0.05)

	return tests
def testNormalByWord( word, version ):

	# Let's find the data - first the word
	for i in range( len(data) ):
		if data[i][0][1][0][1] == word:
			thisWord = i
		elif data[i][1][1][0][1] == word:
			thisWord = i

	# Now the version

	# Get the distribution
	if data[thisWord][0][0][1] == version:
		numbers = data[thisWord][0][1][1]
	elif data[thisWord][1][0][1] == version:
		numbers = data[thisWord][1][1][1]

	# Use scipy to check normality
	( chi, p ) = stats.normaltest( numbers )

	print "Chi-squared: " + str( chi )
	print "P-value: " + str( p )

	if p < 0.05:
		print "Not normal with alpha 0.05"
	else:
		print "Normal with alpha = 0.05"
Example #27
def omni_normtest(resids, axis=0):
    """
    Omnibus test for normality

    Parameters
    -----------
    resid : array-like
    axis : int, optional
        Default is 0

    Returns
    -------
    Chi^2 score, two-tail probability
    """
    #TODO: change to exception in summary branch and catch in summary()
    #behavior changed between scipy 0.9 and 0.10
    resids = np.asarray(resids)
    n = resids.shape[axis]
    if n < 8:
        return np.nan, np.nan
        return_shape = list(resids.shape)
        del return_shape[axis]
        return np.nan * np.zeros(return_shape), np.nan * np.zeros(return_shape)
        raise ValueError(
            "skewtest is not valid with less than 8 observations; %i samples"
            " were given." % int(n))

    return stats.normaltest(resids, axis=axis)
def fillMissing1(df, dataType):
    '''
    Args:
        df ( 2d array/ Dict):
                             eg : ('attribute1': [12, 24, 25] , 'attribute2': ['good', 'bad'])
        dataType (dict): Dictionary of attribute names of df as keys and values 0/1 
                            indicating categorical/continuous variable eg:  ('attribute1':1, 'attribute2': 0)
                            
    Returns:
        writes a file with missing values replaced.    
    
    
    '''
    dataLabels = list(df.columns.values)
    for eachlabel in dataLabels:
        if dataType[eachlabel] == 1:
            
            # check if data is normal
            _,pval = stats.normaltest(df[eachlabel])
            if(pval < 0.5):
                # if the data is not normal use median of the group to replace the missing
                df[eachlabel]= df.groupby('class')[eachlabel].transform(lambda x : x.fillna(x.median()))
            else:
                # if the data is normal use mean of the group to replace the missing
                df[eachlabel]= df.groupby('class')[eachlabel].transform(lambda x : x.fillna(x.mean()))
        else:
            #for categorical data use mode ( the most frequent value ) to replace the missing
            df[eachlabel]= df.groupby('class')[eachlabel].transform(lambda x : x.fillna(x.mode()[0]))
            
    df.to_csv(Globals.MISSING_REPLACED_FILE)
    return df, Globals.MISSING_REPLACED_FILE
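A compact, self-contained illustration of the same imputation rule on made-up data (no file output; the 0.5 cutoff mirrors the function above):

import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.RandomState(0)
toy = pd.DataFrame({'class': ['a'] * 15 + ['b'] * 15,
                    'attribute1': rng.normal(5, 1, 30)})
toy.loc[[2, 20], 'attribute1'] = np.nan            # introduce missing values
_, pval = stats.normaltest(toy['attribute1'].dropna())
fill = 'mean' if pval >= 0.5 else 'median'         # same cutoff as fillMissing1
toy['attribute1'] = toy.groupby('class')['attribute1'].transform(
    lambda x: x.fillna(getattr(x, fill)()))
print(toy)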
 def normal_test(features, **_):
     """
     
     :param features: 
     :param _: 
     :return: 
     """
     return stats.normaltest(features)
Example #30
 def test_normaltest(self):
     for n in self.get_n():
         if n > 8:
             x,y,xm,ym = self.generate_xy_sample(n)
             r = stats.normaltest(x)
             rm = stats.mstats.normaltest(xm)
             assert_almost_equal(r[0],rm[0],10)
             assert_almost_equal(r[1],rm[1],10)
Example #31
def perform_tests(weekends, weekdays):
    weekends_normaltest = stats.normaltest(weekends['comment_count'])
    weekdays_normaltest = stats.normaltest(weekdays['comment_count'])
    levene = stats.levene(weekends['comment_count'], weekdays['comment_count'])
    ttest = stats.ttest_ind(weekends['comment_count'], weekdays['comment_count'])
    return weekends_normaltest, weekdays_normaltest, levene, ttest
Example #32
 def omni(self):
     """
     Omnibus test for normality
     """
     return stats.normaltest(self.e)
sol.components_  # to get the answer, so the value of each weight in the components
sol.explained_variance_  # the explained variance
sol.explained_variance_ratio_  # the explained variance in %

# A bit of classical statistics:
from scipy import stats

X1 = np.random.random((20))  # let's create 2 variables with 20 observations each
X2 = np.random.random((20))

X1_stand = stats.zscore(X1)  # Another way to standardize
X2_stand = stats.zscore(X2)

stats.sem(X1)  # standard error of the mean
stats.normaltest(X2)  # test for normality

stats.chisquare(
    [12, 14, 16, 18, 10, 10]
)  # chisquare (each entry represents the category and how many times they appear)
stats.rankdata(X1)  # rank the data, useful for non parametric tests

stats.ttest_ind(X1, X2)  # independent t test
stats.ttest_rel(X1, X2)  # dependent t test

stats.mannwhitneyu(X1, X2)  # Mann Whitney U test (non parametric)
stats.wilcoxon(X1, X2)  # Wilcoxon test (non parametric)

stats.spearmanr(X1, X2)  # spearman correlation

stats.linregress(X1, X2)  # linear regression
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 21 20:36:32 2014

@author: JN
"""

import pandas as pd
import statsmodels.api as sm
import pylab as pl
import scipy.stats as stats

Raw_data = pd.read_csv('C:/Users/JN/Desktop/AnovaData.csv')

print Raw_data.describe()

Raw_data.hist()

pl.show()

print Raw_data.BPM

print stats.normaltest(Raw_data.BPM)

New_Column = ['RSP_Cycle', 'BPM']

New_Raw_Data = Raw_data[New_Column]

print New_Raw_Data

print stats.mstats.kruskalwallis(New_Raw_Data)
Example #35
import pandas as pd
import powerlaw
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np

data = pd.read_csv("data.csv", encoding="gbk")

col1 = data[u'2013年GDP(亿元)']
col2 = data[u'较2012年实际增长率'].dropna().map(lambda x: float(x[:-1]))
sc = (col2 - np.mean(col2)) / np.std(col2)

# powerlaw test

fit = powerlaw.Fit(col1)
R, p = fit.distribution_compare('power_law', 'lognormal')
print 'R', R, 'p', p
print "power_law fit is worse than lognormal!"

fig4 = fit.plot_ccdf(linewidth=2)
fit.power_law.plot_ccdf(ax=fig4, color='r', linestyle='--')
fit.lognormal.plot_ccdf(ax=fig4, color='g', linestyle='--')
plt.show()

# norm test

des = stats.describe(col2)

omnibus, p_n = stats.normaltest(col2)

print 'p', p_n, 'it is not a normal distribution however'

plt.hist(col2)
plt.show()
Example #36
import os
import sys
from scipy import stats


def isInt(s):
    try:
        int(s)
        return True
    except ValueError:
        return False


if __name__ == "__main__":

    argFiles = []
    nameFiles = []
    values = []
    result = 0
    results = []
    nameFiles = os.listdir(sys.argv[1])
    nameFiles.remove("source.py")
    with open(os.path.join(sys.argv[1], nameFiles[0]), 'r') as f:
        arg = f.readline()

    values = arg.split(' ')
    if (len(values) >= 20):
        for i in range(len(values)):
            results.append(float(values[i]))

        statist, hi_2 = stats.normaltest(results)  # hi_2 is the test's p-value
        with open(os.path.join(sys.argv[1], str(1) + "output.txt"), 'w') as f:
            f.write(str(hi_2))
    else:
        exit(-1)
Example #37
def dAgostinaTest(data):
    print(len(data))
    stat, p = normaltest(data)
    print(p)
                              'bias_r': right_fit['bias'],
                              'lapselow_l': left_fit['lapselow'],
                              'lapselow_r': right_fit['lapselow'],
                              'lapsehigh_l': left_fit['lapsehigh'],
                              'lapsehigh_r': right_fit['lapsehigh'],
                              'nickname': nickname, 'lab': lab})
    biased_fits = biased_fits.append(fits, sort=False)

# %% Statistics
    
stats_tests = pd.DataFrame(columns=['variable', 'test_type', 'p_value'])
posthoc_tests = {}

for i, var in enumerate(['threshold_l', 'threshold_r', 'lapselow_l', 'lapselow_r', 'lapsehigh_l',
                         'lapsehigh_r', 'bias_l', 'bias_r']):
    _, normal = stats.normaltest(biased_fits[var])

    if normal < 0.05:
        test_type = 'kruskal'
        test = stats.kruskal(*[group[var].values
                               for name, group in biased_fits.groupby('lab')])
        if test[1] < 0.05:  # Proceed to posthocs
            posthoc = sp.posthoc_dunn(biased_fits, val_col=var, group_col='lab')
        else:
            posthoc = np.nan
    else:
        test_type = 'anova'
        test = stats.f_oneway(*[group[var].values
                                for name, group in biased_fits.groupby('lab')])
        if test[1] < 0.05:
            posthoc = sp.posthoc_tukey(biased_fits, val_col=var, group_col='lab')
Example #39
def make2ds(args):
    ifiles = args.ifiles
    if len(args.figure_keywords) > 0:
        plt.setp(fig, **args.figure_keywords)
    if len(args.axes_keywords) > 0:
        plt.setp(ax, **args.axes_keywords)
    nborders = len(ax.collections)
    for fi, ifile in enumerate(ifiles):
        variables = args.variables
        if variables is None:
            variables = [
                key for key, var in ifile.variables.items() if var.ndim == 2
            ]
        if len(variables) == 0:
            raise ValueError(
                'Unable to heuristically determine plottable variables; use -v to specify variables for plotting'
            )
        for varkey in variables:
            var = ifile.variables[varkey]
            vals = var[:]
            if args.squeeze:
                vals = vals.squeeze()

            if args.normalize is None:
                from scipy.stats import normaltest
                vmin, vmax = vals.min(), vals.max()
                if normaltest(vals.ravel())[1] < 0.05:
                    cvals = np.ma.compressed(vals)
                    boundaries = np.percentile(cvals, np.arange(0, 110, 10))
                    warn(
                        'Autoselect deciles colormap of %s; override with --norm'
                        % varkey)
                else:
                    boundaries = np.linspace(vmin, vmax, num=11)
                    warn(
                        'Autoselect linear colormap of %s; override with --norm'
                        % varkey)
                if (boundaries.max() /
                        np.ma.masked_values(boundaries, 0).min()) > 10000:
                    formatter = LogFormatter(labelOnlyBase=False)
                else:
                    formatter = None
                norm = BoundaryNorm(boundaries, ncolors=256)
            else:
                norm = eval(args.normalize)
                formatter = None
            if not args.colorbarformatter is None:
                try:
                    formatter = eval(args.colorbarformatter)
                except:
                    formatter = args.colorbarformatter

            vmin, vmax = vals.min(), vals.max()
            if not norm.vmin is None:
                vmin = norm.vmin
            if not norm.vmax is None:
                vmax = norm.vmax

            varunit = getattr(var, 'units', 'unknown').strip()
            vardims = [
                dk for dk, dv in zip(var.dimensions, var.shape) if dv != 1
            ]
            print(varkey, sep='')
            del ax.collections[nborders:]
            if args.swapaxes:
                patches = ax.pcolor(vals.T, norm=norm)
                ax.set_xlabel(vardims[0])
                ax.set_ylabel(vardims[1])
            else:
                patches = ax.pcolor(vals, norm=norm)
                ax.set_xlabel(vardims[1])
                ax.set_ylabel(vardims[0])

            height = vals.shape[0]
            width = vals.shape[1]
            if width >= height:
                orientation = 'horizontal'
            else:
                orientation = 'vertical'
            try:
                cax = cbar.ax
                cax.cla()
            except:
                cax = None
            if vals.max() > vmax and vals.min() < vmin:
                extend = 'both'
            elif vals.max() > vmax:
                extend = 'max'
            elif vals.min() < vmin:
                extend = 'min'
            else:
                extend = 'neither'
            cbar = fig.colorbar(patches,
                                orientation=orientation,
                                cax=cax,
                                extend=extend,
                                format=formatter)
            del cbar.ax.texts[:]
            cbar.set_label(varkey + ' (' + varunit + '; min=%.3g; max=%.3g)' %
                           (var[:].min(), var[:].max()))
            #           if orientation == 'vertical':
            #               cbar.ax.text(.5, 1.05, '%.3g' % var[:].max(), horizontalalignment = 'center', verticalalignment = 'bottom')
            #                cbar.ax.text(.5, -.06, '%.3g ' % var[:].min(), horizontalalignment = 'center', verticalalignment = 'top')
            #            else:
            #                cbar.ax.text(1.05, .5, ' %.3g' % var[:].max(), verticalalignment = 'center', horizontalalignment = 'left')
            #                cbar.ax.text(-.06, .5, '%.3g ' % var[:].min(), verticalalignment = 'center', horizontalalignment = 'right')
            #cbar.update_ticks()
            fmt = 'png'
            outpath = args.outpath
            if len(ifiles) > 1:
                lstr = str(fi).rjust(len(str(len(ifiles))), '0')
            else:
                lstr = ''

            figpath = os.path.join(outpath + varkey + lstr + '.' + fmt)
            if args.interactive:
                csl = PNCConsole(locals=globals())
                csl.interact()

            fig.savefig(figpath)
            if args.verbose > 0: print('Saved fig', figpath)
Example #40
import pandas as pd
from statsmodels.formula.api import ols 
from statsmodels.stats.anova import anova_lm
import scipy.stats as ss

############
# one way
############

# prepare
df = pd.read_csv('oneway.csv')
a = df[df['algo'] == 'a']['ratio']
b = df[df['algo'] == 'b']['ratio']

# 1/4: normality
ss.normaltest(a); ss.normaltest(b)

# 2/4: homogeneity of variance
args=[a,b]
ss.levene(*args)

# F test
ss.f_oneway(*args)

# F test too
model = ols('ratio ~ algo', df).fit()
anovat = anova_lm(model)
def calc_ttest_dict(a, b, paired=False):
    '''
    Calculate the comparison between the two sets of data
    
    Importantly, although the stars will be the same, this code
    accurately applies either a Student's t, Welch's t, or Mann Whitney U
    test
    '''
    # Import what you need
    import numpy as np
    from scipy.stats import ttest_ind, ttest_rel, bartlett, mannwhitneyu, normaltest, wilcoxon
    
    stats_dict = {}
    
    # Mask out the not a numbers
    a = [ x for x in a if not np.isnan(x) ]
    b = [ x for x in b if not np.isnan(x) ]

    # Save number of people in each group
    stats_dict['n'] = (len(a), len(b))
    
    # Conduct test for equal variance
    stats_dict['eqvar'] = bartlett(a, b)
    
    # Conduct test for normality
    stats_dict['normal'] = normaltest(np.hstack([a, b]))
    
    # When you test for equal means (ttest) you have different options
    # depending on if you have equal variances or not. You can also
    # run the non-parametric Mann Whitney U test
    # Alternatively these data may be paired so there's also the
    # paired t-test and the Wilcoxon signed rank test
    
    # All five will be entered in the stats_dict
    
    # Conduct Welch's t-test (unequal variances)
    stats_dict['ttest_uneqvar'] = ttest_ind(a, b, equal_var = False)

    # Conduct standard student's t-test (equal variances)
    stats_dict['ttest_eqvar'] = ttest_ind(a, b, equal_var = True)

    # Conduct mann whitney U test (non-parametric test of medians)
    stats_dict['mannwhitneyu'] = mannwhitneyu(a, b)
    
    if paired:
        # Conduct the paired student's t-test
        stats_dict['ttest_paired'] = ttest_rel(a, b)
    
        # Conduct Wilcoxon signed rank test (non-parametric *paired* test of medians)
        stats_dict['wilcoxon'] = wilcoxon(a, b)

    # Save in the stats dict the various other measures you might
    # want to report
    stats_dict['medians'] = [np.percentile(a, 50), np.percentile(b, 50)]
    stats_dict['percentile25'] = [np.percentile(a, 25), np.percentile(b, 25)]
    stats_dict['percentile75'] = [np.percentile(a, 75), np.percentile(b, 75)]
    stats_dict['means'] = [np.mean(a), np.mean(b)]
    stats_dict['stds'] = [np.std(a), np.std(b)]
    stats_dict['dfs'] = [(np.float(stats_dict['n'][0])-1), (np.float(stats_dict['n'][1])-1)]
    stats_dict['pooled_std'] = np.sqrt( (np.float(stats_dict['dfs'][0])*(np.float(stats_dict['stds'][0])**2)
                                     + np.float(stats_dict['dfs'][1])*(np.float(stats_dict['stds'][1])**2))
                                     / (np.float(stats_dict['dfs'][0]) + np.float(stats_dict['dfs'][1])))
    
    if paired:
        stats_dict['mean_difference'] = np.mean(np.array(b)-np.array(a))
        stats_dict['std_difference'] = np.std(np.array(b)-np.array(a))
        stats_dict['median_difference'] = np.percentile(np.array(b)-np.array(a), 50) 
        stats_dict['percentile25_difference'] = np.percentile(np.array(b)-np.array(a), 25) 
        stats_dict['percentile75_difference'] = np.percentile(np.array(b)-np.array(a), 75)
        stats_dict['cohens_d'] = np.float(stats_dict['mean_difference']) / np.float(stats_dict['pooled_std'])
        stats_dict['cohens_d_paired'] = np.float(stats_dict['mean_difference']) / np.float(stats_dict['std_difference'])

    return stats_dict
Example #42
    print('Probably the same distribution')
else:
    print('Probably different distributions')

stat8, p8 = ttest_ind(Parents_2019["soc_omg_1"], Parents_2019["soc_omg_2"])
print('stat=%.3f, p=%.3f' % (stat8, p8))
if p8 > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')

#Test all the assumptions

#test whether normal distributions

stat, p = normaltest(Moved_out_2020["attitu_2"])
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')

stat3, p3 = normaltest(Parents_2020["attitu_2"])
print('stat=%.3f, p=%.3f' % (stat3, p3))
if p3 > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')

stat9, p9 = normaltest(Moved_out_2019["attitu_2"])
print('stat=%.3f, p=%.3f' % (stat9, p9))
Example #43
from termcolor import colored, cprint
import matplotlib.pyplot as plt

import numpy as np
from scipy import stats
# Generating a normal distribution sample with 20 elements
sample = np.random.randn(20)
print(colored(('sample', sample), 'green'))
# normaltest tests the null hypothesis that the sample comes from a normal distribution.
out = stats.normaltest(sample)
print('normaltest output')
print('Statistic (k^2) = ' + str(out[0]))
print('P-value = ' + str(out[1]))
# kstest is the Kolmogorov-Smirnov test for goodness of fit.
# Here its sample is being tested against the normal distribution.
# D is the KS statistic and the closer it is to 0 the better.
out = stats.kstest(sample, 'norm')
print('\nkstest output for the Normal distribution')
print('D = ' + str(out[0]))
print('P-value = ' + str(out[1]))
# Similarly, this can be easily tested against other distributions,
# like the Wald distribution.
out = stats.kstest(sample, 'wald')
print('\nkstest output for the Wald distribution')
print('D = ' + str(out[0]))
print('P-value = ' + str(out[1]))
Example #44
# resample data to create time bars and compare normality tests with tick data
def get_bar_stats(agg_trades):
    vwap = agg_trades.apply(
        lambda x: np.average(x.price, weights=x.shares)).to_frame('vwap')
    ohlc = agg_trades.price.ohlc()
    vol = agg_trades.shares.sum().to_frame('vol')
    txn = agg_trades.shares.size().to_frame('txn')
    return pd.concat([ohlc, vwap, vol, txn], axis=1)


resampled = trades.resample('1Min')
time_bars = get_bar_stats(resampled)

# normality test for tick rets
normaltest(tick_bars.price.pct_change().dropna())
# compare to min rets
normaltest(time_bars.vwap.pct_change().dropna())
price_volume(time_bars)

# time bars don't always account for fragmentation of orders. Volume bars offer an alternative perspective
with pd.HDFStore(order_book_store) as store:
    trades = store['{}/trades'.format(stock)]

trades.price = trades.price.mul(1e-4)
trades = trades[trades.cross == 0]
trades = trades.between_time(market_open, market_close).drop('cross', axis=1)
trades.info()
trades_per_min = trades.shares.sum() / (60 * 7.5)  # min per trading day
trades['cumul_vol'] = trades.shares.cumsum()
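Continuing the comment above, a minimal sketch of one way to build such volume bars from the cumulative-volume column just computed, reusing get_bar_stats; the bar size is a hypothetical choice, not a value from the original source:

bar_size = 100_000                                   # hypothetical shares per bar
bar_id = trades.cumul_vol // bar_size                # label each trade with its bar
volume_bars = get_bar_stats(trades.groupby(bar_id))  # same OHLC/vwap/vol/txn summary
normaltest(volume_bars.vwap.pct_change().dropna())   # compare return normality as above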
def q4():
    # Return the result of question 4 here.
    log_weight = np.log(amostra_weight)
    statistic, p_value = sct.normaltest(log_weight)
    return bool(p_value > ALPHA)
Example #46
import numpy as np
from scipy import stats
pts = 1000
np.random.seed(28041990)
a = np.random.normal(0, 1, size=pts)
b = np.random.normal(2, 1, size=pts)
x = np.concatenate((a, b))
k2, p = stats.normaltest(x)
alpha = 0.05
print("p = {:g}".format(p))

if p < alpha:  # null hypothesis: x comes from a normal distribution
    print("The null hypothesis can be rejected")
else:
    print("The null hypothesis cannot be rejected")
def q3():
    # Return the result of question 3 here.
    statistic, p_value = sct.normaltest(amostra_weight)
    return bool(p_value > ALPHA)
def get_all_subsets(X, y, mallows=True):
    combs = []
    results = []
    for i in range(1, len(X) + 1):
        els = [list(x) for x in itertools.combinations(X, i)]
        combs.extend(els)
    for comb in combs:
        model = sm.OLS(y, sm.add_constant(X[list(comb)]))
        result = model.fit()
        results.append({
            "model": model,
            "result": result,
            "num_vars": len(comb),
            "vars": X[list(comb)]
        })

    full_mse_res = sm.OLS(y, sm.add_constant(X)).fit().mse_resid
    acceptable_models = {}

    for model in results:
        not_acceptable = False
        for pvalue in model["result"].pvalues:
            if pvalue > 0.05:
                not_acceptable = True
                break
        if not_acceptable:
            continue

        mallows_objective = model["num_vars"]
        curr_mallows = mallow_cp(model, full_mse_res, X.shape[0])
        curr_min = None
        if model["num_vars"] in acceptable_models and len(
                acceptable_models[model["num_vars"]]) > 9:
            curr_min = acceptable_models[model["num_vars"]][-1]["mallows"]

        model["mallows"] = curr_mallows
        model["mallows_diff"] = abs(curr_mallows - mallows_objective)
        if not curr_min is None:
            if model["mallows_diff"] < abs(curr_min - mallows_objective):
                del acceptable_models[model["num_vars"]][-1]
                acceptable_models[model["num_vars"]].append(model)
            else:
                continue
        else:
            if not model["num_vars"] in acceptable_models:
                acceptable_models[model["num_vars"]] = []
            acceptable_models[model["num_vars"]].append(model)

        acceptable_models[model["num_vars"]] = \
            sorted(acceptable_models[model["num_vars"]], key=lambda k: k['mallows_diff'])

    curr_best = None
    for num_vars in acceptable_models:
        for model in acceptable_models[num_vars]:

            if curr_best is None:
                curr_best = model
            else:
                if curr_best["mallows_diff"] > model["mallows_diff"]:
                    curr_best = model

    print(curr_best["result"].summary())
    std = curr_best["model"].exog.std(0)
    std[0] = 1
    tt = curr_best["result"].t_test(np.diag(std))
    print(tt.summary())
    tt.summary_frame()

    fig = plt.figure(figsize=(12, 30))
    sm.graphics.plot_partregress_grid(curr_best["result"])
    plt.savefig("resid_ny.png")
    #plt.show()
    if False:
        fig, ax = plt.subplots(2,
                               2,
                               sharex='col',
                               sharey='row',
                               figsize=(12, 10))
        params = list(dict(curr_best["result"].params).keys())

        n1 = math.floor(len(params) / 2)
        n2 = math.floor(len(params) % 2)

        for i in range(2):
            for j in range(2):
                try:
                    ax[i,
                       j].scatter(curr_best["result"].model.exog[:, i * 2 + j],
                                  curr_best["result"].resid)
                    ax[i, j].set_xlabel(params[i * 2 + j])
                    ax[i, j].set_ylabel("resid")
                    ax[i, j].axhline(y=0, color="black")
                except Exception:
                    break
        plt.savefig("resid_sf.png")
        plt.show()
        #fig = plt.figure(figsize=(12, 10))
        #fig = sm.graphics.plot_regress_exog(curr_best["result"], "per_white", fig=fig)
        fig = sm.graphics.plot_partregress_grid(curr_best["result"], fig=fig)
        fig.gca().set_title("")
        plt.suptitle("")
        plt.savefig("resid_ny.png")
        #plt.show()

    stat, p = shapiro(curr_best["result"].resid)
    print("Shapiro")
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    stat, p = normaltest(curr_best["result"].resid)
    print("D’Agostino’s")
    print('Statistics=%.3f, p=%.3f' % (stat, p))

    stat, p = kstest(curr_best["result"].resid, 'norm')
    print("Kolmogorov-Smirnov")
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    #plot_mallows(acceptable_models)
    return curr_best["result"].rsquared_adj

    for var in curr_best["vars"]:
        coef = curr_best["result"].params[var]
        pos_neg = "pos"
        if coef < 0:
            pos_neg = "neg"
        try:
            dct[var + "_" + pos_neg] += 1
        except Exception:
            dct[var + "_" + pos_neg] = 1
Example #49
    #normalcy test
    elif op == "jarqBera":
        jb, jbpv, skew, kurtosis = jarque_bera(data)
        printStat(jb, jbpv, "probably gaussian", "probably not gaussian")
        print(f'skew: {skew}')
        print(f'kurtosis: {kurtosis}')

    #shapiro wilks normalcy test
    elif op == "shapWilk":
        stat, pvalue = shapiro(data)
        printStat(stat, pvalue, "probably gaussian", "probably not gaussian")

    #D’Agostino’s K square  normalcy test
    elif op == "dagast":
        stat, pvalue = normaltest(data)
        printStat(stat, pvalue, "probably gaussian", "probably not gaussian")

    #anderson darling normalcy test
    elif op == "andar":
        result = anderson(data)
        print("stat {:.3f}".format(result.statistic))
        for i in range(len(result.critical_values)):
            sl, cv = result.significance_level[i], result.critical_values[i]
            if int(sl) == 5:
                if result.statistic < cv:
                    print("probably gaussian at the {:.1f} level".format(sl))
                else:
                    print(
                        "probably not gaussian at the {:.1f} level".format(sl))
    #histogram
Example #50
#
# * Does this result make sense?

# <font size = '5' color = 'green'>Este resultado é qualitativamente igual ao resultado fornecido pelo teste de Shapiro-Wilk, diferindo apenas quantitativamente em relação ao valor p, portanto, faz sentido. </font>

# ## Question 3
#
# Now consider a sample of size 3000 from the `weight` column obtained with the `get_sample()` function. Run the D'Agostino-Pearson normality test using the `scipy.stats.normaltest()` function. Can we state that the weights come from a normal distribution at the 5% significance level? Answer with a boolean (`True` or `False`).

# In[14]:

sub_weight = get_sample(df, 'weight', n=3000)

# In[15]:

ap_t, ap_pvalue = sct.normaltest(sub_weight)
ap_pvalue

# In[16]:


def q3():
    # Return the result of question 3 here.
    return (ap_pvalue > 0.05)


q3()

# In[17]:
# %% Statistics

stats_tests = pd.DataFrame(columns=['variable', 'test_type', 'p_value'])
posthoc_tests = {}

for i, var in enumerate([
        'perf_easy', 'threshold_l', 'threshold_r', 'threshold_n', 'bias_l',
        'bias_r', 'bias_n'
]):

    # Remove any animals with NaNs
    test_fits = biased_fits[biased_fits[var].notnull()]

    # Test for normality
    _, normal = stats.normaltest(test_fits[var])

    if normal < 0.05:
        test_type = 'kruskal'
        test = stats.kruskal(
            *[group[var].values for name, group in test_fits.groupby('lab')])
        if test[1] < 0.05:  # Proceed to posthocs
            posthoc = sp.posthoc_dunn(test_fits, val_col=var, group_col='lab')
        else:
            posthoc = np.nan
    else:
        test_type = 'anova'
        test = stats.f_oneway(
            *[group[var].values for name, group in test_fits.groupby('lab')])
        if test[1] < 0.05:
            posthoc = sp.posthoc_tukey(test_fits, val_col=var, group_col='lab')
Example #52
    def get_corr_matrix_data(self,
                             options,
                             included_vars=None,
                             extra_vars=None):
        if included_vars is None:
            included_vars = list(self.data)
        if extra_vars is not None:
            included_vars = included_vars + extra_vars
        else:
            extra_vars = []

        categories = [
            c for c in list(self.data)
            if 'date' not in c.lower() and c in included_vars
        ]
        categories.extend(extra_vars)
        categories = list(set(categories))
        categories.sort()
        var_count = len(categories)
        categories_for_label = [
            category.replace("Control Point", "CP") for category in categories
        ]
        categories_for_label = [
            category.replace("control point", "CP")
            for category in categories_for_label
        ]
        categories_for_label = [
            category.replace("Distance", "Dist")
            for category in categories_for_label
        ]

        for i, category in enumerate(categories_for_label):
            if category.startswith('DVH'):
                categories_for_label[i] = category.split("DVH Endpoint: ")[1]

        x_factors = categories_for_label
        y_factors = categories_for_label[::-1]

        s_keys = [
            'x', 'y', 'x_name', 'y_name', 'color', 'alpha', 'r', 'p', 'size',
            'x_normality', 'y_normality', 'group'
        ]
        source_data = {
            'corr': {sk: []
                     for sk in s_keys},
            'line': {
                'x': [0.5, var_count - 0.5],
                'y': [var_count - 0.5, 0.5]
            }
        }

        min_size, max_size = 3, 20
        removed_mrns = set()
        for x in range(var_count):
            for y in range(var_count):
                if (x > y and self.group == 1) or (x < y and self.group == 2):
                    if categories[x] not in extra_vars and categories[
                            y] not in extra_vars:

                        bad_indices = [
                            i for i, v in enumerate(self.data[categories[x]]
                                                    ['values'])
                            if type(v) in [str, type(None)]
                        ]
                        bad_indices.extend([
                            i for i, v in enumerate(self.data[categories[y]]
                                                    ['values'])
                            if type(v) in [str, type(None)]
                        ])
                        bad_indices = list(set(bad_indices))
                        removed_mrns = removed_mrns.union(
                            set(self.mrns[i] for i in bad_indices))

                        x_data = [
                            v for i, v in enumerate(self.data[categories[x]]
                                                    ['values'])
                            if i not in bad_indices
                        ]
                        y_data = [
                            v for i, v in enumerate(self.data[categories[y]]
                                                    ['values'])
                            if i not in bad_indices
                        ]

                        if x_data and len(x_data) == len(y_data):
                            r, p_value = scipy_stats.pearsonr(x_data, y_data)
                        else:
                            r, p_value = 0, 0
                        if np.isnan(r):
                            r = 0

                        sign = ['neg', 'pos'][r >= 0]
                        color = getattr(
                            options, 'CORRELATION_%s_COLOR_%s' %
                            (sign.upper(), self.group))
                        source_data['corr']['color'].append(color)
                        source_data['corr']['r'].append(r)
                        source_data['corr']['p'].append(p_value)
                        source_data['corr']['alpha'].append(abs(r))
                        source_data['corr']['size'].append((
                            (max_size - min_size) * abs(r)) + min_size)
                        source_data['corr']['x'].append(
                            x + 0.5)  # 0.5 offset due to bokeh 0.12.9 bug
                        source_data['corr']['y'].append(
                            var_count - y -
                            0.5)  # 0.5 offset due to bokeh 0.12.9 bug
                        source_data['corr']['x_name'].append(
                            categories_for_label[x])
                        source_data['corr']['y_name'].append(
                            categories_for_label[y])
                        source_data['corr']['group'].append(self.group)

                        # normaltest raises ValueError when fewer than 8 valid samples remain
                        try:
                            x_norm, x_p = scipy_stats.normaltest(x_data)
                        except ValueError:
                            x_p = 'N/A'
                        try:
                            y_norm, y_p = scipy_stats.normaltest(y_data)
                        except ValueError:
                            y_p = 'N/A'

                        source_data['corr']['x_normality'].append(x_p)
                        source_data['corr']['y_normality'].append(y_p)

        return {
            'source_data': source_data,
            'x_factors': x_factors,
            'y_factors': y_factors
        }, removed_mrns
Example #53
0
def analyze(initDate, finalDate, data_type="daily"):

    exchange = 'CCCAGG'
    completeOnly = True
    exWeekends = False

    # aggregated hourly price for Bitcoin (2000 row limit - use a loop)
    symbol = 'BTCUSD'
    BTCUSD = gd.getCrypto(symbol,
                          initDate,
                          finalDate,
                          exchange,
                          completeOnly,
                          exWeekends,
                          data_type=data_type)

    symbol = 'LTCBTC'
    LTCBTC = gd.getCrypto(symbol,
                          initDate,
                          finalDate,
                          exchange,
                          completeOnly,
                          exWeekends,
                          data_type=data_type)

    symbol = 'ETHBTC'
    ETHBTC = gd.getCrypto(symbol,
                          initDate,
                          finalDate,
                          exchange,
                          completeOnly,
                          exWeekends,
                          data_type=data_type)

    # store to disk
    BTCUSD.to_csv('./csv/BTCUSD.csv')
    LTCBTC.to_csv('./csv/LTCBTC.csv')
    ETHBTC.to_csv('./csv/ETHBTC.csv')

    # convert to pctdiffs
    dBTC = (BTCUSD.diff() / BTCUSD.shift()).dropna()
    dLTC = (LTCBTC.diff() / LTCBTC.shift()).dropna()
    dETH = (ETHBTC.diff() / ETHBTC.shift()).dropna()

    agg = pd.DataFrame([dBTC.Close, dLTC.Close, dETH.Close]).transpose()
    agg.columns = ['dBTC', 'dLTC', 'dETH']

    # check correlations
    cAgg = np.corrcoef(agg.dropna(), rowvar=False)
    vAgg = np.cov(agg.dropna(), rowvar=False)

    # cut bottom 1% and top 1% of data points - prune outliers
    def middle(series, percentile):
        temp = series.sort_values(inplace=False)
        pctLen = int(round(len(temp) * percentile / 2, 0))
        temp = temp[pctLen:len(temp) - pctLen].sort_index()
        return temp

    # test for stationarity
    percentile = .02
    spreadLTC = (dLTC / dBTC).Close.dropna()
    spreadETH = (dETH / dBTC).Close.dropna()

    # sBTC = adfuller(dBTC.Close)
    # sLTCBTC = adfuller(spreadLTC)
    # sIOTBTC = adfuller(spreadIOT)
    # sETHBTC = adfuller(spreadETH)
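
    # A live sketch of the commented-out stationarity check above; assumes
    # statsmodels is installed (adfuller returns the ADF statistic and p-value
    # as its first two elements).
    from statsmodels.tsa.stattools import adfuller
    adf_stat, adf_p = adfuller(spreadLTC)[:2]
    print('ADF statistic: {:.3f}, p-value: {:.3f}'.format(adf_stat, adf_p))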

    # if stationary and correlated, check for normal distribution
    k2, p = stats.normaltest(spreadLTC)  # p <= .05

    mLTC = middle((dLTC / dBTC).Close.dropna(), percentile)
    mETH = middle((dETH / dBTC).Close.dropna(), percentile)

    sdLTC = np.std(mLTC)
    mnLTC = np.mean(mLTC)
    assdLTC = spreadLTC / sdLTC  # not using middles

    # display histogram
    spreadLTC.hist(range=[-20, 20], bins=100)
    assdLTC.hist(range=[-5, 5], bins=100)

    # sanity check
    prunedPct = len(assdLTC[np.abs(assdLTC) >= 3]) / len(assdLTC) + percentile

    # slice into sd levels and check autocorrelations
    def checkAutocorrelations(series, sdbottom, sdtop, lags):
        glomSeries = pd.DataFrame(series)
        for lag in range(1, lags + 1):
            glomSeries = glomSeries.join(pd.DataFrame(series.shift(lag)),
                                         rsuffix=str(lag),
                                         how='outer')
        subSeries = glomSeries[(np.abs(glomSeries.Close) >= sdbottom)
                               & (np.abs(glomSeries.Close) < sdtop)].dropna()
        corrs = np.corrcoef(subSeries, rowvar=False)

        mainCol = subSeries.Close

        winProps = np.empty(0)
        for col in subSeries.columns:
            winners = subSeries[(((mainCol > 0) & (mainCol > subSeries[col]))
                                 | ((mainCol < 0) &
                                    (mainCol < subSeries[col])))]
            winProp = len(winners) / len(subSeries)
            winProps = np.append(winProps, winProp)

        return corrs[0], winProps

    # check autocorrelation
    priorSD = 0
    for thisSD in np.arange(0.25, 5.25, 0.25):
        cor, win = checkAutocorrelations(spreadLTC, priorSD, thisSD, 9)
        print(thisSD, "C", cor)
        print(thisSD, "W", win)
        priorSD = thisSD

    return
Example #54
0
from scipy import stats
import matplotlib.pyplot as plt

generated = stats.norm.rvs(size=900)

print "Mean", "Std", stats.norm.fit(generated)

print "Skewtest", "pvalue", stats.skewtest(generated)

print "Kurtosistest", "pvalue", stats.kurtosistest(generated)

print "normaltest", "pvalue", stats.normaltest(generated)

print "95 percentile", stats.scoreatpercentile(generated, 95)

print "Percentile at 1", stats.percentileofscore(generated, 1)

plt.hist(generated)
plt.show()
Example #55
0
def q4():
    # Return the answer to question 4 here.
    weight_log = np.log(weight)
    k2, p = sct.normaltest(weight_log)
    return bool(p > alpha)
Example #56
0
"""
显著性检验:方差分析(Analysis of Variance,ANOVA,F 检验)
随机性:样本是随机采样但
独立性:来自不同组但样本是相互独立但
正太分布性:组内样本都来自一个正太分布
方差齐性:不同组但方差相等或相近
"""

# Read the data; d1 corresponds to algorithm a, d2 to algorithm b
df = pd.read_csv("./oneway.csv")
d1 = df[df['algo'] == 'a']['ratio']
d2 = df[df['algo'] == 'b']['ratio']

# Test the normality of the two levels
print('---------------- Normality of the two levels ----------------')
print(ss.normaltest(d1))
print(ss.normaltest(d2))

# Test the homogeneity of variance of the two levels
print('---------------- Homogeneity of variance of the two levels ----------------')
args = [d1, d2]
print(ss.levene(*args))

# First way to run the F test
print('---------------- F test, first method ----------------')
print(ss.f_oneway(*args))

# Second way to run the F test (OLS model + ANOVA table)
print('---------------- F test, second method ----------------')
model = ols('ratio ~ algo', df).fit()
anovat = anova_lm(model)
print(anovat)
Example #57
0
print(arma_mod30.params)
print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)
# Using this also performs model selection (the AIC differs noticeably from the other methods?)
# arma_mod30 = sm.tsa.AR(dta).fit(maxlag=15, ic='aic', disp=False)
# print(arma_mod30.params)
# print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)

# check if our model obeys the theory
resid = arma_mod30.resid  # residual
sm.stats.durbin_watson(resid.values)
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
ax = arma_mod30.resid.plot(ax=ax)

# test whether the residuals follow a normal distribution
print(stats.normaltest(resid))
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)

# autocorrelation function (ACF) and partial autocorrelation (PACF) of the residuals
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)

r, q, p = sm.tsa.acf(resid.values.squeeze(), qstat=True)
data = np.c_[range(1, 41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))
Example #58
0
    def evaluate_deviation_from_mean(
            self,
            results_reshaped: np.ndarray,
            result_averaged: np.ndarray,
            value_input: Union[float, np.ndarray],
            no_points_averaged: int = 1) -> np.ndarray:
        dev = results_reshaped - np.tile(
            np.expand_dims(result_averaged, axis=1), (1, no_points_averaged))
        if np.ndim(value_input) == 1:
            plt.figure()
            value_input_formatted = np.expand_dims(value_input, axis=1)
            plt.plot(np.tile(value_input_formatted,
                             reps=(1, no_points_averaged)),
                     dev,
                     marker='.')

            # n_param_values = np.size(dev, axis=0) + 1
            # ax1 = plt.subplot(n_param_values, 1, 1)
            for i in range(0, np.size(dev, axis=0)):
                h3 = plt.figure()
                # plt.subplot(n_param_values, 1, i + 1).\
                plt.hist(dev[i, :],
                         stacked=False,
                         label=str(value_input[i]),
                         density=True)
                # plt.xlim([-0.1, 0.1])
                plt.title(str(value_input[i]))
                if self.get_data_saver() is not None:
                    self.get_data_saver().save_figure(
                        h3, ("deviations_over_parameter%d" % i))

        # else:
        # value_input_formatted = value_input

        h1 = plt.figure()
        plt.hist(np.ravel(dev), 100, density=True)
        mean_est = np.mean(np.ravel(dev))
        std_est = np.std(np.ravel(dev))
        x = np.linspace(np.min(np.ravel(dev)), np.max(np.ravel(dev)))
        random_normal = stats.norm(mean_est, std_est).pdf(x)
        plt.plot(x, random_normal, '--r', label='Fitted normal distribution')
        plt.legend()
        if self.get_data_saver() is not None:
            self.get_data_saver().save_figure(h1,
                                              "histogram_deviations_vadere")
        # plt.savefig('histogram_deviations_vadere.png')

        sm.qqplot(np.ravel(dev), line='s')
        h2 = plt.gcf()
        # plt.savefig('qqplot_deviations_vadere.png')
        if self.get_data_saver() is not None:
            self.get_data_saver().save_figure(h2, "qqplot_deviations_vadere")

        plt.close(h2)

        vadere_logger = logging.getLogger(
            "vaderemodel.evaluate_deviation_from_mean")
        vadere_logger.info("Vadere evaluations: Deviations from average")
        vadere_logger.info("Mean: %f, Std: %f" % (mean_est, std_est))

        # skewtest needs at least 8 samples and kurtosistest warns below 20 samples,
        # so only run the omnibus test once at least 20 deviations are available
        if len(np.ravel(dev)) >= 20:
            alpha = 0.01
            k2, p = stats.normaltest(np.ravel(dev))
            vadere_logger.info("p = {:g}".format(p))
            if p < alpha:  # null hypothesis: x comes from a normal distribution
                vadere_logger.info("The null hypothesis can be rejected")
            else:
                vadere_logger.info("The null hypothesis cannot be rejected")

        return dev
Example #59
0
            sleep(pause_time)

###############################################################################
# Update Volume and Diff
###############################################################################
if 1:

    diff = cu - cu.shift(1)
    diff = diff.dropna()
    values = diff.eur.values

    step = 20
    alpha = 1e-6
    for i in range(step, values.shape[0], step):
        vals = values[values.shape[0] - step:]
        k2, p = stats.normaltest(vals)
        if p < alpha:  # null hypothesis: x comes from a normal distribution
            print('{}: break'.format(i))
            #print("The null hypothesis can be rejected")
        else:
            #print("The null hypothesis cannot be rejected")
            if i % 3000 == 0:
                print(i)
    os.system('say "Completed"')

###############################################################################
# Plot Everything.  Ratios first
###############################################################################
if 1:

    cu.eur.plot(title='cu')

# ---------------------
# Tail of a Z2(K, n) helper: the Anscombe-Glynn z-score for the sample kurtosis,
# used in the omnibus K2 statistic below. The function header, the standardized
# kurtosis X, and the matching Z1 skewness helper are not part of this excerpt.
    b = (6.0 * (n**2.0 - 5.0 * n + 2.0)) / ((n + 7.0) * (n + 9.0))
    b *= np.sqrt((6.0 * (n + 3.0) * (n + 5.0)) / (n * (n - 2.0) * (n - 3.0)))
    A = 6.0 + (8.0 / b) * (2.0 / b + np.sqrt(1.0 + 4.0 / b**2.0))
    z = (1.0 - 2.0 / A) / (1.0 + X * np.sqrt(2.0 / (A - 4.0)))
    z = (1.0 - 2.0 / (9.0 * A)) - z**(1.0 / 3.0)
    z /= np.sqrt(2.0 / (9.0 * A))
    return z


K2 = Z1(S, N)**2.0 + Z2(K, N)**2.0
print('Omnibus: {}'.format(K2))

p = 1.0 - stats.chi2(2).cdf(K2)
print('Pr( Omnibus ) = {}'.format(p))

(K2, p) = stats.normaltest(result.resid)
print('Omnibus: {0}, p = {1}'.format(K2, p))
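
# A cross-check sketch: scipy exposes the two z-scores that the omnibus statistic
# combines, so K2 can be rebuilt from skewtest and kurtosistest directly
# (z_skew / z_kurt are illustrative names).
z_skew, _ = stats.skewtest(result.resid)
z_kurt, _ = stats.kurtosistest(result.resid)
print('Rebuilt K2: {:.5f}'.format(z_skew**2 + z_kurt**2))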

# ---------------------

JB = (N / 6.0) * (S**2.0 + (1.0 / 4.0) * (K - 3.0)**2.0)
p = 1.0 - stats.chi2(2).cdf(JB)
print('JB-statistic: {:.5f},  p-value: {:.5f}'.format(JB, p))
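
# Cross-check sketch: scipy computes the same Jarque-Bera statistic directly from
# the residuals (assuming S and K above are the skewness and kurtosis of result.resid).
jb_stat, jb_p = stats.jarque_bera(result.resid)
print('scipy JB: {:.5f},  p-value: {:.5f}'.format(jb_stat, jb_p))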

# ---------------------

X = np.matrix(X)
EV = np.linalg.eig(X * X.T)
print(EV)
CN = np.sqrt(EV[0].max() / EV[0].min())
print('Condition No.: {:.5f}'.format(CN))