Example #1
0
def infer_ks_test_goodness(l1):
    """Print the outcome of a KS normality test on ``l1``.

    The sample is compared with a normal distribution whose location and
    scale are the sample's own ``average`` and ``std``.  ``'reject'`` is
    printed when the p-value is below 0.01 and ``'accept'`` otherwise; the
    raw ``kstest`` result is printed afterwards.  Nothing is returned.
    """
    fitted_params = [average(l1), std(l1)]
    outcome = kstest(l1, 'norm', fitted_params)
    p_value = outcome[1]
    verdict = 'reject' if p_value < 0.01 else 'accept'
    print(verdict)
    print(outcome)
Example #2
0
def infer_ks_test_goodness(l1):
    """Run a KS test of ``l1`` against a normal law fitted to ``l1`` itself.

    Location and scale come from ``average(l1)`` and ``std(l1)``.  Prints
    ``'reject'`` if the p-value is below 0.01, else ``'accept'``, and then
    prints the full test result.  Has no return value.
    """
    loc, scale = average(l1), std(l1)
    ks_result = kstest(l1, 'norm', [loc, scale])
    verdict_for = {True: 'reject', False: 'accept'}
    print(verdict_for[bool(ks_result[1] < 0.01)])
    print(ks_result)
Example #3
0
def logistic(dataset, name):
    """Run a KS test of a numeric column against the standard logistic distribution.

    Args:
        dataset: table-like object indexable by column name; the column is
            expected to expose ``dtype`` and ``dropna()`` (e.g. a pandas
            DataFrame).
        name: name of the column to test.

    Returns:
        ``(statistic, p_value)`` from ``stats.kstest`` when the column dtype
        is ``int64`` or ``float64``; ``None`` for any other dtype.

    A verdict is printed as a side effect: the logistic hypothesis is
    reported as rejected when the p-value is below 0.055.
    """
    column = dataset[name]
    if column.dtype != 'int64' and column.dtype != 'float64':
        return None

    # dropna() returns a new object; its result must be kept, otherwise the
    # missing values are still present when the sample reaches kstest.
    x = np.array(column.dropna())
    z, p = stats.kstest(x, 'logistic')
    # The messages name the distribution that is actually tested (logistic).
    if p < 0.055:
        print('It is Not a logistic distribution')
    else:
        print('It is a logistic distribution')
    return z, p
Example #4
0
def typedis(dataset, name, dis):
    """Run a KS test of a numeric column against an arbitrary distribution.

    Args:
        dataset: table-like object indexable by column name; the column is
            expected to expose ``dtype`` and ``dropna()`` (e.g. a pandas
            DataFrame).
        name: name of the column to test.
        dis: distribution passed straight to ``stats.kstest`` as its ``cdf``
            argument, e.g. the name of a ``scipy.stats`` distribution.  See
            http://docs.scipy.org/doc/scipy-0.14.0/reference/stats.html#module-scipy.stats

    Returns:
        ``(statistic, p_value)`` from ``stats.kstest`` when the column dtype
        is ``int64`` or ``float64``; ``None`` for any other dtype.

    A verdict is printed as a side effect: the hypothesis is reported as
    rejected when the p-value is below 0.055.
    """
    column = dataset[name]
    if column.dtype != 'int64' and column.dtype != 'float64':
        return None

    # dropna() returns a new object; its result must be kept, otherwise the
    # missing values are still present when the sample reaches kstest.
    x = np.array(column.dropna())
    z, p = stats.kstest(x, dis)
    # Text is built explicitly so the output matches the original
    # comma-separated print statement on both Python 2 and Python 3.
    if p < 0.055:
        print('It is Not as {0}  distribution'.format(dis))
    else:
        print('It is a {0} distribution'.format(dis))
    return z, p
Example #5
0
def norm(dataset, name):
    """Run a KS test of a numeric column against the standard normal distribution.

    Args:
        dataset: table-like object indexable by column name; the column is
            expected to expose ``dtype`` and ``dropna()`` (e.g. a pandas
            DataFrame).
        name: name of the column to test.

    Returns:
        ``(statistic, p_value)`` from ``stats.kstest`` when the column dtype
        is ``int64`` or ``float64``; ``None`` for any other dtype.

    A verdict is printed as a side effect: normality is reported as rejected
    when the p-value is below 0.055.  The comparison is against N(0, 1); the
    sample is not standardised first.
    """
    column = dataset[name]
    if column.dtype != 'int64' and column.dtype != 'float64':
        return None

    # dropna() returns a new object; its result must be kept, otherwise the
    # missing values are still present when the sample reaches kstest.
    x = np.array(column.dropna())
    z, p = stats.kstest(x, 'norm')
    if p < 0.055:
        print('It is Not a normal distribution')
    else:
        print('It is a normal distribution')
    return z, p
Example #6
0
def welisberg(dataset, name, c=None):
    """Run a KS test of a numeric column against SciPy's ``dweibull`` distribution.

    Args:
        dataset: table-like object indexable by column name; the column is
            expected to expose ``dtype`` and ``dropna()`` (e.g. a pandas
            DataFrame).
        name: name of the column to test.
        c: shape parameter of ``dweibull``.  When given, the sample is tested
            against ``dweibull(c)`` with default location and scale.  When
            omitted, shape, location and scale are estimated from the sample
            with ``stats.dweibull.fit``; the p-value is then optimistic
            because the parameters were tuned on the tested data.

    Returns:
        ``(statistic, p_value)`` from ``stats.kstest`` when the column dtype
        is ``int64`` or ``float64``; ``None`` for any other dtype.

    A verdict is printed as a side effect: the hypothesis is reported as
    rejected when the p-value is below 0.055.
    """
    column = dataset[name]
    if column.dtype != 'int64' and column.dtype != 'float64':
        return None

    # dropna() returns a new object; its result must be kept, otherwise the
    # missing values are still present when the sample reaches kstest.
    x = np.array(column.dropna())

    # dweibull has a mandatory shape parameter: calling kstest with the bare
    # name and no args fails inside the distribution's cdf.
    if c is None:
        dist_args = stats.dweibull.fit(x)
    else:
        dist_args = (c,)

    z, p = stats.kstest(x, 'dweibull', args=dist_args)
    if p < 0.055:
        print('It is Not a Weibull distribution')
    else:
        print('It is a weibull distribution')
    return z, p
Example #7
0
    def test_linear_studentt_parent_dist(self, graph):
        """
        Kolmogorov-Smirnov check on student-t data (degree of freedom = 3).

        Generates continuous data for ``graph`` with the "student-t"
        distribution, runs a KS test of the first column against a t
        distribution with 3 degrees of freedom, and asserts that the
        resulting p-value is below 0.01.
        """
        rng_seed = 10
        np.random.seed(rng_seed)

        generated = generate_continuous_data(
            graph,
            distribution="student-t",
            noise_scale=1,
            n_samples=100000,
            seed=rng_seed,
        )

        first_column = generated[:, 0]
        ks_result = stats.kstest(first_column, "t", args=[3])
        assert ks_result[1] < 0.01
Example #8
0
def freq(df, col, max1):
    """Search for a decomposition frequency whose residuals look standard normal.

    For every candidate ``i`` in ``range(1, max1)`` the column is passed to
    ``seasonal_decompose`` and the non-NaN residuals are KS-tested against the
    standard normal distribution.  A candidate is accepted when the p-value
    is at least 0.055.

    Args:
        df: table-like object; ``df[col].values`` is decomposed.
        col: name of the column to decompose.
        max1: exclusive upper bound of the candidate frequencies.

    Returns:
        The last accepted candidate, or ``None`` when none was accepted.

    Side effects: prints the residuals and a verdict per candidate, and calls
    ``decompose(df, col, i)`` with the last candidate tried.
    """
    count = None
    i = None  # stays None when range(1, max1) is empty
    for i in range(1, max1):
        try:
            # NOTE(review): recent statsmodels releases name this keyword
            # `period` instead of `freq` - confirm against the installed version.
            decomposed = seasonal_decompose(df[col].values, freq=i)
            # Work on a local copy: a plain boolean mask replaces the
            # list-wrapped mask (`resid[[mask]]`), which NumPy no longer
            # treats as a 1-D boolean index, and nothing is assigned back
            # onto the result object.
            resid = np.asarray(decomposed.resid)
            resid = resid[~np.isnan(resid)]
            print(resid)
            z, p = stats.kstest(resid, 'norm')
            if p < 0.055:
                print('It is not the required freq')
            else:
                print('it is the required freq')
                count = i
        except ValueError:
            # Best effort: candidates that cannot be decomposed/tested are skipped.
            pass
    # NOTE(review): this passes the last candidate tried, not `count`;
    # confirm which one `decompose` is meant to receive.
    if i is not None:
        decompose(df, col, i)
    return count
def main():
    """Print per-application and global statistics relating issues to API calls and class size.

    Reads ``finaldata.json``: a mapping from application name to a triple
    ``(cgscore, issuescore, classSize)`` of key -> value mappings.

    For every application with more than three keys shared by ``issuescore``
    and ``cgscore`` it prints Spearman and Kendall correlations, KS tests
    against the standard normal distribution and an OLS summary; other
    applications are reported as ``FAILURE``.  Afterwards the correlations,
    two one-way ANOVAs and three OLS summaries are printed for the data of
    all applications pooled together.

    If the data file cannot be read or parsed, "Run analysis" is printed and
    the program exits.
    """
    finaldatafile = "finaldata.json"
    try:
        with open(finaldatafile) as data_file:
            finalData = json.load(data_file)
    except (IOError, ValueError):
        # Missing/unreadable file or invalid JSON (JSON errors are ValueErrors).
        print("Run analysis")
        exit()

    pad = " " * 8

    def report(label, value):
        # One indented "label value" line of the report.
        print(pad + label + str(value))

    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]

        # Keys are always taken in issuescore order so paired lists line up.
        keysWithCalls = [key for key in issuescore if key in cgscore]
        j = len(keysWithCalls)
        if j <= 3:
            print("FAILURE : " + appliName)
            continue
        keysWithSize = [key for key in issuescore if key in classSize]
        keysWithBoth = [key for key in keysWithSize if key in cgscore]

        issueCallgraphValueForStats = [issuescore[key] for key in keysWithCalls]
        callGraphValueForStats = [cgscore[key] for key in keysWithCalls]
        issueSizeValueForStats = [issuescore[key] for key in keysWithSize]
        classSizeValueForStats = [classSize[key] for key in keysWithSize]

        spearmanCorrelationCoefficient, spearmanpvalue = spearmanr(
            issueCallgraphValueForStats, callGraphValueForStats)
        kendalltauCorrelationCoefficient, kendalltaupvalue = kendalltau(
            issueCallgraphValueForStats, callGraphValueForStats)
        # The KS tests use every value of each mapping, not only shared keys.
        ksIssueD, ksIssueP = kstest(
            [issuescore[key] for key in issuescore], "norm")
        ksCallGraphD, ksCallGraphP = kstest(
            [cgscore[key] for key in cgscore], "norm")

        spearmanCorrelationCoefficient2, spearmanpvalue2 = spearmanr(
            issueSizeValueForStats, classSizeValueForStats)
        kendalltauCorrelationCoefficient2, kendalltaupvalue2 = kendalltau(
            issueSizeValueForStats, classSizeValueForStats)
        ksClassSizeD, ksClassSizeP = kstest(
            [classSize[key] for key in classSize], "norm")

        print(appliName)
        print("--- API Call <> Issue")
        report("Spearman rho correlation coefficient = ",
               spearmanCorrelationCoefficient)
        report("Spearman p-value = ", spearmanpvalue)
        report("Kendall Tau = ", kendalltauCorrelationCoefficient)
        report("Kendall p-value = ", kendalltaupvalue)
        report("KS Test D = ", ksIssueD)
        report("KS p-value = ", ksIssueP)
        report("KS Test D = ", ksCallGraphD)
        report("KS p-value = ", ksCallGraphP)
        report("dataset size =", j)
        print("--- Class Size <> Issue")
        report("Spearman rho correlation coefficient = ",
               spearmanCorrelationCoefficient2)
        report("Spearman p-value = ", spearmanpvalue2)
        report("Kendall Tau = ", kendalltauCorrelationCoefficient2)
        report("Kendall p-value = ", kendalltaupvalue2)
        # The issue-score KS result is the same sample as above.
        report("KS Test D = ", ksIssueD)
        report("KS p-value = ", ksIssueP)
        report("KS Test D = ", ksClassSizeD)
        report("KS p-value = ", ksClassSizeP)

        # NOTE(review): unlike the global models below, no constant column is
        # added here - confirm whether a no-intercept fit is intended.
        y = [issuescore[key] for key in keysWithBoth]
        X = np.array([[cgscore[key] for key in keysWithBoth],
                      [classSize[key] for key in keysWithBoth]]).transpose()
        X = [list(row) for row in X]
        results = sm.OLS(y, X).fit()
        print(results.summary(yname="issues", xname=("APIcalls", "ClassSize")))

    print("|" * 80)
    print("-" * 80)
    print("-" * 80)
    print("|" * 80)

    issueForGlobalModel = []
    callGraphForGlobalModel = []
    classSizeForGlobalModel = []
    issueGlobalCallgraphValueForStats = []
    callGlobalGraphValueForStats = []
    NOissueGlobalCallgraphValueForStats = []
    issueGlobalSizeValueForStats = []
    classGlobalSizeValueForStats = []
    anova1issue = []
    anova2issue = []

    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]
        for key in issuescore:
            hasCalls = key in cgscore
            hasSize = key in classSize
            if hasCalls:
                issueGlobalCallgraphValueForStats.append(issuescore[key])
                callGlobalGraphValueForStats.append(cgscore[key])
            else:
                NOissueGlobalCallgraphValueForStats.append(issuescore[key])
            if hasSize:
                issueGlobalSizeValueForStats.append(issuescore[key])
                classGlobalSizeValueForStats.append(classSize[key])
                if hasCalls:
                    issueForGlobalModel.append(issuescore[key])
                    callGraphForGlobalModel.append(cgscore[key])
                    # The class-size regressor must hold the class size; the
                    # issue score here would regress issues on themselves.
                    classSizeForGlobalModel.append(classSize[key])

        for key in cgscore:
            if key in issuescore:
                anova1issue.append(cgscore[key])
            else:
                anova2issue.append(cgscore[key])

    spearmanGlobalCorrelationCoefficient, spearmanpvalueGlobal = spearmanr(
        issueGlobalCallgraphValueForStats, callGlobalGraphValueForStats)
    kendalltauGlobalCorrelationCoefficient, kendalltaupvalueGlobal = kendalltau(
        issueGlobalCallgraphValueForStats, callGlobalGraphValueForStats)

    spearmanGlobalCorrelationCoefficient2, spearmanpvalue2Global = spearmanr(
        issueGlobalSizeValueForStats, classGlobalSizeValueForStats)
    kendalltauGlobalCorrelationCoefficient2, kendalltaupvalue2Global = kendalltau(
        issueGlobalSizeValueForStats, classGlobalSizeValueForStats)

    fvalueanova1, pvalueanova1 = f_oneway(issueGlobalCallgraphValueForStats,
                                          NOissueGlobalCallgraphValueForStats)
    fvalueanova2, pvalueanova2 = f_oneway(anova1issue, anova2issue)

    print(len(NOissueGlobalCallgraphValueForStats))
    print("--- Correlation : API Call <> Issue")
    report("Spearman rho correlation coefficient = ",
           spearmanGlobalCorrelationCoefficient)
    report("Spearman p-value = ", spearmanpvalueGlobal)
    report("Kendall Tau = ", kendalltauGlobalCorrelationCoefficient)
    report("Kendall p-value = ", kendalltaupvalueGlobal)
    report("ANOVA F-value = ", fvalueanova1)
    report("ANOVA p-value = ", pvalueanova1)
    report("ANOVA F-value = ", fvalueanova2)
    report("ANOVA p-value = ", pvalueanova2)
    print("--- Correlation : Class Size <> Issue")
    report("Spearman rho correlation coefficient = ",
           spearmanGlobalCorrelationCoefficient2)
    report("Spearman p-value = ", spearmanpvalue2Global)
    report("Kendall Tau = ", kendalltauGlobalCorrelationCoefficient2)
    report("Kendall p-value = ", kendalltaupvalue2Global)

    print("_" * 80)
    print("_" * 80)
    print("-- GLOBAL OLS --")
    y = issueForGlobalModel
    X = np.array([callGraphForGlobalModel,
                  classSizeForGlobalModel]).transpose()
    X = [list(row) for row in X]
    X = sm.add_constant(X, prepend=False)
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues",
                          xname=("APIcalls", "ClassSize", "const")))

    print("API CALLS only")
    X = sm.add_constant(callGraphForGlobalModel, prepend=False)
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues", xname=["APIcalls", "const"]))

    print("Size only")
    X = sm.add_constant(classSizeForGlobalModel, prepend=False)
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues", xname=["ClassSize", "const"]))
Example #10
0
def _ks_uniformity_per_dim(uniformSamples, numDims, plot, titlePrefix,
                           numGridPoints):
    """KS-test each of the first ``numDims`` columns against ``Uniform(0, 1)``.

    Returns a list of ``(statistic, pvalue)`` pairs, one per dimension.  When
    ``plot`` is true, a cumulative histogram of the column and the uniform
    cdf (evaluated on ``numGridPoints`` points) are drawn on a new figure.
    """
    kstests = [None] * numDims
    for idim in range(numDims):
        samples1d = uniformSamples[:, idim]
        res_test = kstest(samples1d, Uniform(0, 1).cdf)
        kstests[idim] = res_test.statistic, res_test.pvalue
        if plot:
            plt.figure()
            # NOTE(review): `normed` was removed from recent Matplotlib
            # releases (replaced by `density`) - confirm the target version.
            plt.hist(samples1d, cumulative=True, normed=True)
            xs = np.linspace(0, 1, numGridPoints)
            plt.plot(xs, [Uniform(0, 1).cdf(xi) for xi in xs])
            plt.title("%s: %i, %s" % (titlePrefix, idim, kstests[idim]))
    return kstests


def run_densityEstimation(
        functionName,
        method,
        kfold=20,
        numDims=2,
        numSamples=1000,
        candidates="join",
        bandwidthOptimizationType=BandwidthOptimizationType_SILVERMANSRULE,
        out=True,
        plot=False,
        tikz=False):
    """Estimate a density ``kfold`` times and record validation statistics.

    The data set is loaded with ``load_data_set`` and split once into a
    learning part (70%) and a validation part.  In every fold the learning
    part is split again into train/test samples, a density is estimated with
    the estimator selected by ``method`` (a name containing "sgde", "kde" or
    "nataf"), and the following is stored in a per-fold stats dictionary:
    cross entropy on the validation samples, generated samples, the cdf
    transform of the validation samples and per-dimension KS tests of that
    transform against ``Uniform(0, 1)`` - each for ``shuffle=True`` and
    ``shuffle=False``.

    Args:
        functionName: data set identifier, also used in output file names.
        method: estimator name; "sgde_zero" selects "zero" interpolation,
            every other value selects "boundaries".
        kfold: number of folds.
        numDims: number of dimensions that are loaded and KS-tested.
        numSamples: number of samples to load and to draw per fold.
        candidates: forwarded to ``estimateSGDEDensity``.
        bandwidthOptimizationType: forwarded to ``estimateKDEDensity``.
        out: when true, results are written below ``data/<method>/`` after
            every fold.
        plot: when true, diagnostic plots are shown.
        tikz: not used by this function.

    Raises:
        AttributeError: if ``method`` matches none of the known estimators.
    """
    interpolation = "zero" if method == "sgde_zero" else "boundaries"

    samples, bounds, natafType = load_data_set(functionName, numSamples,
                                               numDims)

    # do kfold cross validation
    crossEntropyValidation = np.zeros((kfold, 2))
    learnSamples, validationSamples = splitset(samples, splitPercentage=0.7)

    stats = {}
    for i in range(kfold):
        print("=" * 100)
        print("run (%s)= %i/%i" % (method, i + 1, kfold))
        print("=" * 100)
        print("valid: %i x %i (mean=%g, var=%g)" %
              (validationSamples.shape[0], validationSamples.shape[1],
               np.mean(validationSamples), np.var(validationSamples)))

        # A different but reproducible seed per fold.
        np.random.seed(i * 123456 + i % 2)
        trainSamples, testSamples = splitset(learnSamples,
                                             splitPercentage=1. - 1. / kfold)

        if "sgde" in method:
            dist, stats[i] = estimateSGDEDensity(functionName,
                                                 trainSamples,
                                                 testSamples,
                                                 bounds=bounds,
                                                 iteration=i,
                                                 plot=plot,
                                                 label=method,
                                                 out=out,
                                                 candidates=candidates,
                                                 interpolation=interpolation)
        elif "kde" in method:
            dist, stats[i] = estimateKDEDensity(
                functionName,
                trainSamples,
                testSamples,
                iteration=i,
                plot=plot,
                label=method,
                out=out,
                bandwidthOptimizationTypeStr=bandwidthOptimizationType)
        elif "nataf" in method:
            # estimate nataf density
            dist, stats[i] = estimateNatafDensity(functionName,
                                                  natafType,
                                                  testSamples,
                                                  iteration=i,
                                                  bounds=bounds,
                                                  plot=plot,
                                                  label=method,
                                                  out=out)
        else:
            raise AttributeError("unknown config '%s'" % method)

        # evaluate the distribution according to the validation set
        # (computed once and reused for both the table and the stats entry)
        validationCrossEntropy = dist.crossEntropy(validationSamples)
        crossEntropyValidation[i, 0] = i
        crossEntropyValidation[i, 1] = validationCrossEntropy
        stats[i]["crossEntropyValidation"] = validationCrossEntropy
        stats[i]["validationSamples"] = validationSamples
        stats[i]["samples"] = {"shuffled": {}, "not_shuffled": {}}

        shuffled = stats[i]["samples"]["shuffled"]
        shuffled["rvs"] = dist.rvs(numSamples, shuffle=True)
        shuffled["uniform_validation"] = dist.cdf(validationSamples,
                                                  shuffle=True)
        kstests = _ks_uniformity_per_dim(shuffled["uniform_validation"],
                                         numDims, plot, "shuffled", 10)
        print("-" * 80)
        print("shuffled    ", kstests, np.min(kstests), np.max(kstests))
        if plot:
            plt.show()
        shuffled["kstests"] = kstests

        notShuffled = stats[i]["samples"]["not_shuffled"]
        notShuffled["rvs"] = dist.rvs(numSamples, shuffle=False)
        notShuffled["uniform_validation"] = dist.cdf(validationSamples,
                                                     shuffle=False)
        kstests = _ks_uniformity_per_dim(notShuffled["uniform_validation"],
                                         numDims, plot, "not shuffled", 1000)
        print("not shuffled", kstests, np.min(kstests), np.max(kstests))
        if plot:
            plt.show()
        notShuffled["kstests"] = kstests

        print("CV valid = %g" % crossEntropyValidation[i, 1])

        # write results to file
        if out:
            out_crossEntropy = os.path.join(
                "data", method, "%s.%s.validation.cross_entropies.csv" %
                (method, functionName))
            # Rows 0..i are filled at this point; slicing with [:i] dropped
            # the fold that was just computed.
            np.savetxt(out_crossEntropy, crossEntropyValidation[:i + 1, :])

            # save stats to pickle (binary mode is required for pickle data;
            # the context manager closes the file even if dumping fails)
            out_stats = os.path.join(
                "data", method,
                "%s.%s.best.stats.pkl" % (method, functionName))
            with open(out_stats, "wb") as fd:
                pkl.dump(stats, fd)
# Generate sample data: 100 rolls of a 12-sided die.
# NOTE(review): assumes `randint(low, high, size)` with an exclusive upper
# bound (NumPy style), so values range over 1..12 - confirm the import.
rolls = randint(1, 13, 100)
print(rolls)
print(mean(rolls))

# Demonstration of the central limit theorem.
# Seed the random number generator (the sample above is drawn before seeding).
seed(1)

# Calculate the mean of 100 dice rolls, 1000 times.
means = [mean(randint(1, 13, 100)) for _ in range(1000)]
# Plot the distribution of the sample means.
# NOTE(review): the x axis holds the sample means and the y axis bin counts,
# so the axis labels below look swapped/mislabelled - confirm before changing.
pyplot.hist(means)
plt.xlabel("Frequency")
plt.ylabel("Probability Density")
plt.title("Histogram plot for 100 rolls 12-sided die")
pyplot.show()

# KS test of the z-scored means against the standard normal distribution.
# The result is printed; as a bare expression it was silently discarded
# when this file runs as a script.
print(stats.kstest(stats.zscore(means), "norm"))

# Shapiro-Wilk normality test.
from scipy.stats import shapiro
stat, p = shapiro(means)
print('Statistics={}, p={}'.format(stat, p))
alpha = 0.05
if p > alpha:
    print('Sample looks Normal so we do not reject H0')
else:
    print('Sample does not look Normal so we reject H0')
Example #12
0
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import scipy.optimize
from astropy.modeling import models, fitting
from scipy.stats import norm, stats

# Start from a clean figure, then create four stacked axes that share both
# the x and the y scale.
plt.clf()
f, (ax1, ax2, ax3, ax4) = plt.subplots(4, sharex=True, sharey=True)

## load data
fluxes = np.loadtxt("fluxes.dat")

# - make statistics p-value test
# normaltest and a KS test against the standard normal distribution, both on
# the raw (unstandardised) fluxes.
ptest = scipy.stats.mstats.normaltest(fluxes)
kstest = stats.kstest(fluxes, 'norm')

print(ptest)
print(kstest)

## Following  http://stackoverflow.com/questions/7805552/fitting-a-histogram-with-python

bins = np.linspace(0, 1, 40)
# `density=True` is the current spelling of the removed `normed=1` option.
n, bins, patches = ax1.hist(fluxes,
                            bins,
                            density=True,
                            facecolor='green',
                            alpha=0.75)
(mu, sigma) = norm.fit(fluxes)
# scipy's norm.pdf gives the same curve as mlab.normpdf, which is no longer
# shipped with Matplotlib; the legend label is kept unchanged.
y = norm.pdf(bins, mu, sigma)
l = ax1.plot(bins, y, 'r--', linewidth=2, label='mlab.normpdf')
Example #13
0
    # Silence warnings for the remainder of this block.
    warnings.simplefilter('ignore')

    # Fit every candidate distribution to the 'mean_travel_time' column, plot
    # its scaled pdf and record the KS statistic and p-value of the fit.
    for dist_name in dist_names:
        dist = getattr(scipy.stats, dist_name)
        param = dist.fit(df['mean_travel_time'])
        # The fitted tuple ends with (loc, scale); anything before that is
        # passed as shape parameters.
        pdf_fitted = dist.pdf(x, *param[:-2], loc=param[-2],
                              scale=param[-1]) * size
        plt.plot(pdf_fitted, label=dist_name)
        plt.xlabel("Trip Duration (Minutes)")
        plt.ylabel("Frequency")
        plt.xlim(0, 50)
        plt.ylim(0, 8000)

        params[dist_name] = param
        # Applying the Kolmogorov-Smirnov test
        D, p = stats.kstest(df['mean_travel_time'], dist_name, args=param)
        print("p value for " + dist_name + " = " + str(p))
        print("D value for " + dist_name + " = " + str(D) + "\n")
        dist_results.append((dist_name, p))
        dist_resultsD.append((dist_name, D))

    plt.legend(loc='upper right')
    plt.show()

    # Primary criterion: the distribution with the largest p-value.
    best_dist, best_p = (max(dist_results, key=lambda item: item[1]))

    # If even the best p-value is below 0.001, fall back to the distribution
    # with the smallest KS statistic D.
    if best_p < 0.001:
        best_dist, best_D = (min(dist_resultsD, key=lambda item: item[1]))
    # store the name of the best fit and its p value

    print("Best fitting distribution: " + str(best_dist))
Example #14
0
def main():
    """Print per-application and global statistics relating issues to API calls and class size.

    Reads ``finaldata.json``: a mapping from application name to a triple
    ``(cgscore, issuescore, classSize)`` of key -> value mappings.

    For every application with more than three keys shared by ``issuescore``
    and ``cgscore`` it prints Spearman and Kendall correlations, KS tests
    against the standard normal distribution and an OLS summary; other
    applications are reported as ``FAILURE``.  Afterwards the correlations,
    two one-way ANOVAs and three OLS summaries are printed for the data of
    all applications pooled together.

    If the data file cannot be read or parsed, "Run analysis" is printed and
    the program exits.
    """
    finaldatafile = "finaldata.json"
    try:
        with open(finaldatafile) as data_file:
            finalData = json.load(data_file)
    except (IOError, ValueError):
        # Missing/unreadable file or invalid JSON (JSON errors are ValueErrors).
        print("Run analysis")
        exit()

    pad = " " * 8

    def report(label, value):
        # One indented "label value" line of the report.
        print(pad + label + str(value))

    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]

        # Keys are always taken in issuescore order so paired lists line up.
        keysWithCalls = [key for key in issuescore if key in cgscore]
        j = len(keysWithCalls)
        if j <= 3:
            print("FAILURE : " + appliName)
            continue
        keysWithSize = [key for key in issuescore if key in classSize]
        keysWithBoth = [key for key in keysWithSize if key in cgscore]

        issueCallgraphValueForStats = [issuescore[key] for key in keysWithCalls]
        callGraphValueForStats = [cgscore[key] for key in keysWithCalls]
        issueSizeValueForStats = [issuescore[key] for key in keysWithSize]
        classSizeValueForStats = [classSize[key] for key in keysWithSize]

        spearmanCorrelationCoefficient, spearmanpvalue = spearmanr(
            issueCallgraphValueForStats, callGraphValueForStats)
        kendalltauCorrelationCoefficient, kendalltaupvalue = kendalltau(
            issueCallgraphValueForStats, callGraphValueForStats)
        # The KS tests use every value of each mapping, not only shared keys.
        ksIssueD, ksIssueP = kstest(
            [issuescore[key] for key in issuescore], "norm")
        ksCallGraphD, ksCallGraphP = kstest(
            [cgscore[key] for key in cgscore], "norm")

        spearmanCorrelationCoefficient2, spearmanpvalue2 = spearmanr(
            issueSizeValueForStats, classSizeValueForStats)
        kendalltauCorrelationCoefficient2, kendalltaupvalue2 = kendalltau(
            issueSizeValueForStats, classSizeValueForStats)
        ksClassSizeD, ksClassSizeP = kstest(
            [classSize[key] for key in classSize], "norm")

        print(appliName)
        print("--- API Call <> Issue")
        report("Spearman rho correlation coefficient = ",
               spearmanCorrelationCoefficient)
        report("Spearman p-value = ", spearmanpvalue)
        report("Kendall Tau = ", kendalltauCorrelationCoefficient)
        report("Kendall p-value = ", kendalltaupvalue)
        report("KS Test D = ", ksIssueD)
        report("KS p-value = ", ksIssueP)
        report("KS Test D = ", ksCallGraphD)
        report("KS p-value = ", ksCallGraphP)
        report("dataset size =", j)
        print("--- Class Size <> Issue")
        report("Spearman rho correlation coefficient = ",
               spearmanCorrelationCoefficient2)
        report("Spearman p-value = ", spearmanpvalue2)
        report("Kendall Tau = ", kendalltauCorrelationCoefficient2)
        report("Kendall p-value = ", kendalltaupvalue2)
        # The issue-score KS result is the same sample as above.
        report("KS Test D = ", ksIssueD)
        report("KS p-value = ", ksIssueP)
        report("KS Test D = ", ksClassSizeD)
        report("KS p-value = ", ksClassSizeP)

        # NOTE(review): unlike the global models below, no constant column is
        # added here - confirm whether a no-intercept fit is intended.
        y = [issuescore[key] for key in keysWithBoth]
        X = np.array([[cgscore[key] for key in keysWithBoth],
                      [classSize[key] for key in keysWithBoth]]).transpose()
        X = [list(row) for row in X]
        results = sm.OLS(y, X).fit()
        print(
            results.summary(yname="issues",
                            xname=("APIcalls", "ClassSize")))

    print("|" * 80)
    print("-" * 80)
    print("-" * 80)
    print("|" * 80)

    issueForGlobalModel = []
    callGraphForGlobalModel = []
    classSizeForGlobalModel = []
    issueGlobalCallgraphValueForStats = []
    callGlobalGraphValueForStats = []
    NOissueGlobalCallgraphValueForStats = []
    issueGlobalSizeValueForStats = []
    classGlobalSizeValueForStats = []
    anova1issue = []
    anova2issue = []

    for appliName in finalData:
        cgscore, issuescore, classSize = finalData[appliName]
        for key in issuescore:
            hasCalls = key in cgscore
            hasSize = key in classSize
            if hasCalls:
                issueGlobalCallgraphValueForStats.append(issuescore[key])
                callGlobalGraphValueForStats.append(cgscore[key])
            else:
                NOissueGlobalCallgraphValueForStats.append(issuescore[key])
            if hasSize:
                issueGlobalSizeValueForStats.append(issuescore[key])
                classGlobalSizeValueForStats.append(classSize[key])
                if hasCalls:
                    issueForGlobalModel.append(issuescore[key])
                    callGraphForGlobalModel.append(cgscore[key])
                    # The class-size regressor must hold the class size; the
                    # issue score here would regress issues on themselves.
                    classSizeForGlobalModel.append(classSize[key])

        for key in cgscore:
            if key in issuescore:
                anova1issue.append(cgscore[key])
            else:
                anova2issue.append(cgscore[key])

    spearmanGlobalCorrelationCoefficient, spearmanpvalueGlobal = spearmanr(
        issueGlobalCallgraphValueForStats, callGlobalGraphValueForStats)
    kendalltauGlobalCorrelationCoefficient, kendalltaupvalueGlobal = kendalltau(
        issueGlobalCallgraphValueForStats, callGlobalGraphValueForStats)

    spearmanGlobalCorrelationCoefficient2, spearmanpvalue2Global = spearmanr(
        issueGlobalSizeValueForStats, classGlobalSizeValueForStats)
    kendalltauGlobalCorrelationCoefficient2, kendalltaupvalue2Global = kendalltau(
        issueGlobalSizeValueForStats, classGlobalSizeValueForStats)

    fvalueanova1, pvalueanova1 = f_oneway(issueGlobalCallgraphValueForStats,
                                          NOissueGlobalCallgraphValueForStats)
    fvalueanova2, pvalueanova2 = f_oneway(anova1issue, anova2issue)

    print(len(NOissueGlobalCallgraphValueForStats))
    print("--- Correlation : API Call <> Issue")
    report("Spearman rho correlation coefficient = ",
           spearmanGlobalCorrelationCoefficient)
    report("Spearman p-value = ", spearmanpvalueGlobal)
    report("Kendall Tau = ", kendalltauGlobalCorrelationCoefficient)
    report("Kendall p-value = ", kendalltaupvalueGlobal)
    report("ANOVA F-value = ", fvalueanova1)
    report("ANOVA p-value = ", pvalueanova1)
    report("ANOVA F-value = ", fvalueanova2)
    report("ANOVA p-value = ", pvalueanova2)
    print("--- Correlation : Class Size <> Issue")
    report("Spearman rho correlation coefficient = ",
           spearmanGlobalCorrelationCoefficient2)
    report("Spearman p-value = ", spearmanpvalue2Global)
    report("Kendall Tau = ", kendalltauGlobalCorrelationCoefficient2)
    report("Kendall p-value = ", kendalltaupvalue2Global)

    print("_" * 80)
    print("_" * 80)
    print("-- GLOBAL OLS --")
    y = issueForGlobalModel
    X = np.array([callGraphForGlobalModel,
                  classSizeForGlobalModel]).transpose()
    X = [list(row) for row in X]
    X = sm.add_constant(X, prepend=False)
    results = sm.OLS(y, X).fit()
    print(
        results.summary(yname="issues",
                        xname=("APIcalls", "ClassSize", "const")))

    print("API CALLS only")
    X = sm.add_constant(callGraphForGlobalModel, prepend=False)
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues", xname=["APIcalls", "const"]))

    print("Size only")
    X = sm.add_constant(classSizeForGlobalModel, prepend=False)
    results = sm.OLS(y, X).fit()
    print(results.summary(yname="issues", xname=["ClassSize", "const"]))
 def pvalue(self):
     """Return the KS-test p-value of the samples against a uniform law.

     ``self.samples`` is rescaled from the interval given by ``self.bounds``
     to the unit interval and compared with the standard uniform
     distribution; only the p-value of that test is returned.
     """
     lower, upper = self.bounds
     span = upper - lower
     rescaled = (self.samples - lower) / span
     ks_result = kstest(rescaled, 'uniform')
     return ks_result[1]
Example #16
0
# Draw 100 values from a normal distribution (mu and sigma are defined
# earlier in the script) and inspect them.
x = np.random.normal(mu, sigma, 100)
print(mean(x))
pyplot.hist(x)
pyplot.show()
# Means of 1000 independent samples of 100 normal draws each.
norm=[np.mean(np.random.normal(mu, sigma, 100)) for _i in range(1000)]
print(norm)

#histogram plot
# NOTE(review): the x axis holds the sample means and the y axis bin counts,
# so these axis labels look swapped/mislabelled - confirm before changing.
pyplot.hist(norm)
plt.xlabel("Frequency")
plt.ylabel("Probability Density")
plt.title ("Histogram plot for 100 samples from Random Normal Distribution")
pyplot.show()

#Z-score KS test
# The result is printed; as a bare expression it was silently discarded
# when this file runs as a script.
print(stats.kstest(stats.zscore(norm), "norm"))


#poisson
# NOTE(review): presumably numpy.random.seed, so that the np.random.poisson
# draws below are reproducible - confirm the import.
seed(2)
t=np.random.poisson(lam=3,size=(100))
print(mean(t))
pyplot.hist(t)
plt.xlabel("Frequency")
plt.ylabel("Probability Density")
plt.title ("Histogram plot for Random Poisson Distribution of 100 samples (lambda=3)")
pyplot.show()

# Means of 1000 independent Poisson(3) samples of size 100.
s = [np.mean(np.random.poisson(3,100)) for _i in range(1000)]
pyplot.hist(s)
plt.xlabel("Frequency")
Example #17
0
import numpy as np
from numpy import var, std
from scipy.stats import stats
from statistic.check.util import s_2, sigma_2

__author__ = 'zzt'

if __name__ == '__main__':
    observations = [420, 500, 920, 1380, 1510, 1650, 1760, 2100, 2300, 2350]
    # kstest forwards `args` positionally to the distribution's cdf, and the
    # exponential distribution has no shape parameter: the first value is
    # `loc`, the second `scale`.  An exponential law with mean 1500 is
    # therefore (loc=0, scale=1500); a lone 1500.0 would shift the
    # distribution to start at 1500 with unit scale.
    print(stats.kstest(observations, 'expon', (0, 1500.0)))
    x = np.linspace(-15, 15, 9)
    # Variance of x computed four ways: numpy std squared, numpy var, and the
    # project helpers s_2 and sigma_2.
    print(std(x)**2)
    print(var(x))
    print(s_2(x))
    print(sigma_2(x))
    # Normal distribution with loc=0 and scale=9.
    print(stats.kstest(x, 'norm', [0, 9]))
Example #18
0
def def_kstest(rvs1, cdf1, args1=(), alternative1='two-sided'):
    """Forward the arguments to ``kstest`` and return its result unchanged.

    ``rvs1`` and ``cdf1`` are the sample and the reference distribution,
    ``args1`` the distribution parameters and ``alternative1`` the
    alternative hypothesis.
    """
    options = {'args': args1, 'alternative': alternative1}
    return kstest(rvs1, cdf1, **options)
Example #19
0
# Fitted parameter tuples, keyed by distribution name.
params = {}

with warnings.catch_warnings():
    # Warnings raised inside this block are ignored.
    warnings.simplefilter('ignore')

    for dist_name in dist_names:
        dist = getattr(scipy.stats, dist_name)
        param = dist.fit(df['count'])
        params[dist_name] = param

        # The fitted tuple ends with (loc, scale); everything before that is
        # passed as shape parameters.
        shape_args = param[:-2]
        loc_arg = param[-2]
        scale_arg = param[-1]
        pdf_fitted = dist.pdf(x, *shape_args, loc=loc_arg, scale=scale_arg) * size
        plt.plot(pdf_fitted, label=dist_name)
        plt.xlim(0, 50)
        plt.ylim(0, 500)

        # Kolmogorov-Smirnov test of the data against the fitted distribution.
        D, p = stats.kstest(df['count'], dist_name, args=param)
        print("p value for %s = %s" % (dist_name, p))
        dist_results.append((dist_name, p))

    plt.legend(loc='upper right')
    plt.show()

    # The best fit is the (name, p value) entry with the largest p value.
    best_dist, best_p = max(dist_results, key=lambda entry: entry[1])

    print("Best fitting distribution: %s" % (best_dist,))
    print("Best p value: %s" % (best_p,))
    print("Parameters for the best fit: %s" % (params[best_dist],))