Example #1
    def check_significance(gt_dist, seg_dist):

        normal_gt = normaltest(gt_dist)[1]
        normal_seg = normaltest(seg_dist)[1]

        # if both distributions look normal, use the t-test; otherwise the Mann-Whitney U test
        if normal_gt > 0.05 and normal_seg > 0.05:

            pvalue = ttest_ind(gt_dist, seg_dist)[1]

        else:
            pvalue = mannwhitneyu(gt_dist, seg_dist)[1]


        # inclusive lower bounds so that every p-value maps to a significance level 0-3
        if pvalue >= 0.05:
            return 0

        if 0.01 <= pvalue < 0.05:
            return 1

        if 0.001 <= pvalue < 0.01:
            return 2

        return 3
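A minimal, self-contained call of the helper above; the imports and the two synthetic score samples are illustrative assumptions, not part of the original snippet, and check_significance is assumed to be available as a plain function:

import numpy as np
from scipy.stats import normaltest, ttest_ind, mannwhitneyu

rng = np.random.default_rng(0)
gt_dist = rng.normal(loc=0.80, scale=0.05, size=50)   # hypothetical ground-truth scores
seg_dist = rng.normal(loc=0.72, scale=0.05, size=50)  # hypothetical segmentation scores

# prints a significance level between 0 (not significant) and 3 (p < 0.001)
print(check_significance(gt_dist, seg_dist))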
Example #2
def same_mean(series_1, series_2, significance):
    """ Check the variance and distribution and then make hypothesis test for the mean

    The variance and normal distribution test is needed to check whether a t-test could
    be used, since these two requirements are needed for the t-test. If these requirements
    are not met, the Mann-Whitney-Wilcoxon RankSum test is used

    :param series_1: Pandas Series for first attribute
    :type series_1: pandas.Series
    :param series_2: Pandas Series for second attribute
    :type series_2: pandas.Series
    :param significance: Test significance (normally 5%)
    :type significance: float
    :rtype: tuple(bool, float, float)
    """
    normaltest_series_1 = normaltest(series_1)
    normaltest_series_2 = normaltest(series_2)
    # normaltest's null hypothesis is normality, so only use the t-test when
    # neither normality test rejects (p-value above the significance level)
    if series_1.var() == series_2.var() and\
        normaltest_series_1[1] > significance and\
            normaltest_series_2[1] > significance:
        result, p_value = stats.ttest_ind(series_1, series_2)
    else:
        result, p_value = stats.ranksums(series_1, series_2)
    # A small p value means the probability that values like the ones occur given that
    # both series have the same mean is small -> They don't have the same mean
    if p_value <= significance:
        return False, result, p_value
    else:
        return True, result, p_value
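A hedged usage sketch for same_mean; the imports match the names the function body relies on (normaltest and scipy's stats namespace), while the two toy Series are made up for illustration:

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import normaltest

rng = np.random.default_rng(42)
series_1 = pd.Series(rng.normal(10.0, 2.0, 200))
series_2 = pd.Series(rng.normal(10.3, 2.0, 200))

# returns (same mean?, test statistic, p-value)
equal, statistic, p_value = same_mean(series_1, series_2, significance=0.05)
print(equal, statistic, p_value)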
Example #3
def explore_city_data(city_data):
    """Calculate the Boston housing statistics."""

    # Get the labels and features from the housing data
    housing_prices = city_data.target
    housing_features = city_data.data

    ###################################
    ### Step 1. YOUR CODE GOES HERE ###
    ###################################
    # Please calculate the following values using the Numpy library
    # Size of data (number of houses)?
    houses_dims = housing_features.shape
    print "The number of houses is " + str(houses_dims[0])
    # Number of features?
    print "The number of features is " + str(houses_dims[1])
    # Minimum price?
    print "The minimum price of a house is " + str(housing_prices.min()) + " thousand."
    # Maximum price?
    print "The maximum price of a house is " + str(housing_prices.max()) + " thousand."
    # Calculate mean price?
    print "The average price of a house is " + str(round(housing_prices.mean(),1)) + " thousand."
    # Calculate median price?
    print "The median price of a house is " + str(np.median(housing_prices)) + " thousand."
    # Calculate standard deviation?
    print "The standard deviation of housing prices is " + str(round(housing_prices.std(),1)) + " thousand."
    q75,q25 = np.percentile(housing_prices,[75,25])
    print "The IQR deviation of housing prices is " + str(q75-q25)
    print normaltest(housing_prices)
    plt.hist(housing_prices)
    plt.title("Boston Housing Prices")
    plt.xlabel("Prices in $1000s")
    plt.ylabel("Frequency")
Example #4
 def test_axis_None(self):
     # Test axis=None (equal to axis=0 for 1-D input)
     x = np.array((-2,-1,0,1,2,3)*4)**2
     assert_allclose(mstats.normaltest(x, axis=None), mstats.normaltest(x))
     assert_allclose(mstats.skewtest(x, axis=None), mstats.skewtest(x))
     assert_allclose(mstats.kurtosistest(x, axis=None),
                     mstats.kurtosistest(x))
Example #5
 def test_axis_None(self):
     # Test axis=None (equal to axis=0 for 1-D input)
     x = np.array((-2,-1,0,1,2,3)*4)**2
     assert_allclose(mstats.normaltest(x, axis=None), mstats.normaltest(x))
     assert_allclose(mstats.skewtest(x, axis=None), mstats.skewtest(x))
     assert_allclose(mstats.kurtosistest(x, axis=None),
                     mstats.kurtosistest(x))
Example #6
def normality():
    data_dropped_na = data.dropna()
    column_name = "D’Agostino and Pearson’s Normality Test"

    formula = var_formula.get()
    if formula == '':
        print_status('Warning: Please, specify column names in formula.',
                     'red')
        return
    x_list = formula.split('~')[0].split('+')
    y = None
    try:
        y = formula.split('~')[1]
    except IndexError:
        pass

    test_list = []
    p_value_list = []
    index_list = []

    for x in x_list:
        if x not in data_dropped_na.columns:
            print_status("Warning: No such continuous column.", 'red')
            return
        if y is not None and y not in data_dropped_na.columns:
            print_status('Warning: No such categorical column.', 'red')
            return
        if y is None:
            test, p_value = normaltest(data_dropped_na[x])
            test_list.append(test)
            p_value_list.append(p_value)
            index_list.append(x)
        else:
            for i in set(data_dropped_na[y]):
                test, p_value = normaltest(
                    data_dropped_na[data_dropped_na[y] == i][x])

                test_list.append(test)
                p_value_list.append(p_value)
                index_list.append(x + '[' + str(i) + ']')

    df = pd.DataFrame({
        column_name: test_list,
        "p Value": p_value_list
    },
                      index=index_list)
    writer = pd.ExcelWriter('../../Analysis/Normality.xlsx')
    df.to_excel(writer, sheet_name='Sheet1', startcol=1)
    # df.to_excel(writer, sheet_name='Sheet1', startcol=7)
    writer.save()
    print_status('Status: Normality test performed', 'black')
    os.startfile('../../Analysis\\Normality.xlsx')
Example #7
def compare_best_to_baseline(X_train, y_train, X_test, y_test, base_estimator,
                             best_estimator):
    ###Compare the baseline model to the best predictor on the test set
    #Use baseline model to get CV data on test set
    random.seed(1)
    n_test = X_test.shape[0]
    X_test_base = X_test.loc[:, 'ln_cum_char'].reshape((n_test, 1))
    baseline_test_scores = cross_validation.cross_val_score(
        base_estimator,
        X_test_base,
        y_test,
        scoring='mean_squared_error',
        cv=10)

    #Use best model to get CV data on test set
    feature_list = ('ln_cum_char', 'percent_seen', 'mean_days_since',
                    'mean_term_freq', 'norm_t1', 'norm_t2', 'norm_t3')  #
    X_test_sub = X_test.loc[:, feature_list]
    best_test_scores = cross_validation.cross_val_score(
        best_estimator,
        X_test_sub,
        y_test,
        scoring='mean_squared_error',
        cv=10)

    #Calculate statistics to compare samples from baseline and best model
    p_base_normality = normaltest(baseline_test_scores)[1]
    p_best_normality = normaltest(best_test_scores)[1]
    corr_p_value = pearsonr(baseline_test_scores, best_test_scores)
    t_P_value = ttest_ind(baseline_test_scores, best_test_scores)[1]

    print "Normality test for baseline CV MSE gives a p-value of %0.4f" % p_base_normality
    print "Normality test for best model's CV MSE gives a p-value of %0.4f" % p_best_normality
    print '''The Pearson correlation coefficient between the baseline and best model
    scores is %0.4F, and the correlation p-value is %0.4F''' % (
        corr_p_value[0], corr_p_value[1])
    print "t-test for independece between baseline and best model gives a p-value of %0.4f" % t_P_value

    y_test_base = base_estimator.predict(
        X_test_base)  #Estimate y with model created from training set
    MSE_base = mean_squared_error(
        y_test, y_test_base)  #MSE on test for model based on training set
    print "The non-CV MSE for the baseline is %0.4f" % MSE_base

    #Best MSE on test set
    y_test_best = best_estimator.predict(X_test_sub)
    MSE_best = mean_squared_error(y_test, y_test_best)
    print "The non-CV MSE for the best model is %0.4f" % MSE_best

    return (y_test_base, y_test_best)
Example #8
def plot_effort_density(data, efforts):
    a = []
    for class_name, effort in efforts.items():
        class_data = list(filter(lambda c: c['parent'] == class_name, data))[0]
        a.append((float(class_data['normalized_density']), effort))
    x, y = zip(*a)
    print("Normal test normalized density:", mst.normaltest(x))
    print("Normal test effort:", mst.normaltest(y))
    print("[Pearson]", st.pearsonr(x, y))
    print("[Spearman]", st.spearmanr(x, y))
    plt.scatter(x, y)
    plt.xlabel('Normalized density')
    plt.ylabel('Average wasted effort')
    plt.title('Normalized density vs. average wasted effort')
    plt.grid(True)
    plt.show()
Example #9
def isNormalDistribution(df, alpha, shapiro=True):

    print "\nChecking if the columns follow a normal distribution by d'Agostino & Pearson or Shpapiro test...\n"
    #list of column except the "quality"
    h = list(df.columns.values)
    count = 0
    for i in h:

        u, v = ss.shapiro(df[i])
        k, p = mstats.normaltest(df[i])

        if (shapiro):
            if v < alpha:
                print "   The null hypothesis can be rejected; Column: ", i, "\n"
                count += 1
            else:
                print "   The null hypothesis can not be rejected; Column: ", i, "\n"

        else:
            if p < alpha:
                print "   The null hypothesis can be rejected; Column: ", i, "\n"
                count += 1
            else:
                print "   The null hypothesis can not be rejected; Column: ", i, "\n"
    if count == len(h):
        print "\n\n   No column follows a normal distribution\n"
Example #10
def spearman_with_errors(x, y, yerr, Nmc=1000, plotflag=False, verbose=False):
    ysim = np.zeros(Nmc, 'f')
    rhosim = np.zeros(Nmc, 'f')
    psim = np.zeros(Nmc, 'f')

    for i in range(Nmc):
        ysim = np.random.normal(y, scale=yerr, size=len(y))
        rhosim[i], psim[i] = spearmanr(x, ysim)
    cave = np.mean(rhosim)
    cstd = np.std(rhosim)
    q1 = 50 - 34  # mean minus one std
    lower = np.percentile(rhosim, q1)
    q2 = 50 + 34  # mean plus one std
    upper = np.percentile(rhosim, q2)
    print 'mean (median) = %5.2f (%5.2f), std = %5.2f' % (
        cave, np.median(rhosim), cstd)
    print 'confidence interval from sorted list of MC fit values:'
    print 'lower = %5.2f (%5.2f), upper = %5.2f (%5.2f)' % (lower, cave - cstd,
                                                            upper, cave + cstd)
    k, pnorm = normaltest(rhosim)
    print 'probability that distribution of slopes is normal = %5.2f' % (pnorm)
    if plotflag:
        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        plt.hist(rhosim, bins=10, normed=True)
        plt.xlabel(r'$Spearman \ \rho $')
        plt.axvline(x=cave, ls='-', color='k')
        plt.axvline(x=lower, ls='--', color='k')
        plt.axvline(x=upper, ls='--', color='k')
        plt.subplot(1, 2, 2)
        plt.hist(np.log10(psim), bins=10, normed=True)
        plt.xlabel(r'$\log_{10}(p \ value)$')
    return rhosim, psim
Example #11
def get_stationarity_statistics(df):
    """
    returns a tuple of stationarity and normality statistics for the dataframe being passed
    :param df: 
    :return: 
    """
    # verify stationarity
    adfstat, pvalue, critvalues, resstore = adfuller(df,
                                                     regression="nc",
                                                     store=True,
                                                     regresults=True)

    # D’Agostino and Pearson normality test of returns
    dagostino_results = normaltest(df)

    # Shapiro-Wilk normality test
    shapiro_results = shapiro(df)

    # Kolmogorov-Smirnov normality test
    ks_results = kstest(df, cdf='norm')

    # Anderson-Darling normality test
    anderson_results = anderson(df)

    # Kwiatkowski-Phillips-Schmidt-Shin (KPSS) stationarity test
    kpss_results = KPSS(df)

    return adfstat, pvalue, critvalues, resstore, dagostino_results, shapiro_results, ks_results, anderson_results, kpss_results
Example #12
def testNormality():
	ks,ps = [],[]
	for name in names:
		langData = dataDict[name].values
		k,p = normaltest(langData)
		ks.append(k), ps.append(p)
	return ks,ps
Example #13
 def test_maskedarray_input(self):
     # Add some masked values, test result doesn't change
     x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2
     xm = np.ma.array(np.r_[np.inf, x, 10], mask=np.r_[True, [False] * x.size, True])
     assert_allclose(mstats.normaltest(xm), stats.normaltest(x))
     assert_allclose(mstats.skewtest(xm), stats.skewtest(x))
     assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
Example #14
def spearman_with_errors(x,y,yerr,Nmc=1000,plotflag=False,verbose=False):
    ysim=np.zeros(Nmc,'f')
    rhosim=np.zeros(Nmc,'f')
    psim=np.zeros(Nmc,'f')

    for i in range(Nmc):
        ysim=np.random.normal(y,scale=yerr,size=len(y))
        rhosim[i],psim[i] = spearmanr(x,ysim)
    cave=np.mean(rhosim)
    cstd=np.std(rhosim)
    q1=50-34 # mean minus one std
    lower=np.percentile(rhosim,q1)
    q2=50+34 # mean plus one std
    upper=np.percentile(rhosim,q2)
    print 'mean (median) = %5.2f (%5.2f), std = %5.2f'%(cave,np.median(rhosim),cstd)
    print 'confidence interval from sorted list of MC fit values:'
    print 'lower = %5.2f (%5.2f), upper = %5.2f (%5.2f)'%(lower,cave-cstd, upper,cave+cstd)
    k,pnorm=normaltest(rhosim)
    print 'probability that distribution of slopes is normal = %5.2f'%(pnorm)
    if plotflag:
        plt.figure(figsize=(10,4))
        plt.subplot(1,2,1)
        plt.hist(rhosim,bins=10,normed=True)
        plt.xlabel(r'$Spearman \ \rho $')
        plt.axvline(x=cave,ls='-',color='k')
        plt.axvline(x=lower,ls='--',color='k')
        plt.axvline(x=upper,ls='--',color='k')
        plt.subplot(1,2,2)
        plt.hist(np.log10(psim),bins=10,normed=True)
        plt.xlabel(r'$\log_{10}(p \ value)$')
    return rhosim,psim
Example #15
def plot_effort_num_of_components(data, efforts):
    a = []
    for class_name, effort in efforts.items():
        class_data = list(filter(lambda x: x['parent'] == class_name, data))[0]
        a.append((float(class_data['number_of_components']), effort))
    x, y = zip(*a)
    print("Normal test number_of_components:", mst.normaltest(x))
    print("Normal test effort:", mst.normaltest(y))
    print("[Pearson]", st.pearsonr(x, y))
    print("[Spearman]", st.spearmanr(x, y))
    plt.scatter(x, y)
    plt.xlabel('number_of_components')
    plt.ylabel('Average wasted effort')
    plt.title('number_of_components vs. average wasted effort')
    plt.grid(True)
    plt.show()
Example #16
def normaltest_data(category):
    data, population = load_rating_data(category)
    z, pval = mstats.normaltest(data)
    print(category + " p value is " + str(pval))
    if (pval < 0.01):
        print "Not normal distribution"
    else:
        print "normal"
Example #17
def normaltest_data(category):
    data,population = load_rating_data(category)
    z,pval = mstats.normaltest(data)
    print(category+" p value is "+str(pval))
    if(pval < 0.01):
        print "Not normal distribution"
    else:
        print "normal"
Example #18
 def test_maskedarray_input(self):
     # Add some masked values, test result doesn't change
     x = np.array((-2,-1,0,1,2,3)*4)**2
     xm = np.ma.array(np.r_[np.inf, x, 10],
                      mask=np.r_[True, [False] * x.size, True])
     assert_allclose(mstats.normaltest(xm), stats.normaltest(x))
     assert_allclose(mstats.skewtest(xm), stats.skewtest(x))
     assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
Example #19
def is_normal(a):
    from scipy.stats import mstats

    #Check for normality
    z, pval = mstats.normaltest(a)
    if (pval < 0.05):
        return False
    else:
        return True
Example #20
def is_normal(a):
    from scipy.stats import mstats

    #Check for normality
    z,pval = mstats.normaltest(a)
    if(pval < 0.05):
        return False
    else:
        return True
Example #21
def effort_correlation(data, efforts):
    a = []
    for class_name, percentage in efforts.items():
        class_data = list(filter(lambda x: x['parent'] == class_name, data))[0]
        normalized_density = class_data['normalized_density']
        diversity = class_data['diversity']
        uniqueness = class_data['uniqueness']
        if normalized_density and diversity and uniqueness:
            a.append((float(normalized_density), float(diversity), float(uniqueness), percentage))
    nd, d, u, e = zip(*a)
    print("Normal test density:", mst.normaltest(nd))
    print("Normal test diversity:", mst.normaltest(d))
    print("Normal test uniqueness:", mst.normaltest(u))
    print("Normal test effort:", mst.normaltest(e))
    print("[Spearman] density vs. effort", st.spearmanr(nd, e))
    print("[Spearman] diversity vs. effort", st.spearmanr(d, e))
    print("[Spearman] uniqueness vs. effort", st.spearmanr(u, e))
    print("[Pearson] density vs. effort", st.pearsonr(nd, e))
    print("[Pearson] diversity vs. effort", st.pearsonr(d, e))
    print("[Pearson] uniqueness vs. effort", st.pearsonr(u, e))
Example #22
def plot_uniqueness_vs_num_of_components(data):
    def transform(tuples):
        return [(float(u), int(c)) for u, c in tuples if u and c]

    uniqueness = _get_column(data, 'uniqueness')
    num_of_components = _get_column(data, 'number_of_tests')
    t = zip(uniqueness, num_of_components)
    u, c = zip(*transform(t))

    print("Normal test uniqueness:", mst.normaltest(u))
    print("Normal test components:", mst.normaltest(c))
    print("[Pearson]", st.pearsonr(u, c))
    print("[Spearman]", st.spearmanr(u, c))

    plt.scatter(u, c)
    plt.xlabel('Uniqueness')
    plt.ylabel('Number of components')
    plt.title('Uniqueness vs. number of components')
    plt.grid(True)
    plt.show()
Example #23
    def test_vs_nonmasked(self):
        x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2
        assert_array_almost_equal(mstats.normaltest(x), stats.normaltest(x))
        assert_array_almost_equal(mstats.skewtest(x), stats.skewtest(x))
        assert_array_almost_equal(mstats.kurtosistest(x), stats.kurtosistest(x))

        funcs = [stats.normaltest, stats.skewtest, stats.kurtosistest]
        mfuncs = [mstats.normaltest, mstats.skewtest, mstats.kurtosistest]
        x = [1, 2, 3, 4]
        for func, mfunc in zip(funcs, mfuncs):
            assert_raises(ValueError, func, x)
            assert_raises(ValueError, mfunc, x)
Example #24
def plot_effort_ddu(data, efforts):
    a = []
    for class_name, effort in efforts.items():
        class_data = list(filter(lambda x: x['parent'] == class_name, data))[0]
        a.append((float(class_data['ddu']), effort))
    x, y = zip(*a)
    print("Normal test DDU:", mst.normaltest(x))
    print("Normal test effort:", mst.normaltest(y))
    print("[Pearson]", st.pearsonr(x, y))
    print("[Spearman]", st.spearmanr(x, y))
    plt.scatter(x, y)
    plt.xlabel('DDU')
    plt.ylabel('Average wasted effort')
    plt.title('DDU vs. average wasted effort')
    plt.grid(True)
    plt.xlim(0, 1.0)
    plt.ylim(0, 1.0)
    z = numpy.polyfit(x, y, 1)
    p = numpy.poly1d(z)
    plt.plot(x, p(x), "r-")
    plt.show()
Example #25
def plot_uniqueness_and_tests(data):
    """
    Test correlation between uniqueness and number of tests only for classes that have two or more components.
    """
    uniqueness = _get_column(data, 'uniqueness')
    components = _get_column(data, 'number_of_components')
    tests = _get_column(data, 'number_of_tests')
    d = zip(uniqueness, components, tests)
    d = [(float(u), int(c), int(t)) for u, c, t in d if u and c and t]
    d = [(u, c, t) for u, c, t in d if c > 1]
    u, c, t = zip(*d)
    print("Normal test uniqueness:", mst.normaltest(u))
    print("Normal test components:", mst.normaltest(t))
    print("[Pearson]", st.pearsonr(u, t))
    print("[Spearman]", st.spearmanr(u, t))

    plt.scatter(u, t)
    plt.xlabel('Uniqueness')
    plt.ylabel('Number of tests')
    plt.title('Uniqueness vs. number of tests')
    plt.grid(True)
    plt.show()
Example #26
def compare_best_to_baseline(X_train, y_train, X_test, y_test, base_estimator,
                             best_estimator):
    ###Compare the baseline model to the best predictor on the test set
    #Use baseline model to get CV data on test set    
    random.seed(1)
    n_test = X_test.shape[0]
    X_test_base = X_test.loc[:, 'ln_cum_char'].reshape((n_test, 1))
    baseline_test_scores = cross_validation.cross_val_score(base_estimator, 
                    X_test_base, y_test, scoring='mean_squared_error', cv=10)
                                                             
    #Use best model to get CV data on test set
    feature_list = ('ln_cum_char', 'percent_seen', 'mean_days_since', 
                    'mean_term_freq', 'norm_t1', 'norm_t2', 'norm_t3') #
    X_test_sub = X_test.loc[:, feature_list]
    best_test_scores = cross_validation.cross_val_score(best_estimator, 
                    X_test_sub, y_test, scoring='mean_squared_error', cv=10)
    
    #Calculate statistics to compare samples from baseline and best model    
    p_base_normality = normaltest(baseline_test_scores)[1]    
    p_best_normality = normaltest(best_test_scores)[1]       
    corr_p_value = pearsonr(baseline_test_scores, best_test_scores) 
    t_P_value = ttest_ind(baseline_test_scores, best_test_scores)[1]                                             
    
    print "Normality test for baseline CV MSE gives a p-value of %0.4f" % p_base_normality
    print "Normality test for best model's CV MSE gives a p-value of %0.4f" % p_best_normality
    print '''The Pearson correlation coefficient between the baseline and best model
    scores is %0.4F, and the correlation p-value is %0.4F''' % (corr_p_value[0], corr_p_value[1])
    print "t-test for independece between baseline and best model gives a p-value of %0.4f" % t_P_value    
    
    y_test_base = base_estimator.predict(X_test_base) #Estimate y with model created from training set
    MSE_base = mean_squared_error(y_test, y_test_base) #MSE on test for model based on training set
    print "The non-CV MSE for the baseline is %0.4f" % MSE_base
        
    #Best MSE on test set
    y_test_best = best_estimator.predict(X_test_sub)
    MSE_best = mean_squared_error(y_test, y_test_best)
    print "The non-CV MSE for the best model is %0.4f" % MSE_best
        
    return (y_test_base, y_test_best)
Example #27
    def test_vs_nonmasked(self):
        x = np.array((-2,-1,0,1,2,3)*4)**2
        assert_array_almost_equal(mstats.normaltest(x), stats.normaltest(x))
        assert_array_almost_equal(mstats.skewtest(x), stats.skewtest(x))
        assert_array_almost_equal(mstats.kurtosistest(x),
                                  stats.kurtosistest(x))

        funcs = [stats.normaltest, stats.skewtest, stats.kurtosistest]
        mfuncs = [mstats.normaltest, mstats.skewtest, mstats.kurtosistest]
        x = [1, 2, 3, 4]
        for func, mfunc in zip(funcs, mfuncs):
            assert_raises(ValueError, func, x)
            assert_raises(ValueError, mfunc, x)
Example #28
	def test_normality(self, data):
	    """
	    Tests whether a sample differs from a normal distribution. Returns a 2-tuple of the chi-squared statistic
	    and the associated p-value. Given the null hypothesis that x came from a normal distribution,
	    if the p-value is very small (below the usual alpha level of 0.05), it is unlikely that the data came from a normal distribution.
	    Other possible way: https://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.chisquare.html
	    """
	    # equivalent: print stats.normaltest(data)
	    z, pval = mstats.normaltest(data)
	    print "z value and p value: ", z, pval
	    if pval < 0.05:
	        print "Not normal distribution"
	    return z, pval
Example #29
def run_correlation(df):
    global FEATURES
    global P_SIGNIFICANT

    results = []

    for intention in INTENTION_COLUMNS:
        for feature in FEATURES:
            res = {'feature': feature, 'intention': intention}
            group1 = df[df["intent_current_" +
                           intention] == 0][feature].tolist()
            group2 = df[df["intent_current_" +
                           intention] == 1][feature].tolist()

            are_norm = True
            (s, p) = mstats.normaltest(group1)
            are_norm = are_norm and (p > P_SIGNIFICANT)
            (s, p) = mstats.normaltest(group2)
            are_norm = are_norm and (p > P_SIGNIFICANT)
            if are_norm:
                (s, p) = stats.f_oneway(group1, group2)
                res['test'] = 'One-way ANOVA'
                res['statistic'] = s
                res['p'] = p
                res['mean_0'] = np.mean(group1)
                res['mean_1'] = np.mean(group2)
            else:
                (s, p) = mstats.kruskalwallis(group1, group2)
                res['test'] = 'Kruskal-Wallis'
                res['statistic'] = s
                res['p'] = p
                res['mean_0'] = np.mean(group1)
                res['mean_1'] = np.mean(group2)

            results += [res]
    return results
Example #30
def run_correlation(df, feature, outcome):
    # print("FEATURE",feature,"OUTCOME",outcome)
    # print(len(df.index))
    P_SIGNIFICANT = .05

    outcomes = set(df[outcome].tolist())
    n_outcomes = len(outcomes)
    # print("N OUTCOMES",n_outcomes)

    groups = []
    for oc in outcomes:
        groups += [df[df[outcome] == oc][feature].tolist()]

    are_norm = True
    for g in groups:
        # print(g,len(g))
        (s, p) = mstats.normaltest(g)
        are_norm = are_norm and (p > P_SIGNIFICANT)

    result = {}
    if are_norm:
        if n_outcomes <= 2:
            (s, p) = stats.ttest_ind(groups[0], groups[1])
            result['test'] = 't-test'
        else:
            (s, p) = stats.f_oneway(*groups)
            result['test'] = 'One-way ANOVA'
        result['statistic'] = s
        result['p'] = p
        for (n, g) in zip(range(len(groups)), groups):
            result['mean_%d' % n] = np.mean(g)
    else:
        if n_outcomes <= 2:
            (s, p) = stats.mannwhitneyu(groups[0], groups[1])
            result['test'] = 'Mann-Whitney'
        else:
            # print(len(groups),len(groups[0]))
            (s, p) = mstats.kruskalwallis(*groups)
            result['test'] = 'Kruskal-Wallis'
        result['statistic'] = s
        result['p'] = p
        for (n, g) in zip(range(len(groups)), groups):
            result['mean_%d' % n] = np.mean(g)

    return result
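A small call sketch for the run_correlation(df, feature, outcome) variant above; the DataFrame, column names, and imports are illustrative assumptions, and the function itself is assumed to be in scope:

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import mstats

# toy data: a numeric feature and a two-level outcome, both made up for illustration
rng = np.random.default_rng(1)
df = pd.DataFrame({
    "score": np.concatenate([rng.normal(0.0, 1.0, 40), rng.normal(0.5, 1.0, 40)]),
    "group": [0] * 40 + [1] * 40,
})

result = run_correlation(df, feature="score", outcome="group")
print(result["test"], result["statistic"], result["p"])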
Example #31
def plot_distribution_along_axis(X_embedded, X, axes):
    for axis in axes:
        nr_categories = 1
        colors = cm.rainbow(np.linspace(0, 1, nr_categories))
        for category_index, c in zip(range(nr_categories), colors):
            x_projected = []
            for record_embedded, record in zip(X_embedded, X):
                if True:
                    projected = np.dot(axis, record_embedded)
                    x_projected.append(projected)

            hist, bins = np.histogram(x_projected, bins=50)
            width = 0.7 * (bins[1] - bins[0])
            center = (bins[:-1] + bins[1:]) / 2
            plt.bar(center, hist, align='center', width=width)
            popt, pcov = curve_fit(gaus, center, hist, p0=[1.0, 0.0, 1.0])
            plt.plot(center, gaus(center, *popt), color='red', linewidth=2)
            print(normaltest(x_projected))
        plt.show()
Example #32
def generate_histogram(df, columns, normality_value):

    #Column reduction
    df_col_reduction = df
    df_col_reduction['red_col'] = df_col_reduction.apply(column_reduction,
                                                         axis=1)

    #Perform normality test
    normality_result = normaltest(df_col_reduction['red_col'])

    similarity_value = normality_result[0]
    pvalue = normality_result[1]

    print("Histogram for the data set.")

    histogram_df = df_col_reduction['red_col']

    fig = plt.figure(figsize=__FIG_SIZE__)
    plt.gcf().clear()

    histogram_df.hist(normed=True)
    histogram_df.plot(kind = 'kde', linewidth = 2, \
                            color = 'r', label = 'Distribution Of Dataset')

    norm_fit = stats.norm.pdf(np.linspace(-3, 3, len(histogram_df)),
                              np.mean(histogram_df), np.std(histogram_df))
    plt.plot(np.linspace(-3, 3, len(histogram_df)),
             norm_fit,
             label="Normal Distribution",
             color='k',
             linewidth=2)  # plot it

    plt.xlabel("Dataset Distribution")
    plt.ylabel("Frequency")

    plt.title("Similarity to normal distribution: " + str(similarity_value) +
              ", pvalue: " + str(pvalue))

    plt.legend()
    plt.show()

    return similarity_value, df_col_reduction
Example #33
 def __init__(self, values, stdDevs): 
     new_values = []
     for i in values:
         if i != '':
             try:
                 if "." in i:
                     new_values.append(float(i))
                 else:
                     new_values.append(int(i))
             except:
                 pass #already picked up by error checks
     values = new_values
     super().__init__(values)
     self.stDevOutliers = []
     standardDeviations = Decimal(stdDevs)
     if len(values) >= 8:
         self.pval = mstats.normaltest(array(values))[1]
     else:
         self.pval = 100
     self.min = min(values)
     self.max = max(values)
     self.mean = Decimal(mean(values)).quantize(Decimal('.00000'))
     self.median_low = median_low(values)
     self.median = median(values)
     self.median_high = median_high(values)
     self.stdev = Decimal(stdev(values)).quantize(Decimal('.00'))
     self.normDist = 'No'
     # check the "too few values" sentinel (pval == 100) before the normality threshold
     if self.pval == 100:
         self.normDist = 'N/A'
     elif self.pval > 0.055:
         self.normDist = 'Yes'
     if self.normDist == 'Yes':
         outlier_count = 0
         for x, value in enumerate(values):
             if value < (self.mean - standardDeviations * self.stdev) or \
             value > (self.mean + standardDeviations * self.stdev):  
                 if outlier_count > max_Outliers:
                     self.stDevOutliers = ">%d outliers" % max_Outliers
                     break
                 self.stDevOutliers.append("Row: %d Value: %s" % (x, value))
                 outlier_count += 1
Example #34
 def __init__(self, values, stdDevs):
     standardDeviations = stdDevs 
     new_values = []
     for i in values:
         if i != '':
             try:
                 new_values.append(float(i))
             except:
                 pass #already picked up in error checks
     values = new_values
     super().__init__(values)
     self.stDevOutliers = []
     if len(values) >= 8:
         self.pval = mstats.normaltest(array(values))[1]
     else:
         self.pval = 100
     if self.mode != 'N/A':
         self.mode = self.int_to_sci(self.mode)
     self.min = self.int_to_sci(min(values))
     self.max = self.int_to_sci(max(values))
     self.mean = self.int_to_sci(mean(values))
     self.median_low = self.int_to_sci(median_low(values))
     self.median = self.int_to_sci(median(values))
     self.median_high =  self.int_to_sci(median_high(values))
     self.stdev = self.int_to_sci(stdev(values))
     self.normDist = 'No'
     # check the "too few values" sentinel first; a p-value above the threshold means normality is not rejected
     if self.pval == 100:
         self.normDist = 'N/A'
     elif self.pval > 0.055:
         self.normDist = 'Yes'
     if self.normDist == 'Yes':
         outlier_count = 0
         for x, value in enumerate(values):
             if value < (float(self.mean) - standardDeviations * float(self.stdev)) or \
             value > (float(self.mean) + standardDeviations * float(self.stdev)):               
                 if outlier_count > max_Outliers:
                     self.stDevOutliers = ">%d outliers" % max_Outliers
                     break
                 self.stDevOutliers.append("Row: %d Value: %s" % (x, value))
                 outlier_count += 1
Example #35
    def bivariateNormalTest(self, df):
        # get the data
        n = len(COMMON_COLUMNS)

        # get columns
        a = np.array(df[COMMON_COLUMNS[2]])
        b = np.array(df[COMMON_COLUMNS[3]])
        print(a)
        print(b)
        temp = np.append(a, b)
        print(temp.shape)
        print(normaltest(temp))

        # set up return matrix
        mat = np.empty([n, n])

        # iterate through matrix
        for i in range(n):
            for j in range(n):
                ci = df[COMMON_COLUMNS[i]]
                cj = df[COMMON_COLUMNS[j]]
                temp = pd.DataFrame([ci, cj])
Example #36
def test(log):
	stats = statistics(log)

	# for i, stat in enumerate(stats):
	#	if stat:
	#		print "%d \t %.8f \t %.8f \t %s \t %d " %(i+1, stat['rtt_m'], stat['rtt_std'], str(stat['d_rtt_m']) if 'd_rtt_m' in stat else "*" , stat['n'])

	samples = [ s for s in stats if s and 'd_rtt_m' in s and s['d_rtt_m'] != '*' and s['d_rtt_m'] > 0 ]

	samples_rtt = [ s['d_rtt_m'] for s in samples ]

	print "== Test de normalidad ==\n"
	print "p-value = {}\n".format(normaltest(samples_rtt)[1])

	for k in range(len(samples)):
		print "MAX: {}".format(max(samples_rtt))
		G, a, G_crit = grubbs(samples_rtt)

		hop_to = max((s for s in samples if s['d_rtt_m'] in samples_rtt),
				key = lambda s: s['d_rtt_m'])['ip']
		for i in range(len(stats) - 1):
			if stats[i+1] and 'ip' in stats[i+1] and stats[i+1]['ip'] == hop_to:
				hop_from = stats[i]['ip']

		print """
== Grubbs outlier test #{} ==

     G = {}
     a = {}
G_crit = {}

Hop: {} -> {}
""".format(k, G, a, G_crit, hop_from, hop_to)

		if G > G_crit:
			samples_rtt.remove(max(samples_rtt))
		else:
			break
Example #37
def are_different(data, factor, metric, threshold = 0.05):
    
    results = []
    tested_values = []
    
    values = data["denormalized"][factor].unique()

    for value in values:
      results.append(data["denormalized"].loc[(data["denormalized"][factor] == value)][metric])
    
    # assume parametric tests are applicable until any group fails the normality test
    parametric = True
    for value, result in zip(values,results):
        print(value, result.mean())
        if mstats.normaltest(result)[1] < 0.05:
            parametric = False

    print()
    if parametric:
      print("Parametric test")
    else:
      print("NON Parametric test")
    print()
        
    for value, result in zip(values,results):
        for value2, result2 in zip(values, results):
            if not value == value2 and value2 not in tested_values:
                tested_values.append(value)
                if not parametric:
                    # z_stat, p_val = wilcoxon(result, result2, zero_method='wilcox', correction=False)
                    z_stat, p_val = ttest_ind(result, result2, equal_var=False)
                else:
                    z_stat, p_val = ttest_ind(result, result2, equal_var=False)
                if p_val < threshold: # 0.05
                    print("Statistically significant different results between %s and %s" 
                          % (value, value2))
                else:
                    print("Statistically NON-significant different results between %s and %s" 
                          % (value, value2))
Example #38
def plot_box_resids(fit_model, y_pred, subset=None):
    '''More than you ever wanted to know about your residuals'''
    # standardize residuals by their standard deviation
    s_resid = (fit_model.resid - np.mean(fit_model.resid)) /\
               np.std(fit_model.resid)
    if subset:
        s_resid = np.random.choice(s_resid,
                                   replace=False,
                                   size=math.floor(len(s_resid) * subset))
    df = pd.DataFrame(s_resid, columns=['resids'])
    temp_df = pd.DataFrame(y_pred, columns=['target'])
    df = df.join(temp_df)

    if min(y_pred) < -1:
        df['turnout_bucket'] = df['target']\
        .apply(lambda x: int(math.floor(10 * np.exp(x))))
        y = df['target'].apply(lambda x: np.exp(x))
    else:
        df['turnout_bucket'] = df['target']\
        .apply(lambda x: int(math.floor(10 * x)))
        y = df['target']

    posit = sorted(df['turnout_bucket'].unique())

    plt.scatter(y, s_resid, alpha=.2)
    slope, intercept = np.polyfit(y, s_resid, 1)
    plt.plot(y, np.poly1d(np.polyfit(y, s_resid, 1))(y))
    plt.title('Studentized Residuals vs Prediction')
    plt.xlabel('Predicted Value')
    plt.ylabel('Studentized Residual')
    print 'Slope of best fit line: %s' % slope
    plt.show()

    ax1 = df[['resids', 'turnout_bucket']]\
        .boxplot(by='turnout_bucket', positions=posit, widths=.5)
    plt.title('Residuals versus Turnout')
    plt.xlabel('Turnout Bucket')
    plt.ylabel('Studentized Residuals')
    plt.suptitle('')
    plt.show()

    fig = sm.qqplot(s_resid, line='s')
    plt.title('Q-Q Plot')
    plt.show()

    w, p_val = shapiro(s_resid)
    print 'Shapiro-Wilk P_val is %s, larger the better' % p_val

    k, p_val = normaltest(s_resid)
    print 'D’Agostino and Pearson’s P_val is %s, larger the better' % p_val

    k, p_val = kstest(s_resid, 'norm')
    print 'Kolmogorov–Smirnov P_val is %s, larger the better' % p_val

    A, critical, sig = anderson(s_resid)
    print 'Anderson-Darling A2 is %s, smaller the better' % A
    print critical
    print sig

    n, bins, patches = plt.hist(s_resid, 75, normed=1)
    mu = np.mean(s_resid)
    sigma = np.std(s_resid)
    plt.plot(bins, mlab.normpdf(bins, mu, sigma))
    plt.title('Residuals versus a Normal Dist')
    plt.show()

    df['turnout_bucket'].hist(bins=posit, align='left', color='b')
    plt.title('Histogram of Turnout Bucket')
    plt.ylabel('Count')
    plt.xlim(-.5, -.5 + len(posit))

    temp = df[['resids', 'turnout_bucket']].groupby('turnout_bucket').count()
    temp.columns = ['Count']
    plt.show()
    print temp
Example #39
plt.setp(r1['caps'], color='black',lw=1.5)
plt.setp(r1['medians'], color='black',lw=1.5)

plt.setp(r2['boxes'], color='black',lw=1.5) 
plt.setp(r2['whiskers'], color='black',lw=1.5) 
plt.setp(r2['caps'], color='black',lw=1.5)
plt.setp(r2['medians'], color='black',lw=1.5)
 
ax.set_ylabel('TOTAL EDDY AREA, IN METERS SQUARED')
ax.get_yaxis().set_major_formatter(tkr.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.tight_layout()
plt.savefig(r"C:\workspace\Time_Series\Output\Joes_Figs\grouped_mc_area_boxplot.png",dpi=600)

from scipy.stats.mstats import normaltest, skewtest

print 'old ', normaltest(area_old)
print 'combined ', normaltest(combined)

print 'old ', skewtest(area_old)
print 'combined ', skewtest(combined)

a = probplot(area_old,dist='norm', plot=None)
b= probplot(combined,dist='norm', plot=None)
colors = {'r':'red','s':'blue', 'u':'green'}
markers = {'r':'*','s':'x', 'u':'o'}

old_df = pd.DataFrame(area_old, columns=['Long Term Sites: N=12'])
old_df['Bar_Type'] = lt_bt
old_df = old_df.sort_values(by='Long Term Sites: N=12')
old_df['quart']=a[0][0]
Example #40
def plot_box_resids(fit_model, y_pred, subset=None):
    '''More than you ever wanted to know about your residuals'''
    # standardize residuals by their standard deviation
    s_resid = (fit_model.resid - np.mean(fit_model.resid)) /\
               np.std(fit_model.resid)
    if subset:
        s_resid = np.random.choice(s_resid,
                                  replace=False,
                                  size=math.floor(len(s_resid) * subset))
    df = pd.DataFrame(s_resid, columns=['resids'])
    temp_df = pd.DataFrame(y_pred, columns=['target'])
    df = df.join(temp_df)

    if min(y_pred) < -1:
        df['turnout_bucket'] = df['target']\
        .apply(lambda x: int(math.floor(10 * np.exp(x))))
        y = df['target'].apply(lambda x: np.exp(x))
    else:
        df['turnout_bucket'] = df['target']\
        .apply(lambda x: int(math.floor(10 * x)))
        y = df['target']

    posit = sorted(df['turnout_bucket'].unique())

    plt.scatter(y, s_resid, alpha=.2)
    slope, intercept = np.polyfit(y, s_resid, 1)
    plt.plot(y, np.poly1d(np.polyfit(y, s_resid, 1))(y))
    plt.title('Studentized Residuals vs Prediction')
    plt.xlabel('Predicted Value')
    plt.ylabel('Studentized Residual')
    print 'Slope of best fit line: %s' % slope
    plt.show()

    ax1 = df[['resids', 'turnout_bucket']]\
        .boxplot(by='turnout_bucket', positions=posit, widths=.5)
    plt.title('Residuals versus Turnout')
    plt.xlabel('Turnout Bucket')
    plt.ylabel('Studentized Residuals')
    plt.suptitle('')
    plt.show()

    fig = sm.qqplot(s_resid, line='s')
    plt.title('Q-Q Plot')
    plt.show()

    w, p_val = shapiro(s_resid)
    print 'Shapiro-Wilk P_val is %s, larger the better' % p_val

    k, p_val = normaltest(s_resid)
    print 'D’Agostino and Pearson’s P_val is %s, larger the better' % p_val

    k, p_val = kstest(s_resid, 'norm')
    print 'Kolmogorov–Smirnov P_val is %s, larger the better' % p_val

    A, critical, sig = anderson(s_resid)
    print 'Anderson-Darling A2 is %s, smaller the better' % A
    print critical
    print sig

    n, bins, patches = plt.hist(s_resid, 75, normed=1)
    mu = np.mean(s_resid)
    sigma = np.std(s_resid)
    plt.plot(bins, mlab.normpdf(bins, mu, sigma))
    plt.title('Residuals versus a Normal Dist')
    plt.show()

    df['turnout_bucket'].hist(bins=posit, align='left', color='b')
    plt.title('Histogram of Turnout Bucket')
    plt.ylabel('Count')
    plt.xlim(-.5, - .5 + len(posit))

    temp = df[['resids', 'turnout_bucket']].groupby('turnout_bucket').count()
    temp.columns = ['Count']
    plt.show()
    print temp
Example #41
 def test_normaltest_result_attributes(self):
     x = np.array((-2, -1, 0, 1, 2, 3)*4)**2
     res = mstats.normaltest(x)
     attributes = ('statistic', 'pvalue')
     check_named_results(res, attributes, ma=True)
Example #42
 def test_normaltest_result_attributes(self):
     x = np.array((-2, -1, 0, 1, 2, 3) * 4)**2
     res = mstats.normaltest(x)
     attributes = ('statistic', 'pvalue')
     check_named_results(res, attributes, ma=True)
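The two result-attribute tests above rely on normaltest returning a named result; below is a quick sketch of that access pattern, assuming a SciPy version in which normaltest returns a NormaltestResult named tuple:

import numpy as np
from scipy import stats

x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2
res = stats.normaltest(x)
print(res.statistic, res.pvalue)  # same values as res[0] and res[1]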
Example #43
def compare(dataset):
   df = pd.read_csv(dataset)
   df_num_rows = len(df.index)
   df_num_cols = len(df.columns)

   # Calculate number of samples to use as an example training set
   # for which the degree of fit to a normal distribution
   # will be determined. Must have at least two samples
   if __TRAINING_TEST_SPLIT__ != None: num_samples = max(2, int(df_num_rows * __TRAINING_TEST_SPLIT__)) 

   #print("Starting to compute the degree of match between ")
   #print(" a training and test data sets over ", __NUM_ITERATIONS__, " iteration(s)")
   iter_ctr = 1
   fig_ctr = 1
   for _ in itertools.repeat(None, __NUM_ITERATIONS__): 

     dfsvc_train = df
     if __TRAINING_TEST_SPLIT__ != None:
        # Randomly select num_samples from df
        new_df = df.sample(n=num_samples)
        new_df_num_rows = len(new_df.index)
        new_df_num_cols = len(new_df.columns)
        
        # Extract training and test data sets
        dfsvc_train = df.sample(frac = __TRAINING_TEST_SPLIT__)
        dfsvc_test = pd.concat([dfsvc_train, df]).loc[dfsvc_train.index.symmetric_difference(df.index)] 
     
     # Training data
     __PREDICTOR_VARIABLES__ = df.columns[2:]
     X = dfsvc_train[__PREDICTOR_VARIABLES__]
     if __TRAINING_TEST_SPLIT__ != None:
        X_test = dfsvc_test[__PREDICTOR_VARIABLES__] 
     
     # Scale the data set from -1 to 1
     print ("\n\n   Scaling data set between [-1., 1.]" )
     scaler = MinMaxScaler(feature_range = (-1., 1.))
     X_scaled = scaler.fit_transform(X)
     if __TRAINING_TEST_SPLIT__ != None:
        X_test_scaled = scaler.fit_transform(X_test)


     # Generate histograms for both classes in both the training and test data sets
     # First compute vector sum of samples for training set
     #print("   Deterining the degree of fit between training and test data to a normal distribution.")
     col_names = X.columns
     df_X_scaled = pd.DataFrame(X_scaled, columns = col_names)
     if __TRAINING_TEST_SPLIT__ != None:
        df_X_test_scaled = pd.DataFrame(X_test_scaled, columns = col_names)
     
     # Make copy of data frames and compute vector sum in preparation to 
     # generate histograms
     df_X_scaled_vecsum = df_X_scaled
     df_X_scaled_vecsum['vec_sum'] = df_X_scaled_vecsum.apply(comp_vec_sum, axis = 1)
     if __TRAINING_TEST_SPLIT__ != None:
        df_X_test_scaled_vecsum = df_X_test_scaled
        df_X_test_scaled_vecsum['vec_sum'] = df_X_test_scaled_vecsum.apply(comp_vec_sum, axis = 1)
     


     # Determine fit of training and test data to a normal distribution
     # That is, test the underlying assumption of the VC Dimension that 
      # a normal distribution governs the distribution of the data.
     # Using the API: scipy.stats.mstats.normaltest:

     # Extract the vector sum info from the train and test data sets
     X_scaled_hist_data = df_X_scaled_vecsum['vec_sum']
     if __TRAINING_TEST_SPLIT__ != None:
        X_test_scaled_hist_data = df_X_test_scaled_vecsum['vec_sum']
     
     # Compute degree of match of data to normal dist
     X_scaled_hr = normaltest(X_scaled_hist_data)
     X_scaled_hr_match = X_scaled_hr[0]
     X_scaled_hr_match_pvalue = X_scaled_hr[1]
     print("   Data set match to normal dist: %.1f  with p-value: %.4E" % \
             (X_scaled_hr_match, Decimal(X_scaled_hr_match_pvalue)))

     if __TRAINING_TEST_SPLIT__ != None:
        X_test_scaled_hr = normaltest(X_test_scaled_hist_data)
        X_test_scaled_hr_match = X_test_scaled_hr[0]
        X_test_scaled_hr_match_pvalue = X_test_scaled_hr[1]
        print("   Test data set match to normal dist:     %.1f  with p-value: %.4E" % \
                (X_test_scaled_hr_match, Decimal(X_test_scaled_hr_match_pvalue)))

     #print("Completed deterining the degree of fit of training and test data to normal distribution")
     #print("  for iteration: ", iter_ctr)
     
     # Display histograms for training and test data
     # See:  http://danielhnyk.cz/fitting-distribution-histogram-using-python/ 
     print("\n\nDisplaying histograms for data sets.")
     
     # Display training data first
     fig = plt.figure(fig_ctr, figsize = (__PLOT_SIZE_X__, __PLOT_SIZE_Y__)) 
     fig_ctr = 1 + fig_ctr
     plt.gcf().clear()
             
     X_scaled_hist_data.hist(normed = True)
     X_scaled_hist_data.plot(kind = 'kde', linewidth = 2, \
                             color = 'r', label = 'Distribution Of Training Data')
     
     # find minimum and maximum of xticks, so we know
     # where we should compute theoretical distribution
     xt = plt.xticks()[0]  
     xmin, xmax = min(xt), max(xt)  
     lnspc = np.linspace(xmin, xmax, len(X_scaled_hist_data))
     
     # Now display the normal distribution over the histogram of the 
     # training data
     m, s = stats.norm.fit(X_scaled_hist_data) # get mean and standard deviation  
     pdf_g = stats.norm.pdf(lnspc, m, s) # now get theoretical values in our interval  
     plt.plot(lnspc, pdf_g, label="Normal Distribution", color = 'k', linewidth = 2) # plot it
     
     plt.xlabel("Training data feature vector distance/magnitude.")
     plt.ylabel("Frequency.")
     match_val = '%.2f' % Decimal(X_scaled_hr_match)
     match_p_val = '%.4E' % Decimal(X_scaled_hr_match_pvalue)
     
     title_str = "Histrogram and Distribution of training data overlayed with normal distribution. " \
        + "  Degree of match = " + match_val + " with p-value = " + match_p_val + "."
     plt.title("\n".join(wrap(title_str, __MATPLOTLIP_TITLE_WIDTH__)))
     
     leg = plt.legend(loc = 'best', ncol = 1, shadow = True, fancybox = True)
     leg.get_frame().set_alpha(0.5)
     
     plt.show()
         
     if __TRAINING_TEST_SPLIT__ != None:
        # Display test dataset next
        fig = plt.figure(fig_ctr, figsize = (__PLOT_SIZE_X__, __PLOT_SIZE_Y__))
        fig_ctr = 1 + fig_ctr 
        plt.gcf().clear()
                
        X_test_scaled_hist_data.hist(normed = True)
        X_test_scaled_hist_data.plot(kind = 'kde', linewidth = 2, \
                                color = 'r', label = 'Distribution Of Test Data')
        
        # find minimum and maximum of xticks, so we know
        # where we should compute theoretical distribution
        xt = plt.xticks()[0]  
        xmin, xmax = min(xt), max(xt)  
        lnspc = np.linspace(xmin, xmax, len(X_test_scaled_hist_data))
        
        # Now display the normal distribution over the histogram of the test data
        m, s = stats.norm.fit(X_test_scaled_hist_data) # get mean and standard deviation  
        pdf_g = stats.norm.pdf(lnspc, m, s) # now get theoretical values in our interval  
        plt.plot(lnspc, pdf_g, label="Normal Distribution", color = 'k', linewidth = 2) # plot it
        
        plt.xlabel("Test data feature vector distance/magnitude.")
        plt.ylabel("Frequency.")
        match_val = '%.2f' % Decimal(X_test_scaled_hr_match)
        match_p_val = '%.4E' % Decimal(X_test_scaled_hr_match_pvalue) 
        
        title_str = "Histogram and Distribution of test data overlayed with normal distribution." \
           + "  Degree of match = " + match_val + " with p-value = " + match_p_val + "."
        
        plt.title("\n".join(wrap(title_str, __MATPLOTLIP_TITLE_WIDTH__)))
        
        leg = plt.legend(loc = 'best', ncol = 1, shadow = True, fancybox = True)
        leg.get_frame().set_alpha(0.5)
        
        plt.show() 


     #print("Completed displaying histograms for training and test data sets")
     #print("  for iteration: ", iter_ctr)
     
     # Increment iteration count
     iter_ctr = 1 + iter_ctr 
     
     if iter_ctr <= __NUM_ITERATIONS__:
        print("")
        #print("Starting iteration: ", iter_ctr) 
     else:
        print()        
Example #44
    meas_table = table[table["Measurement"] == descriptor]

    # how to access statistical values
    values_list = meas_table[stat_value].tolist()

    # adding values of interest to table for visualization
    #dataframe[tables] = values_list

    ###
    data_d.update({tables: values_list})
    ###

    max_vals.append(np.max(values_list))


    if normaltest(values_list)[1] > 0.05:
        normtest = "| Parametric distribution"
        normtest_list.append(True)
    else:
        normtest = "| Non-parametric distribution"
        normtest_list.append(False)

    print(tables, normtest, normaltest(values_list)[1])

print("\n")

#print(data_d)

# converting dictionary with different list lengths into a pandas dataframe
dataframe = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data_d.items()]))
Example #45
# plot acf and pacf of signals
white_acf_pacf = plot_acf_pacf(signal=white_noise_signal, name='ACF_WN')
blue_acf_pacf = plot_acf_pacf(signal=blue_noise_signal, name='ACF_BN')
pink_acf_pacf = plot_acf_pacf(signal=pink_noise_signal, name='ACF_PN')

# plot periodogram of signals
hbo_specgram = plot_periodgram(signal=hbo_signal, name='FD_HBO', color='k')
white_specgram = plot_periodgram(signal=extract_signal(file_name='white_noise.wav', num_frames=441000), name='FD_WN',
                                 color='k')
blue_specgram = plot_periodgram(signal=extract_signal(file_name='blue_noise.wav', num_frames=441000), name='FD_BN',
                                color='b')
pink_specgram = plot_periodgram(signal=extract_signal(file_name='pink_noise.wav', num_frames=441000), name='FD_PN',
                                color='m')
plot_periodgram(plot_decomposition(), name='FD_SinFunc')

# adf test
hbo_adf = adfuller(hbo_signal)
white_noise_adf = adfuller(white_noise_signal)
blue_noise_adf = adfuller(blue_noise_signal)
pink_noise_adf = adfuller(pink_noise_signal)
# normality test
hbo_norm = normaltest(hbo_signal)
white_noise_norm = normaltest(white_noise_signal)
blue_noise_norm = normaltest(blue_noise_signal)
pink_noise_norm = normaltest(pink_noise_signal)
# histogram plot
plt.hist(white_noise_signal, bins=50)
plt.hist(blue_noise_signal, bins=50)
plt.hist(pink_noise_signal, bins=50)
Example #46
print(
    "   Determining the degree of fit between training and test data to a normal distribution."
)
col_names = X.columns
df_X_scaled = pd.DataFrame(X_scaled, columns=col_names)

# Make copy of data frames and compute vector sum in preparation to
# generate histograms
df_X_scaled_vecsum = df_X_scaled
df_X_scaled_vecsum['vec_sum'] = df_X_scaled_vecsum.apply(comp_vec_sum, axis=1)

# Extract the vector sum info from the train and test data sets
X_scaled_hist_data = df_X_scaled_vecsum['vec_sum']

# Compute degree of match of data to normal dist
X_scaled_hr = normaltest(X_scaled_hist_data)
X_scaled_hr_match = X_scaled_hr[0]
X_scaled_hr_match_pvalue = X_scaled_hr[1]

print("    Data set match to normal dist: %.1f  with p-value: %.4E" % \
        (X_scaled_hr_match, Decimal(X_scaled_hr_match_pvalue)))

print("Displaying histograms for the data set.")

fig = plt.figure(fig_ctr, figsize=(__PLOT_SIZE_X__, __PLOT_SIZE_Y__))
fig_ctr = 1 + fig_ctr
plt.gcf().clear()

X_scaled_hist_data.hist(normed=True)
X_scaled_hist_data.plot(kind = 'kde', linewidth = 2, \
                        color = 'r', label = 'Distribution Of The Data')
Example #47
new_x = numpy.hstack((add_column, x_matrix))

#matrices multiplication

step_one = numpy.dot (new_x.T, new_x)
step_two = numpy.linalg.pinv(step_one)
step_three = numpy.dot(step_two, new_x.T)


coeffs = numpy.dot(step_three, y_matrix)
errors = y_matrix - numpy.dot(new_x, coeffs)

print coeffs
#the model is too complicated (multidimensional)
#are errors distributed normally? if yes, then the model is accurate

import numpy as np
import numpy.ma as ma
from scipy.stats import mstats

x = np.array(errors) 

z,pval = mstats.normaltest(x) #Tests whether a sample differs from a normal distribution.
#This function tests the null hypothesis that a sample comes from a normal distribution
print "Z-score:", z 
print "P-value:", pval

if pval < 0.055:
    print "Not normal distribution"
else:
    print "This seems to be a normal distribution! Our model is good"