Example #1
def test_omni_normtest():
    #tests against R fBasics
    from scipy import stats
    st_pv_R = np.array(
              [[3.994138321207883, -1.129304302161460,  1.648881473704978],
               [0.1357325110375005, 0.2587694866795507, 0.0991719192710234]])

    # x is a module-level sample array in the source test file
    nt = omni_normtest(x)
    assert_almost_equal(nt, st_pv_R[:, 0], 14)

    st = stats.skewtest(x)
    assert_almost_equal(st, st_pv_R[:, 1], 14)

    kt = stats.kurtosistest(x)
    assert_almost_equal(kt, st_pv_R[:, 2], 11)

    st_pv_R = np.array(
              [[34.523210399523926,  4.429509162503833,  3.860396220444025],
               [3.186985686465249e-08, 9.444780064482572e-06, 1.132033129378485e-04]])

    x2 = x**2
    #TODO: fix precision in these tests with relative tolerance
    nt = omni_normtest(x2)
    assert_almost_equal(nt, st_pv_R[:, 0], 12)

    st = stats.skewtest(x2)
    assert_almost_equal(st, st_pv_R[:, 1], 12)

    kt = stats.kurtosistest(x2)
    assert_almost_equal(kt, st_pv_R[:, 2], 12)
Example #3
def pd_reducer_ratios(pt, nan_policy='raise', **kwargs):
    '''
    Takes a pandas DataFrame with s_date, price and research columns.
    Top tip: use _rs_to_ptbl(recordset) to convert your query (with the
    fields s_date, s_type and s_val) into a valid pandas pivot table :)
    input = [
        {'date':'2017-01-01', 'x':12.1, 'y':22  },
        {'date':'2017-01-02', 'x':13.7, 'y':32.2},
        {'date':'2017-01-03', 'x':11.7, 'y':12.8},
        ]
    Returns a dict.
    '''
    # axis==1 is column and axis==0 is row for all pandas operations that take an axis
    fn = "pd_reducer_ratios"
    reducer_suite = __name__.split('.')[-1]

    try:
        from scipy import stats
        import pandas as pd
    except ImportError:
        raise ImportError('{} needs pandas and scipy'.format(fn))

    # Enforce our default in case it gets out of hand.
    if nan_policy not in nan_policies:
        raise AttributeError(
            'nan_policy {} not accepted - try omit, raise or propagate'.format(
                nan_policy)
        )

    output = {}

    output['prx_mean'] = pt.price.mean()
    # run each test once and unpack (statistic, p-value)
    prx_kurt = stats.kurtosistest(pt.price, nan_policy=nan_policy)
    output['prx_kurtosis_st'], output['prx_kurtosis_pv'] = prx_kurt
    prx_skew = stats.skewtest(pt.price, nan_policy=nan_policy)
    output['prx_skewtest_st'], output['prx_skewtest_pv'] = prx_skew
    output['prx_corr'] = pt.price.corr(pt.research)
    output['rsch_mean'] = pt.research.mean()
    rsch_kurt = stats.kurtosistest(pt.research, nan_policy=nan_policy)
    output['rsch_kurtosis_st'], output['rsch_kurtosis_pv'] = rsch_kurt
    rsch_skew = stats.skewtest(pt.research, nan_policy=nan_policy)
    output['rsch_skewtest_st'], output['rsch_skewtest_pv'] = rsch_skew
    output['rsch_corr'] = pt.research.corr(pt.price)

    kur_ratio = float(output['prx_kurtosis_st']) / output['rsch_kurtosis_st']

    output['kurtosis_ratios'] = kur_ratio
    return output
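A minimal usage sketch, assuming the module-level nan_policies constant that the check above relies on, and a toy pivot table in place of _rs_to_ptbl output:

import pandas as pd

nan_policies = ('omit', 'raise', 'propagate')  # assumed module-level constant

pt = pd.DataFrame({
    's_date': pd.date_range('2017-01-01', periods=30),
    'price': [10 + 0.7 * i for i in range(30)],
    'research': [5 + 1.1 * i for i in range(30)],
}).set_index('s_date')

ratios = pd_reducer_ratios(pt, nan_policy='omit')
print(ratios['kurtosis_ratios'])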
Example #4
def quick_perf_st(x, y, freq, rf):
    '''Comprehensive performance analysis after running the run_strategy() class method.'''
    dfx = x.dropna()
    dfy = y.dropna()
    stats = {
        'Statistics': [
            'P&L', 'CAGR', 'Annual_Vol', '%_Positive', 'Skew', 'Kurtosis',
            'Kurtosis PV', 'Downside_Vol', 'Worst', 'Sharpe_Ratio',
            'Sortino_Ratio', 'Information_Ratio', 'Max_Drawdown',
            'Worst_3_drawdown_avg', 'Max_DD_Duration'
        ],
        'Strategy': [
            profit_loss(dfx),
            pl_CAGR(dfx),
            an_vol(dfx, freq),
            positive_per(dfx),
            s.skew(dfx),
            s.kurtosis(dfx),
            s.kurtosistest(dfx)[1],
            an_down_vol(dfx, freq),
            dfx.min(),
            sharpe(dfx, freq, rf),
            sortino(dfx, freq, rf),
            info_ratio(dfx, dfy, freq),
            max_draw(dfx),
            worst3_draw_avg(dfx),
            max_dd_duration(dfx)
        ],
        'Benchmark': [
            profit_loss(dfy),
            pl_CAGR(dfy),
            an_vol(dfy, freq),
            positive_per(dfy),
            s.skew(dfy),
            s.kurtosis(dfy),
            s.kurtosistest(dfy)[1],
            an_down_vol(dfy, freq),
            dfy.min(),
            sharpe(dfy, freq, rf),
            sortino(dfy, freq, rf),
            info_ratio(dfy, dfy, freq),
            max_draw(dfy),
            worst3_draw_avg(dfy),
            max_dd_duration(dfy)
        ]
    }

    df = pd.DataFrame(stats)
    df.set_index('Statistics', inplace=True)
    return df.round(4)
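A hedged sketch of the calling convention; profit_loss, pl_CAGR, an_vol, and the other helpers (plus the s alias for scipy.stats) are assumed to be defined in the same module, so only the inputs are shown:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
idx = pd.date_range('2020-01-01', periods=252, freq='B')
strategy = pd.Series(rng.normal(5e-4, 0.010, size=252), index=idx)
benchmark = pd.Series(rng.normal(3e-4, 0.012, size=252), index=idx)

# freq = return periods per year, rf = annual risk-free rate
table = quick_perf_st(strategy, benchmark, freq=252, rf=0.0)
print(table)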
Example #5
def get_stats(a):
    """Computes mean, D_T or D_R, and standard error for a list.
    """
    a = np.asarray(a)
    n = a.shape[-1]
    keepdims = a.ndim > 1
    M = np.nanmean(a, -1, keepdims=keepdims)
    # c = a - M
    # variance = np.einsum('...j,...j->...', c, c)/n
    variance = np.nanvar(a, -1, keepdims=keepdims, ddof=1)
    SE = np.sqrt(variance) / sqrt(n - 1)
    SK = skew(a, -1, nan_policy='omit')
    KU = kurtosis(a, -1, nan_policy='omit')
    SK_t = skewtest(a, -1, nan_policy='omit')
    KU_t = kurtosistest(a, -1, nan_policy='omit')
    if keepdims:
        SK = SK[..., None]
        KU = KU[..., None]
    else:
        SK = float(SK)
        KU = float(KU)
    stat = {
        'mean': M,
        'var': variance,
        'std': SE,
        'skew': SK,
        'skew_test': float(SK_t.statistic),
        'kurt': KU,
        'kurt_test': float(KU_t.statistic)
    }
    print('\n'.join(['{:>10}: {: .4f}'.format(k, v) for k, v in stat.items()]))
    return stat
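The excerpt assumes numpy, math.sqrt, and the scipy.stats functions are already imported at module level; a sketch of those imports plus a 1-D call:

import numpy as np
from math import sqrt
from scipy.stats import skew, kurtosis, skewtest, kurtosistest

samples = np.random.default_rng(42).normal(size=500)
result = get_stats(samples)
print(result['kurt_test'])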
Example #6
 def test_maskedarray_input(self):
     # Add some masked values, test result doesn't change
     x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2
     xm = np.ma.array(np.r_[np.inf, x, 10], mask=np.r_[True, [False] * x.size, True])
     assert_allclose(mstats.normaltest(xm), stats.normaltest(x))
     assert_allclose(mstats.skewtest(xm), stats.skewtest(x))
     assert_allclose(mstats.kurtosistest(xm), stats.kurtosistest(x))
Example #7
def normality_check(feature_group, output_path):

    if feature_group.isEmpty():
        return False

    sk_test = stats.skewtest(feature_group.get_scores())
    kr_test = stats.kurtosistest(feature_group.get_scores())
    normaltest = stats.normaltest(feature_group.get_scores())

    temp = '''

			Normality Test P-Values
		------------------------------------
		 Kurtosis   |  {0}
		 Skewness   |  {1}
		 NormalTest |  {2}


	'''

    result = temp.format(kr_test[1], sk_test[1], normaltest[1])

    print(result)

    tests = (sk_test[1] > 0.05, kr_test[1] > 0.05, normaltest[1] > 0.05)

    return tests
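The snippet depends on a feature_group object exposing isEmpty() and get_scores(); a minimal, hypothetical stand-in to make it runnable:

import numpy as np
from scipy import stats

class FeatureGroup:
    # hypothetical stand-in for the real feature-group class
    def __init__(self, scores):
        self._scores = np.asarray(scores)

    def isEmpty(self):
        return self._scores.size == 0

    def get_scores(self):
        return self._scores

group = FeatureGroup(np.random.default_rng(7).normal(size=100))
print(normality_check(group, output_path=None))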
Example #8
def print_market_information(benchmark):
    print("RETURN BENCHMARK STATISTICS")
    print("---------------------------------------------")
    print("Mean of Daily  Log Returns %9.6f" % np.mean(benchmark['returns']))
    print("Std  of Daily  Log Returns %9.6f" % np.std(benchmark['returns']))
    print("Mean of Annua. Log Returns %9.6f" %
          (np.mean(benchmark['returns']) * 252))
    print("Std  of Annua. Log Returns %9.6f" %
          (np.std(benchmark['returns']) * math.sqrt(252)))
    print("---------------------------------------------")
    print("Skew of Sample Log Returns %9.6f" % scs.skew(benchmark['returns']))
    print("Skew Normal Test p-value   %9.6f" %
          scs.skewtest(benchmark['returns'])[1])
    print("---------------------------------------------")
    print("Kurt of Sample Log Returns %9.6f" %
          scs.kurtosis(benchmark['returns']))
    print("Kurt Normal Test p-value   %9.6f" %
          scs.kurtosistest(benchmark['returns'])[1])
    print("---------------------------------------------")
    print("Normal Test p-value        %9.6f" %
          scs.normaltest(benchmark['returns'])[1])
    print("---------------------------------------------")
    print("Anderson Normality Test:		   ")
    print(stats.anderson(benchmark['returns']))
    return
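The function only needs a mapping with a 'returns' entry; a short demo with simulated daily log returns, assuming the excerpt's aliases (np, math, scs for scipy.stats, stats for scipy.stats):

import math
import numpy as np
import scipy.stats as scs
from scipy import stats

prices = 100 * np.exp(np.cumsum(np.random.default_rng(3).normal(0.0, 0.01, 750)))
benchmark = {'returns': np.diff(np.log(prices))}
print_market_information(benchmark)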
Example #9
    def plot_indices_returns_distribution(self):
        fig, axes = plt.subplots(nrows=4,
                                 ncols=2,
                                 sharex=True,
                                 sharey=False,
                                 figsize=(12, 12))
        for i in range(4):
            for j in range(2):
                ts = self.__indices[2 * i + j].get_index_returns()

                # Statistics calculations
                ts = ts.dropna()
                mu, std = norm.fit(ts)
                kurt_pv = stats.kurtosistest(ts).pvalue

                axes[i, j].axis('off')
                # normed= was removed from matplotlib; density= is the replacement
                axes[i, j].hist(ts, bins=30, density=True, color='red')

                xmin, xmax = axes[i, j].get_xlim()
                x = np.linspace(xmin, xmax, 100)
                p = norm.pdf(x, mu, std)

                axes[i, j].fill_between(x, 0, p, color='grey', alpha=0.7)
                axes[i, j].plot(x, p, 'k', linewidth=2)

                title = "%s, mu=%.2f, sigma=%.2f, kurt_pv=%.2f" % (
                    self.__indices_names[2 * i + j], mu, std, kurt_pv)

                axes[i, j].set_title(title)
                plt.suptitle("Distribution of indices returns ")
Example #12
def check_lr_assumptions(df, data_fe):
    """
    prints multiple statistical tests and returns a dataframe containing residuals
    
    arguments
    ---------
    df: dataframe of truth and prediction columns labeled "truth" and "pred"
    data_fe: prepared features for prediction
    
    return
    ------
    dataframe
    
    """
    
    df['residuals'] = df['pred'] - df['truth']
    
    print("mean of residuals:", df['residuals'].mean())
    print("variance of residuals:", df['residuals'].var())
    print("skewness of residuals:", stats.skew(df.residuals))
    print("kurtosis of residuals:", stats.kurtosis(df.residuals))
    print("kurtosis test of residuals:", stats.kurtosistest(df.residuals))
    print("normal test of residuals (scipy stats):", stats.normaltest(df.residuals))
    print("Jarque Bera test for normality of residuals:", stats.jarque_bera(df.residuals))
    print("Breusch Pagan test for heteroscedasticity:", het_breuschpagan(df.residuals, data_fe))

    return df
Example #13
def print_statistics(data):
    print("RETURN SAMPLE STATISTICS")
    print("---------------------------------------------")
    print("Mean of Daily  Log Returns %9.6f" % np.mean(data['returns']))
    print("Std  of Daily  Log Returns %9.6f" % np.std(data['returns']))
    print("Mean of Annua. Log Returns %9.6f" %
          (np.mean(data['returns']) * 252))
    print("Std  of Annua. Log Returns %9.6f" % \
          (np.std(data['returns']) * math.sqrt(252)))

    print("---------------------------------------------")
    print("Skew of Sample Log Returns %9.6f" % scs.skew(data['returns']))
    print("Skew Normal Test p-value   %9.6f" %
          scs.skewtest(data['returns'])[1])
    print("---------------------------------------------")
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(data['returns']))
    print("Kurt Normal Test p-value   %9.6f" % \
          scs.kurtosistest(data['returns'])[1])

    print("Normal Test p-value        %9.6f" % \
          scs.normaltest(data['returns'])[1])
    print("---------------------------------------------")

    print("Realized Volatility        %9.6f" % data['rea_vol'].iloc[-1])
    print("Realized Variance          %9.6f" % data['rea_var'].iloc[-1])
Example #14
def ica_experiment(X, name, dims, max_iter=5000, tol=1e-04):
    """Run ICA on specified dataset and saves mean kurtosis results as CSV
    file.

    Args:
        X (Numpy.Array): Attributes.
        name (str): Dataset name.
        dims (list(int)): List of component number values.

    """
    ica = FastICA(random_state=0, max_iter=max_iter, tol=tol)
    kurt = []
    loss = []

    X = StandardScaler().fit_transform(X)
    for dim in dims:
        print(dim)
        ica.set_params(n_components=dim)
        tmp = ica.fit_transform(X)
        # mean z-score from the kurtosis test across the projected components
        kurt.append(kurtosistest(tmp).statistic.mean())
        proj = ica.inverse_transform(tmp)
        loss.append(((X - proj)**2).mean())

    res = pd.DataFrame({"kurtosis": kurt, "loss": loss})

    # save results as CSV
    resdir = 'results/ICA'
    resfile = get_abspath('{}_kurtosis.csv'.format(name), resdir)
    res.to_csv(resfile, index_label='n')
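get_abspath and the results/ICA output directory come from the surrounding project; under those assumptions, a run over a small grid of component counts might look like:

from sklearn.datasets import load_digits

X, _ = load_digits(return_X_y=True)
ica_experiment(X, 'digits', dims=[2, 5, 10, 20])
# writes results/ICA/digits_kurtosis.csv with columns n, kurtosis, loss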
Example #15
def test_normalitytests():
    # numbers verified with R: dagoTest in package fBasics
    st_normal, st_skew, st_kurt = (3.92371918, 1.98078826, -0.01403734)
    pv_normal, pv_skew, pv_kurt = (0.14059673, 0.04761502,  0.98880019)
    x = np.array((-2,-1,0,1,2,3)*4)**2
    yield assert_array_almost_equal, stats.normaltest(x), (st_normal, pv_normal)
    yield assert_array_almost_equal, stats.skewtest(x), (st_skew, pv_skew)
    yield assert_array_almost_equal, stats.kurtosistest(x), (st_kurt, pv_kurt)
Example #16
def doStatTests(X, labels, ml):
    pca = PCA(n_components=1)
    pca.fit(X[labels == ml])
    XpcaML = pca.transform(X[labels == ml])
    labelsOut = labels.copy()  # copy so the caller's labels are not mutated mid-loop
    normXpcaML = (XpcaML - n.mean(XpcaML)) / n.std(XpcaML)
    #maxKurt = kurtosistest(normXpcaML)[1]
    #maxSkew = skewtest(normXpcaML)[1]
    for i in n.unique(labels):
        if len(X[labels == i]) == 0:
            continue
        else:
            Xpca = pca.transform(X[labels == i])
            Xpca = (Xpca - n.mean(Xpca)) / n.std(Xpca)
            if len(Xpca) < 9:
                labelsOut[labels == i] = -1
                continue
            if False:  # disabled diagnostic plotting block
                if len(Xpca) < 9:
                    labelsOut[labels == i] = -1
                    continue
                pl.figure()
                if skewtest(Xpca)[1] > 0.5 or kurtosistest(Xpca)[1] > 0.5:
                    tag = 'RFI'
                else:
                    tag = 'Not RFI'
                sk = skewtest(Xpca)[1]
                kt = kurtosistest(Xpca)[1]
                sk1 = skewtest(XpcaML)[1]
                kt1 = kurtosistest(XpcaML)[1]
                pl.subplot(211)
                pl.hist(Xpca, 50, label=tag + ':' + str(sk) + ':' + str(kt))
                pl.legend()
                pl.subplot(212)
                pl.hist(XpcaML,
                        50,
                        label=tag + ':' + str(sk1) + ':' + str(kt1))
                pl.legend()
                pl.show()
            if i == ml:
                continue
            if skewtest(Xpca)[1] > 0.01:  #or kurtosistest(Xpca)[1] > 1.:
                labelsOut[labels == i] = -1
            #else:
            #    labelsOut[labels==i] = ml
    return labelsOut
Example #18
def create_scipy_features(base_features, sentinel):
    r"""Calculate the skew, kurtosis, and other statistical features
    for each row.

    Parameters
    ----------
    base_features : numpy array
        The feature dataframe.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    sp_features : numpy array
        The calculated SciPy features.
    sp_fnames : list
        The SciPy feature names.

    """

    logger.info("Creating SciPy Features")

    # Generate scipy features

    logger.info("SciPy Feature: geometric mean")
    row_gmean = sps.gmean(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis")
    row_kurtosis = sps.kurtosis(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis test")
    row_ktest, pvalue = sps.kurtosistest(base_features, axis=1)
    logger.info("SciPy Feature: normal test")
    row_normal, pvalue = sps.normaltest(base_features, axis=1)
    logger.info("SciPy Feature: skew")
    row_skew = sps.skew(base_features, axis=1)
    logger.info("SciPy Feature: skew test")
    row_stest, pvalue = sps.skewtest(base_features, axis=1)
    logger.info("SciPy Feature: variation")
    row_var = sps.variation(base_features, axis=1)
    logger.info("SciPy Feature: signal-to-noise ratio")
    row_stn = sps.signaltonoise(base_features, axis=1)
    logger.info("SciPy Feature: standard error of mean")
    row_sem = sps.sem(base_features, axis=1)

    sp_features = np.column_stack(
        (row_gmean, row_kurtosis, row_ktest, row_normal, row_skew, row_stest,
         row_var, row_stn, row_sem))
    sp_features = impute_values(sp_features, 'float64', sentinel)
    sp_features = StandardScaler().fit_transform(sp_features)

    # Return new SciPy features

    logger.info("SciPy Feature Count : %d", sp_features.shape[1])
    sp_fnames = [
        'sp_geometric_mean', 'sp_kurtosis', 'sp_kurtosis_test',
        'sp_normal_test', 'sp_skew', 'sp_skew_test', 'sp_variation',
        'sp_signal_to_noise', 'sp_standard_error_of_mean'
    ]
    return sp_features, sp_fnames
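Since scipy.stats.signaltonoise was removed in SciPy 1.0, the call above fails on current SciPy; a minimal drop-in sketch with the same mean/std definition the old function used:

import numpy as np

def signaltonoise(a, axis=0, ddof=0):
    # mean divided by standard deviation, matching the removed scipy.stats.signaltonoise
    a = np.asanyarray(a)
    m = a.mean(axis=axis)
    sd = a.std(axis=axis, ddof=ddof)
    return np.where(sd == 0, 0, m / sd)

# drop-in for the sps.signaltonoise call above:
# row_stn = signaltonoise(base_features, axis=1)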
Example #21
def normality_stats(arr):
    """
    Summary statistics: skewness, kurtosis, normality test, and p-values
        eg:
                input:

                2014-07-25    223.57
                2014-07-28    224.82
                2014-07-29    225.01
                               ...
                2016-07-22    222.27
                2016-07-25    230.01
                2016-07-26    225.93

                output:

                array skew = -0.282635248604699
                array skew p-value = 0.009884539532576725
                array kurt = 0.009313464006726946
                array kurt p-value = 0.8403947352953821
                array norm = NormaltestResult(statistic=6.6961445106692237, pvalue=0.035152053009441256)
                array norm p-value = 0.035152053009441256

                input:

                            tsla	bidu	noah	sfun	goog	vips	aapl
                2014-07-25	223.57	226.50	15.32	12.110	589.02	21.349	97.67
                2014-07-28	224.82	225.80	16.13	12.450	590.60	21.548	99.02
                2014-07-29	225.01	220.00	16.75	12.220	585.61	21.190	98.38
                ...	...	...	...	...	...	...	...
                2016-07-22	222.27	160.88	25.50	4.850	742.74	13.510	98.66
                2016-07-25	230.01	160.25	25.57	4.790	739.77	13.390	97.34
                2016-07-26	225.93	163.09	24.75	4.945	740.92	13.655	97.76

                output:

                array skew = [-0.2826 -0.2544  0.1456  1.0322  0.2095  0.095   0.1719]
                array skew p-value = [ 0.0099  0.0198  0.1779  0.      0.0539  0.3781  0.1124]
                array kurt = [ 0.0093 -0.8414 -0.4205  0.4802 -1.547  -0.9203 -1.2104]
                array kurt p-value = [ 0.8404  0.      0.0201  0.0461  1.      0.      0.    ]
                array norm = NormaltestResult(statistic=array([   6.6961,   52.85  ,    7.2163,   69.0119,    3.7161,
                69.3468, 347.229 ]), pvalue=array([ 0.0352,  0.    ,  0.0271,  0.    ,  0.156 ,  0.    ,  0.    ]))
                array norm p-value = [ 0.0352  0.      0.0271  0.      0.156   0.      0.    ]

    :param arr: pd.DataFrame or pd.Series or Iterable
    """
    log_func = logging.info if ABuEnv.g_is_ipython else print

    log_func('array skew = {}'.format(scs.skew(arr)))
    log_func('array skew p-value = {}'.format(scs.skewtest(arr)[1]))

    log_func('array kurt = {}'.format(scs.kurtosis(arr)))
    log_func('array kurt p-value = {}'.format(scs.kurtosistest(arr)[1]))

    log_func('array norm = {}'.format(scs.normaltest(arr)))
    log_func('array norm p-value = {}'.format(scs.normaltest(arr)[1]))
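A quick demonstration with a simulated price series (ABuEnv and the scs/logging imports are assumed from the surrounding abupy module):

import numpy as np
import pandas as pd

prices = pd.Series(np.random.default_rng(5).normal(100.0, 5.0, size=500))
normality_stats(prices)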
Example #22
def BasicSummary1(series):
    basiclist = [
        stats.skew(series),
        stats.skewtest(series)[1],
        stats.kurtosis(series),
        stats.kurtosistest(series)[1],
        stats.variation(series)
    ]
    return np.round(pd.Series(basiclist), decimals=6)
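For instance, summarizing 200 normal draws returns the five rounded values in order (skew, skew p-value, kurtosis, kurtosis p-value, variation); stats, np, and pd are assumed imported as in the excerpt:

import numpy as np
import pandas as pd
from scipy import stats

series = pd.Series(np.random.default_rng(11).normal(10.0, 2.0, size=200))
print(BasicSummary1(series))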
Example #23
def noise(fname, x0 = 100, y0 = 100, maxrad = 30):
    from astroML.plotting import hist
    hdulist = pf.open(fname)
    im = hdulist[0].data
    #print np.mean(im), np.min(im), np.max(im)
    #print im[95:105, 95:105]
    # x0, y0 = 100, 100
    xi, yi = np.indices(im.shape)
    R = np.sqrt( (yi - int(y0))**2. + (xi - int(x0))**2. )
    phot_a = np.zeros(maxrad + 1)
    phot_a[0] = 0
    
    bmasked = im * ((R > maxrad) * (R < maxrad + 20.))
    bdata = bmasked.flatten()
    #print bdata[bdata != 0.]
    #print len(bdata[bdata != 0.])
    #print len(bdata)
    
    plt.subplot(3, 1, 1)
    hist(bdata[bdata != 0.], bins = 'blocks')
    plt.xlabel('Flux')
    plt.ylabel('(Bayesian Blocks)')
    plt.title('Noise')
    #plt.show()
    
    plt.subplot(3, 1, 2)
    hist(bdata[bdata != 0.], bins = 50)
    plt.xlabel('Flux')
    plt.ylabel('(50 bins)')
    #plt.title('Noise (50 bins)')
    #plt.show()
    
    plt.subplot(3, 1, 3)
    hist(bdata[bdata != 0.], bins = 'knuth')
    plt.xlabel('Flux')
    plt.ylabel('(Knuth\'s Rule)')
    #plt.title('Noise (Knuth\'s Rule)')
    plt.show()
    
    A2, crit, sig = anderson(bdata[bdata != 0.], dist='norm')
    print('A-D Statistic:', A2)
    print(' CVs \t  Sig.')
    print(np.vstack((crit, sig)).T)

    normality = normaltest(bdata[bdata != 0.])
    print('Normality:', normality)

    skewness = skewtest(bdata[bdata != 0.])
    print('Skewness:', skewness)

    kurtosis = kurtosistest(bdata[bdata != 0.])
    print('Kurtosis:', kurtosis)

    print('Mean:', np.mean(bdata[bdata != 0.]))
    print('Median:', np.median(bdata[bdata != 0.]))
Example #24
def trainingset_preprocessing(Data, MinMaxInfo, print_info=False):
    '''
       This function prepares the training set for pre-processing.
       Input: 
             1) Data: pandas DataFrame with all covariates ready for preprocessing.
             
             2) MinMaxInfo: dictionary with {'covariate name': {'min': [] or value, 'max': [] or value}}
             
             3) print_info: boolean - whether to show basic info about training set (True) or not (False).
    '''

    ## Create local copy
    Data_local = Data.copy()

    ## Datasets from input data
    Columns_features = [
        'age', 'sex', 'WBC/uL', 'Mono/uL', 'Linfo/uL', 'T CD4 %', 'T CD4/uL',
        'T CD8 %', 'T CD8/uL', 'CD4/CD8', 'NK %', 'NK/uL', 'B CD19 %',
        '% T CD4 HLADR POS', '% T CD8 HLADR POS', 'T NK-like %',
        'LRTE % dei CD4', 'Mono DR %', 'MONO DR IFI'
    ]
    # Excluded features: 'T CD3 %', 'T CD3/uL', 'T CD3/HLADR %', 'T CD3 HLA DR/uL',
    #                    'B CD19/uL', 'LRTE/uL', 'T CD8 HLADR %', 'T CD4 HLADR %'
    Columns_target = ['death', 'OS_days']
    Columns_dates = ['hospitalization_date', 'death_date', 'birth_date']
    #
    Data_X = Data_local.loc[:, Columns_features].astype(float)
    Data_Y = Data_local.loc[:, Columns_target].astype(float)
    Data_dates = Data_local.loc[:, Columns_dates].astype(float)
    Data_ID = Data_local.loc[:, ['ID']]
    Data_Age = Data_local.loc[:, ['age']]

    ## Apply x->log(1+x) where kurtosis is above threshold
    kurtosis_threshold = 6
    skew_threshold = -1.5
    X_kurtosis = kurtosistest(Data_X.values, axis=0,
                              nan_policy='omit').statistic
    X_skew = Data_X.skew(axis=0)
    Features_LogProcessing = {}
    for i, element in enumerate(Columns_features):
        Features_LogProcessing[element] = {'Reflection': False, 'Log': False}
        if (X_kurtosis[i] > kurtosis_threshold and element != 'sex'):
            Features_LogProcessing[element]['Log'] = True
            if (X_skew[element] < skew_threshold):
                Features_LogProcessing[element]['Reflection'] = True
                max_val = MinMaxInfo[element]['max']
                Data_X.loc[:, element] = max_val - Data_X.loc[:, element].values
            Data_X.loc[:, element] = np.log(1 + Data_X.loc[:, element].values)

    ## Return preprocessed datasets
    return Data_X, Data_Y, Data_ID, Data_Age, Features_LogProcessing


# ---- # ---- # ---- # ---- # ---- # ---- # ---- # ---- #
Example #25
def normality_tests(arr):
    '''
    Tests the given data set for normality.

    Parameters
    ==========
    arr: ndarray
        object to generate statistics on
    '''
    print("Skew of data set %14.3f" % scs.skew(arr))
    print("Skew test p-value %14.3f" % scs.skewtest(arr)[1])
    print("Kurt of data set %14.3f" % scs.kurtosis(arr))
    print("Kurt test p-value %14.3f" % scs.kurtosistest(arr)[1])
    print("Norm test p-value %14.3f" % scs.normaltest(arr)[1])
Example #26
 def histo(self, s, x, y, bins=20):  # plot a histogram of a single variable
     df = self.data[s]
     # skewtest/kurtosistest return (z-statistic, p-value)
     skew_z, skew_p = stats.skewtest(df)
     kurt_z, kurt_p = stats.kurtosistest(df)
     sns.set(style="darkgrid")
     pc = sns.distplot(df, kde=True, bins=bins)
     plt.text(x=x,
              y=y,
              s='skew z=%.2f\nkurt z=%.2f' % (skew_z, kurt_z))
     name = 'the Histogram of {:s}'.format(s.capitalize())
     plt.suptitle(name)
     return pc
Example #27
    def test_vs_nonmasked(self):
        x = np.array((-2, -1, 0, 1, 2, 3) * 4) ** 2
        assert_array_almost_equal(mstats.normaltest(x), stats.normaltest(x))
        assert_array_almost_equal(mstats.skewtest(x), stats.skewtest(x))
        assert_array_almost_equal(mstats.kurtosistest(x), stats.kurtosistest(x))

        funcs = [stats.normaltest, stats.skewtest, stats.kurtosistest]
        mfuncs = [mstats.normaltest, mstats.skewtest, mstats.kurtosistest]
        x = [1, 2, 3, 4]
        for func, mfunc in zip(funcs, mfuncs):
            assert_raises(ValueError, func, x)
            assert_raises(ValueError, mfunc, x)
Example #28
def normality_test(arr):
    '''
    Prints skewness, kurtosis, and normality test p-values for the given data set

    :param arr: obj to generate statistics on
    '''

    print("Skew of data set  %14.3f" % scs.skew(arr))
    print("Skew test p-value %14.3f" % scs.skewtest(arr)[1])
    print("Kurt of sata set  %14.3f" % scs.kurtosis(arr))
    print("Kurt test p-value %14.3f" % scs.kurtosistest(arr)[1])
    print("Norm test p-value %14.3f" % scs.normaltest(arr)[1])
Example #29
def print_stock_statistics(data):
    # NOTE: this excerpt relies on module-level `returns`, `rf`, `n`, and `dollar_vol`
    print("RETURN SAMPLE STATISTICS")
    print("---------------------------------------------")
    print("Mean of Daily  Log Returns %9.6f" % np.mean(returns))
    print("Std  of Daily  Log Returns %9.6f" % np.std(returns))
    print("Mean of Annua. Log Returns %9.6f" % (np.mean(returns) * 252))
    print("Std  of Annua. Log Returns %9.6f" %
          (np.std(returns) * math.sqrt(252)))
    print("---------------------------------------------")
    print("Skew of Sample Log Returns %9.6f" % scs.skew(returns))
    print("Skew Normal Test p-value   %9.6f" % scs.skewtest(returns)[1])
    print("---------------------------------------------")
    print("Kurt of Sample Log Returns %9.6f" % scs.kurtosis(returns))
    print("Kurt Normal Test p-value   %9.6f" % scs.kurtosistest(returns)[1])
    print("---------------------------------------------")
    print("Normal Test p-value        %9.6f" % scs.normaltest(returns)[1])
    print("---------------------------------------------")
    print("Realized Volatility        %9.6f" % data['rea_vol'].iloc[-1])
    print("Realized Variance          %9.6f" % data['rea_var'].iloc[-1])
    print("---------------------------------------------")
    print("Anderson Normality Test:		   ")
    print(stats.anderson(returns))
    print("---------------------------------------------")
    print("Shapiro_Wilk Test: 		      			")
    print(stats.shapiro(returns))
    print("Sharpe Ratio of Daily Returns:  			")
    print("{0:.8f}".format(np.mean(returns) / np.std(returns)))
    print("Trading Sharpe for Daily:      			")
    print("{0:.8f}".format(
        (n * 6.5) *
        (np.mean(returns) - rf // np.std(returns) * np.sqrt(n * 6.5))))
    print("Sharpe of Annua. Returns w/ days:      	")
    print("{0:.8f}".format(
        (252) * (np.mean(returns) - rf // np.std(returns) * np.sqrt(252))))
    print("Sharpe of Annua. Returns w/ days & hours:")
    print("{0:.8f}".format(
        (252 * 6.5) *
        (np.mean(returns) - rf // np.std(returns) * np.sqrt(252 * 6.5))))
    print("---------------------------------------------")
    print("Amihud Illiquidity 		  %9.6g" %
          np.mean(np.divide(abs(returns), dollar_vol[1:])))
    print("---------------------------------------------")
    print("Kelly Formula:			       ")
    print("{0:.8f}".format(np.mean(returns) - rf // (np.std(returns))**2))
    print("Compounded Levered Return: 	   ")
    print("{0:.8f}".format(rf + (
        ((252) *
         (np.mean(returns) - rf / np.std(returns) * np.sqrt(252)))**2) // 2))
    print("Compounded Unlevered Return:    ")
    print("{0:.8f}".format(((np.mean(returns)) * 252) -
                           (((np.std(returns)) * np.sqrt(252))**2) // 2))
    return
Example #30
def normality_test(data):
    """ Tests for normality distribution of given data set
    (skew, skew-p, kurtosis, kurtosis-p, normality-p)
    data: ndarray object to generate statistics on 
    """

    print()
    print("Skew of data set %14.3f" % scs.skew(data))
    print("Skew test p-value %14.3f" % scs.skewtest(data)[1])
    print("Kurtosis of data set %14.3f" % scs.kurtosis(data))
    print("Kurtosis test p-value %14.3f" % scs.kurtosistest(data)[1])
    print("Normality test p-value %14.3f" % scs.normaltest(data)[1])
    print()
Example #31
def normality_test(array):
    '''
    Runs normality checks on the given data set,
    combining three statistical tests:
    skewness test (skewtest): is it close enough to 0?
    kurtosis test (kurtosistest): is it close enough to 0?
    normality test (normaltest)
    '''
    print('Skew of data set %15.3f' % scs.skew(array))
    print('Skew test p-value %14.3f' % scs.skewtest(array)[1])
    print('Kurt of data set %15.3f' % scs.kurtosis(array))
    print('Kurt test p-value %14.3f' % scs.kurtosistest(array)[1])
    print('Norm test p-value %14.3f' % scs.normaltest(array)[1])
Example #32
def normality_test(arr):
    '''Tests the given data set for normality.

    Parameters
    ==========
    arr: ndarray
        object to generate statistics on
    '''

    print('Skew of data set %14.3f' % scs.skew(arr))
    print('Skew test p value %14.3f' % scs.skewtest(arr)[1])
    print('Kurt of data set %14.3f' % scs.kurtosis(arr))
    print('Kurt test p value %14.3f' % scs.kurtosistest(arr)[1])
    print('Normal test p value %14.3f' % scs.normaltest(arr)[1])
Example #33
def kurtosisstats(timecourse):
    """

    Parameters
    ----------
    timecourse: array
        The timecourse to test

    Returns
    -------
    tuple
        (kurtosis, z-statistic, p-value) for the timecourse

    """
    testres = kurtosistest(timecourse)
    return kurtosis(timecourse), testres[0], testres[1]
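A one-line usage sketch (n >= 20 keeps kurtosistest inside its stated validity range):

import numpy as np
from scipy.stats import kurtosis, kurtosistest

k, z, p = kurtosisstats(np.random.default_rng(0).normal(size=200))
print(k, z, p)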
Example #35
    def normalityTests(self):

        # Convert the close prices into log returns
        logReturn = np.log(self.data.close / self.data.close.shift(1))

        # The shift command will add a NaN to the beginning of the data, this will need to be removed.
        logReturn = logReturn.dropna()

        print('Skew of data set  %14.3f' % scs.skew(logReturn))
        print('Skew test p-value %14.3f' % scs.skewtest(logReturn)[1])
        print('Kurt of data set  %14.3f' % scs.kurtosis(logReturn))
        print('Kurt test p-value %14.3f' % scs.kurtosistest(logReturn)[1])
        print('Norm test p-value %14.3f' % scs.normaltest(logReturn)[1])
Example #37
def get_normality(data: pd.DataFrame) -> pd.DataFrame:
    """
    Look at the distribution of returns and generate statistics on the relation to the normal curve.
    This function calculates skew and kurtosis (the third and fourth moments) and performs
    Jarque-Bera, Shapiro-Wilk, and Kolmogorov-Smirnov tests to determine if data is normally distributed.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe of targeted data

    Returns
    -------
    pd.DataFrame
        Dataframe containing statistics of normality
    """
    # Kurtosis
    # Measures height and sharpness of the central peak relative to that of a standard bell curve
    k, kpval = stats.kurtosistest(data)

    # Skewness
    # Measure of the asymmetry of the probability distribution of a random variable about its mean
    s, spval = stats.skewtest(data)

    # Jarque-Bera goodness of fit test on sample data
    # Tests if the sample data has the skewness and kurtosis matching a normal distribution
    jb, jbpval = stats.jarque_bera(data)

    # Shapiro
    # The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution.
    sh, shpval = stats.shapiro(data)

    # Kolmogorov-Smirnov
    # The one-sample test compares the underlying distribution F(x) of a sample against a given distribution G(x).
    # Comparing to normal here.
    ks, kspval = stats.kstest(data, "norm")

    l_statistic = [k, s, jb, sh, ks]
    l_pvalue = [kpval, spval, jbpval, shpval, kspval]

    return pd.DataFrame(
        [l_statistic, l_pvalue],
        columns=[
            "Kurtosis",
            "Skewness",
            "Jarque-Bera",
            "Shapiro-Wilk",
            "Kolmogorov-Smirnov",
        ],
        index=["Statistic", "p-value"],
    )
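A short sketch with simulated fat-tailed returns, where the kurtosis and Jarque-Bera p-values should come out small:

import numpy as np
import pandas as pd
from scipy import stats

returns = pd.Series(np.random.default_rng(1).standard_t(df=4, size=500))
print(get_normality(returns))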
Example #38
    def calculate(self):

        if not self.recalc:
            raise ValueError("Please set recalc to True")

        data = np.array(self.dataset() or [])
        self.obs_number = len(data)
        self.recalc = False

        if self.obs_number == 0:
            return

        self.avg = np.average(data)
        self.q25, self.median, self.q75 = np.percentile(data, (25, 50, 75))
        modedata = stats.mode(data)
        if modedata[1][0] > 1:
            self.mode = modedata[0][0]
        self.min = np.min(data)
        self.max = np.max(data)
        self.sum = np.sum(data)
        self.Q = extra_stats.Q(data)
        self.TRI = extra_stats.TRI(data)
        self.MID = extra_stats.MID(data)

        self.var = np.var(data)
        self.std = np.std(data)
        self.range = extra_stats.range(data)
        self.MD = extra_stats.MD(data)
        self.MeD = extra_stats.MeD(data)

        self.variation = stats.variation(data)
        self.varQ = extra_stats.varQ(data)

        if self.obs_number > 1:
            self.Sp_pearson = extra_stats.Sp_pearson(data)
            self.H1_yule = extra_stats.H1_yule(data)
            self.H3_kelly = extra_stats.H3_kelly(data)
        else:
            self.Sp_pearson = None
            self.H1_yule = None
            self.H3_kelly = None

        self.kurtosis = stats.kurtosis(data)

        # scipy's kurtosistest is only valid for n >= 20 (it warns below that)
        if self.obs_number >= 20:
            self.kurtosis_test_z_score, self.kurtosis_test_p_value = (
                stats.kurtosistest(data))
        else:
            self.kurtosis_test_z_score, self.kurtosis_test_p_value = None, None
Example #39
def normality_tests(arr):
    '''
    Tests the given data set for normality.
    normality_tests combines three different statistical tests:
    skewness test (skewtest):
        checks whether the skewness of the sample is "normal" (i.e. close enough to 0)
    kurtosis test (kurtosistest):
        likewise, checks whether the kurtosis of the sample is "normal" (close enough to 0)
    normality test (normaltest):
        combines the other two tests into one normality check
    '''
    print("Skew of data set  %14.3f" % scs.skew(arr))
    print("Skew test p-value %14.3f" % scs.skewtest(arr)[1])
    print("Kurt of data set  %14.3f" % scs.kurtosis(arr))
    print("Kurt test p-value %14.3f" % scs.kurtosistest(arr)[1])
    print("Norm test p-value %14.3f" % scs.normaltest(arr)[1])
Example #40
def print_statistics(data):
    print "RETURN SAMPLE STATISTICS"
    print "---------------------------------------------"
    print "Mean of Daily  Log Returns %9.6f" % np.mean(data['returns'])
    print "Std  of Daily  Log Returns %9.6f" % np.std(data['returns'])
    print "Mean of Annua. Log Returns %9.6f" % (np.mean(data['returns']) * 252)
    print "Std  of Annua. Log Returns %9.6f" % \
                (np.std(data['returns']) * math.sqrt(252))
    print "---------------------------------------------"
    print "Skew of Sample Log Returns %9.6f" % scs.skew(data['returns'])
    print "Skew Normal Test p-value   %9.6f" % scs.skewtest(data['returns'])[1]
    print "---------------------------------------------"
    print "Kurt of Sample Log Returns %9.6f" % scs.kurtosis(data['returns'])
    print "Kurt Normal Test p-value   %9.6f" % \
                scs.kurtosistest(data['returns'])[1]
    print "---------------------------------------------"
    print "Normal Test p-value        %9.6f" % \
                scs.normaltest(data['returns'])[1]
    print "---------------------------------------------"
    print "Realized Volatility        %9.6f" % data['rea_vol'].iloc[-1]
    print "Realized Variance          %9.6f" % data['rea_var'].iloc[-1]
Example #41
def normality_check(feature_group, group_name):

    if feature_group.isEmpty():
        return False

    sk_test = stats.skewtest(feature_group.get_scores())
    kr_test = stats.kurtosistest(feature_group.get_scores())
    normaltest = stats.normaltest(feature_group.get_scores())

    temp = '''
  Normality Test P-Values[{}]
------------------------------------
Kurtosis   |  {}
Skewness   |  {}
NormalTest |  {}
    '''

    result = temp.format(group_name, kr_test[1], sk_test[1], normaltest[1])
    tests = (sk_test[1] > 0.05, kr_test[1] > 0.05, normaltest[1] > 0.05)

    return result, tests
Example #42
    # np.corrcoef returns Pearson r values (a correlation, not covariance, matrix)
    corr_matrix = np.corrcoef(x.flat, y.flat)
    r2_xx = corr_matrix[0, 0]
    r2_xy = corr_matrix[0, 1]
    r2_yx = corr_matrix[1, 0]
    r2_yy = corr_matrix[1, 1]

    skew_x = stats.skew(x.flat, 0, bias=False)
    skew_y = stats.skew(y.flat, 0, bias=False)

    skew_xz, skew_xp = stats.skewtest(x.flat, 0)
    skew_yz, skew_yp = stats.skewtest(y.flat, 0)

    kurtosis_x = stats.kurtosis(x.flat, 0, bias=False)
    kurtosis_y = stats.kurtosis(y.flat, 0, bias=False)

    kurtosis_xz, kurtosis_xp = stats.kurtosistest(x.flat, 0)
    kurtosis_yz, kurtosis_yp = stats.kurtosistest(y.flat, 0)

    results = collections.OrderedDict()

    results["x_path"] = os.path.basename(opts.inputA)
    results["y_path"] = os.path.basename(opts.inputB)

    results["x"] = x
    results["y"] = y

    results["x_avg"] = x_avg
    results["y_avg"] = y_avg

    results["x_std"] = x_std
    results["y_std"] = y_std
Example #43
            #datap[j,i] = data[j,i] - biasp[j,i]

    return biasf, dataf  # ,biasp,datap
if __name__ == "__main__":

    files = ['big0.csv', 'big1.csv', 'big2.csv', 'big3.csv']
    [observations, temp] = parse.separate(files)
    for obs in observations:
        [biasf, dataf] = removeBias(obs)

        plt.close('all')

        for i in range(obs.shape[0]):
            biasrate = np.array(np.diff(biasf[i, :]))

            print "\nTest for biasrate is ", stats.kurtosistest(biasrate)
            print "Test for white noise is ", stats.kurtosistest(dataf[i, :])
            print "Test for bias is ", stats.kurtosistest(biasf[i, :])
            print "Test for observation is ",
            stats.kurtosistest(obs[i, :]), "\n"

            plt.figure(i + 1)
            plt.clf()

            plt.subplot2grid((2, 2), (0, 0))
            plt.hist(obs[i, :], color='r')

            plt.subplot2grid((2, 2), (0, 1))
            plt.hist(dataf[i, :], color='g')

            plt.subplot2grid((2, 2), (1, 0))
Example #44
 def kurtosistp(self, x):
     return kurtosistest(x)[1]
Example #45
# The scipy.io package can load and save MATLAB and Octave matrices and arrays from Python.
# loadmat loads a .mat file; savemat saves a dict of arrays keyed by variable name to a .mat file.
a = np.arange(7)
io.savemat("a.mat", {"array": a})

print("Analyzing random numbers")
from scipy import stats
import matplotlib.pyplot as plt
# Draw random numbers from a normal distribution with scipy.stats.
generated = stats.norm.rvs(size=900)
# Fit a normal distribution to the generated data to get the mean and standard deviation.
print("Mean", "Std", stats.norm.fit(generated))
# Skewness describes how asymmetric a probability distribution is.
print("Skewtest", "pvalue", stats.skewtest(generated))
# Kurtosis describes how peaked the probability distribution curve is.
print("Kurtosistest", "pvalue", stats.kurtosistest(generated))
# A normality test checks how closely a data set follows a normal distribution.
print("Normaltest", "pvalue", stats.normaltest(generated))
# Value at a given percentile of the data.
print("95 percentile", stats.scoreatpercentile(generated, 95))
# Percentile corresponding to the value 1.
print("Percentile at 1", stats.percentileofscore(generated, 1))
plt.hist(generated)
# plt.show()

print("Comparing log returns")
# from matplotlib.finance import quotes_historical_yahoo
# from datetime import date
# from statsmodels.stats.stattools import jarque_bera
# def get_close(symbol):
# 	today = date.today()
Example #46
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default=False,
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for value in z:
                cols.append(value)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
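        # The trimmed statistics below take optional lower/upper limits (mf, nf);
        # a value of 0 means no limit on that side.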
        elif test_id.strip() == "tmean":
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf == 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf == 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for value in s:
                cols.append(value)
        elif test_id.strip() == "relfreq":
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for value in rel:
                cols.append(value)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for value in o:
                cols.append(value)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for value in o:
                cols.append(value)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for value in t1:
                cols.append(value)
        elif test_id.strip() == "histogram":
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for value in h2:
                cols.append(value)
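        # Two-sample tests: both selected column sets are passed as separate samples.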
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for value in t:
                cols.append(value)
            for value in prob:
                cols.append(value)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for value in z:
                cols.append(value)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)  # three-value unpack assumes reta is truthy
            cols.append(W)
            cols.append(p_value)
            for value in a:
                cols.append(value)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
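        # In the branches below, sample2 == 1 means a second column set was
        # selected; otherwise the one-sample form of the call is used.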
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
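        # k-sample tests: b_samples collects all selected column sets and is
        # unpacked as separate positional samples.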
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for row in ob:
                elements = ",".join(map(str, row))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(*b_samples, center=args.center, proportiontocut=args.proportiontocut)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(*b_samples, center=args.center, proportiontocut=args.proportiontocut)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                *b_samples, ties=args.ties, correction=args.correction, lambda_=args.lambda_
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for row in table:
                elements = ",".join(map(str, row))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
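
For reference, a minimal self-contained sketch of the same dispatch pattern (run_test and its two branches are illustrative, not part of the tool above):

from scipy import stats

def run_test(test_id, sample):
    # Reduced sketch of the dispatch above: two normality checks only.
    cols = []
    if test_id == "kurtosistest":
        z, p = stats.kurtosistest(sample)
        cols.extend([z, p])
    elif test_id == "skewtest":
        z, p = stats.skewtest(sample)
        cols.extend([z, p])
    return cols

# kurtosistest warns below ~20 observations, so use 30 here
sample = [float(i % 7) + 0.1 * i for i in range(30)]
print(run_test("kurtosistest", sample))
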
df['abc'].hist(bins=100, figsize=(8, 6))
#%%

# QQ-plot (quantile-quantile plot): visually check whether the distribution is normal

import statsmodels.api as sm
import matplotlib.pyplot as plt

sm.qqplot(df['abc'].dropna(), line='s')
plt.grid(True)
plt.xlabel('theoretical quantiles')
plt.ylabel('sample quantiles')
#%%

# skew and kurtosis

import scipy.stats as scs

data = df['floats'].dropna()  # drop missing values
print('skew is %f' % scs.skew(data))
print('skew test p-value is %f' % scs.skewtest(data)[1])
print('kurt is %f' % scs.kurtosis(data))
print('kurt test p-value is %f' % scs.kurtosistest(data)[1])
print('normal test p-value is %f' % scs.normaltest(data)[1])
#%%

def normality_test(arr):
    '''Print skewness/kurtosis statistics and normality test p-values for arr.'''
    print("skew %14.3f" % scs.skew(arr))
    print("skew test p-value %14.3f" % scs.skewtest(arr)[1])
    print("kurt of data set %14.3f" % scs.kurtosis(arr))
    print("kurt test p-value %14.3f" % scs.kurtosistest(arr)[1])
    print("norm test p-value %14.3f" % scs.normaltest(arr)[1])
Example #49
0
def BasicSummary1(series):
    basiclist = [stats.skew(series), stats.skewtest(series)[1],
                 stats.kurtosis(series), stats.kurtosistest(series)[1],
                 stats.variation(series)]
    return np.round(pd.Series(basiclist), decimals=6)
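
A usage sketch (assumes the numpy/pandas/scipy imports the snippet relies on; the input series is made up):

import numpy as np
import pandas as pd
from scipy import stats

s = pd.Series(np.random.standard_normal(500))
print(BasicSummary1(s))  # skew, skew p-value, kurtosis, kurtosis p-value, variation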
mvi2mot = {}
for recname in sorted(mot):
    rec = eval(recname)
    mviname = os.path.basename(rec.e0.s.fname)
    framei0, framei1 = rec.e0.d.framei[0], rec.e0.d.framei[-1]
    print('%s: %s, frameis %d:%d' % (recname, mviname, framei0, framei1))
    mvi2mot[(mviname, framei0)] = mot[recname]
allmotion = np.hstack(list(mvi2mot.values()))
allmotion = np.hstack([allmotion, -allmotion]) # make it symmetric around 0
motionbins = np.arange(-300, 300+MOTIONBINW, MOTIONBINW) # deg/s, symmetric around 0
midbins = motionbins[:-1] + MOTIONBINW / 2
motioncount = np.histogram(allmotion, bins=motionbins)[0]
k = kurtosis(allmotion)
# kurtosistest() seems to use the method of Anscombe & Glynn (1983),
# http://biomet.oxfordjournals.org/content/70/1/227
z, p = kurtosistest(allmotion)
pstring = 'p < %g' % ceilsigfig(p)
# normally distributed signal with same std as data, to check that its kurtosis is 0:
#nsamples = 10000000
#normal = scipy.random.normal(0, allmotion.std(), nsamples)
#normalcount = np.histogram(normal, bins=motionbins)[0]
normalcount = core.g(0, allmotion.std(), midbins) # generate normal distrib directly
# normalize to get same probability mass:
normalcount = normalcount / normalcount.sum() * motioncount.sum()
plot(midbins, normalcount, marker=None, ls='-', c='0.7', lw=2)
plot(midbins, motioncount, marker=None, ls='-', c='k', lw=2)
text(0.98, 0.98, 'k = %.1f' % k, # kurtosis
     horizontalalignment='right', verticalalignment='top',
     transform=gca().transAxes, color='k')
text(0.98, 0.90, '%s' % pstring, # p-value of null (normal) hypothesis of kurtosis test
     horizontalalignment='right', verticalalignment='top',
     transform=gca().transAxes, color='k')
Example #51
0
def cluster(data_array, target_array, n_components, perform_nn, max_k, max_features):
    # normalize numeric values for better performance per docs
    data_array = StandardScaler().fit_transform(data_array, target_array)

    target2 = None
    test_target = None

    if target_array is not None:
        data2, target2 = shuffle(data_array, target_array, random_state=1)
    else:
        data2 = shuffle(data_array, random_state=1)

    # split training and testing data 70/30
    offset = int(0.7*len(data2))
    train_data = data2[:offset]
    test_data = data2[offset:]

    if target_array is not None:
        train_target = target2[:offset]
        test_target = target2[offset:]

    best_k_rs, scores = clusterK(train_data, test_data, max_k)
    plot_chart(scores, 'Silhouette Score', 'K Means', 'k')

    best_em_rs, scores = clusterEM(train_data, test_data, max_k)
    plot_chart(scores, 'Silhouette Score', 'EM', 'Components')

    plot_eigenvalues(PCA(), train_data, 'PCA')
    t0 = int(round(time.time() * 1000))
    PCA().fit_transform(train_data)
    t1 = int(round(time.time() * 1000))
    pca_time = t1-t0
    print('PCA time: %d ms' % pca_time)

    reduce_and_cluster(build_pca, train_data, test_data, max_features, n_components, max_k, 'PCA')

    # calculate and plot kurtosis to determine any components that can be dropped
    t0 = int(round(time.time() * 1000))
    ica = FastICA(max_iter=18000, random_state=1)
    s = ica.fit_transform(train_data)
    t1 = int(round(time.time() * 1000))
    ica_time = t1-t0
    print('ICA time: %d ms' % ica_time)
    z_score, p_score = stats.kurtosistest(s)
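    # Components with large |z| deviate strongly from Gaussian kurtosis; near-zero
    # scores indicate little non-Gaussian structure, so those ICA components are
    # candidates to drop.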
    pl.plot(z_score, 'ro')
    pl.title('Kurtosis Z-scores')
    pl.ylabel('Z-score')
    pl.xlabel('Component Index')
    pl.show()
    reduce_and_cluster(build_ica, train_data, test_data, max_features, n_components, max_k, 'ICA')

    rp_c, rp_rs = find_best_rp(train_data, test_data, 2, max_features)

    reduce_and_cluster(build_rp, train_data, test_data, max_features, rp_c, max_k, 'RP', rp_rs)

    reduce_and_cluster(build_svd, train_data, test_data, max_features-1, n_components, max_k, 'SVD')

    if perform_nn:
        errs = []
        durs = []
        err, dur = reduce_and_learn_nn(PCA(n_components=n_components), train_data, train_target, test_data, test_target)
        errs.append(err)
        durs.append(dur)
        # ICA had 4 components as the best
        err, dur = reduce_and_learn_nn(FastICA(max_iter=18000, random_state=1, n_components=4), train_data,
                                       train_target, test_data, test_target)
        errs.append(err)
        durs.append(dur)
        # use the RP component count (rp_c) found by find_best_rp above
        err, dur = reduce_and_learn_nn(GaussianRandomProjection(n_components=rp_c, random_state=rp_rs), train_data,
                                       train_target, test_data, test_target)
        errs.append(err)
        durs.append(dur)
        err, dur = reduce_and_learn_nn(TruncatedSVD(n_components=n_components, algorithm='arpack', random_state=1), train_data,
                                       train_target, test_data, test_target)
        errs.append(err)
        durs.append(dur)
        ind = np.arange(len(errs))
        bar_width = 0.3
        fig, ax = plt.subplots()
        ax.bar(ind, errs, bar_width, color='b')
        ax.set_ylabel('Mean Squared Error')
        ax.set_title('Error After Dimension Reduction')
        ax.set_xticks(ind + bar_width / 2)
        ax.set_xticklabels(('PCA', 'ICA', 'RP', 'SVD'))
        pl.show()

        cluster_and_learn_nn(train_data, train_target, test_data, test_target)
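
A call sketch (hypothetical dataset; clusterK, clusterEM, the build_* reducers and the other helpers are assumed to be defined elsewhere in the same module):

from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
cluster(X, y, n_components=5, perform_nn=False, max_k=10, max_features=X.shape[1])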