def get_stat_overview(y):
    """Summarise a sample with descriptive statistics and a Jarque-Bera test.

    Parameters
    ----------
    y : pandas.Series
        Sample to describe. It must provide ``mean``, ``median``, ``max``,
        ``min``, ``std``, ``skew`` and ``kurtosis`` methods.

    Returns
    -------
    pandas.Series
        Values indexed by "Mean", "Median", "Max", "Min", "Std", "Skewness",
        "Kurtosis", "Jarque_Bera" (test statistic) and "Jarque_Bera_p"
        (p-value), in that order.
    """
    # Run the test once and reuse both fields instead of evaluating it twice.
    jb_result = jarque_bera(y)
    return pd.Series(
        data=[
            y.mean(),
            y.median(),
            y.max(),
            y.min(),
            y.std(),
            y.skew(),
            y.kurtosis(),
            jb_result[0],
            jb_result[1],
        ],
        index=[
            "Mean",
            "Median",
            "Max",
            "Min",
            "Std",
            "Skewness",
            "Kurtosis",
            "Jarque_Bera",
            "Jarque_Bera_p",
        ],
    )
def jb(x, test=True):
    """Return one field of the Jarque-Bera test for *x* (lower is better).

    Parameters
    ----------
    x : array_like
        Sample passed to ``stats.jarque_bera``.
    test : bool, optional
        When truthy (default) the test statistic is returned, otherwise the
        p-value.
    """
    # NOTE(review): this re-seeds NumPy's global RNG on every call; the test
    # itself does not look random, so the seed may be unnecessary - confirm
    # with callers before removing it.
    np.random.seed(12345678)
    outcome = stats.jarque_bera(x)
    return outcome[0] if test else outcome[1]
def diagnostic(x, y, sig_lv=sig_lv, cor=correlation):
    """Run pairwise-correlation and Jarque-Bera diagnostics on two frames.

    Parameters
    ----------
    x, y : pandas.DataFrame
        Every column of both frames is tested with ``ss.jarque_bera``.
        Pairwise correlations are computed for ``x`` only.
    sig_lv : float
        Significance level; a column is marked ``'Reject_H0'`` when its
        p-value is below it. Defaults to the module-level ``sig_lv``.
    cor
        Accepted but never used by this function; the correlation cutoff is
        hard-coded to 0.5.

    Returns
    -------
    dict
        ``'mutli'``: one ``{"<col_i> <col_j>": "True"/"False"}`` dict per
        column pair of ``x``, telling whether the absolute correlation
        exceeds 0.5 (stored as a string).
        ``'JB_Test'``: one ``{column: {'p': p_value, 'rst': verdict}}`` dict
        per column, ``x`` columns first, then ``y`` columns.
    """

    def _jb_entries(frame):
        # Columns are addressed by position so duplicate labels still work.
        entries = []
        for position, label in enumerate(frame.columns):
            p_value = ss.jarque_bera(frame.iloc[:, position])[1]
            verdict = 'Reject_H0' if p_value < sig_lv else 'Not_Reject_H0'
            entries.append({label: {'p': p_value, 'rst': verdict}})
        return entries

    jb_entries = _jb_entries(x) + _jb_entries(y)

    corr_matrix = x.corr()
    labels = x.columns
    pair_flags = []
    for i in range(len(labels) - 1):
        for j in range(i + 1, len(labels)):
            is_strong = np.absolute(corr_matrix.iloc[i, j]) > 0.5
            pair_flags.append({labels[i] + ' ' + labels[j]: str(is_strong)})

    rslt = {}
    rslt['mutli'] = pair_flags
    rslt['JB_Test'] = jb_entries
    return rslt
def norm_cal(self, x):
    '''Test a single variable for normality, picking the test by sample size.

    Parameters
    ----------
    x : numpy.ndarray
        Sample to test. ``len(x)`` selects the test and ``x.shape`` is
        reported back.

    Returns
    -------
    x_res : dict
        'Statistic': statistic value calculated by the test
        'Pvalue': p-value of the test (absent when Anderson-Darling is used)
        'Critical': critical value at index 2 of the Anderson-Darling
            critical values (only present when Anderson-Darling is used)
        'Test': name of the test used
        'Sample Size': ``x.shape`` (a tuple, not a scalar count)
        'Result': bool, False when normality is rejected (p-value < .05, or
            Anderson-Darling statistic above the critical value), True
            otherwise

    Notes
    -----
    More conservative cutoffs of 3500 and 50 are chosen based on these
    conventions: Jarque-Bera requires 2000+ samples; Shapiro-Wilk is accurate
    under 5000; and the common definition of a small sample size is 30.
    '''
    x_res = {}
    if len(x) >= 3500:
        # Use Jarque-Bera for samples of 3500 or more.
        outcome = ss.jarque_bera(x)
        x_res['Statistic'] = outcome[0]
        x_res['Pvalue'] = outcome[1]
        x_res['Test'] = 'Jarque Bera Test'
    elif len(x) >= 50:
        # Use Shapiro-Wilk for samples in [50, 3500).
        outcome = ss.shapiro(x)
        x_res['Statistic'] = outcome[0]
        x_res['Pvalue'] = outcome[1]
        x_res['Test'] = 'Shapiro-Wilk Test'
    else:
        # Use Anderson-Darling for samples smaller than 50.
        outcome = ss.anderson(x)
        # Element 0 is the scalar statistic; subscripting it again (as the
        # previous code did) raises, so it is stored as-is.
        x_res['Statistic'] = outcome[0]
        # Element 1 is the array of critical values; index 2 is compared
        # against the statistic below.
        x_res['Critical'] = outcome[1][2]
        # The misspelled label is kept because callers may compare against it.
        x_res['Test'] = 'Aderson-Darling Test'
    x_res['Sample Size'] = x.shape

    if x_res['Test'] != 'Aderson-Darling Test':
        # Fixed significance level of .05.
        x_res['Result'] = not x_res['Pvalue'] < .05
    else:
        # Anderson-Darling has no p-value here: compare with the critical value.
        x_res['Result'] = not x_res['Critical'] < x_res['Statistic']
    return x_res
def compute_jarque_bera(arr):
    """Print the Jarque-Bera test statistic and p-value for *arr*.

    H_0 : distribution is normal
    H_1 : distribution is not normal

    - checks whether a distribution has skewness and kurtosis values
      matching that of a normal distribution
    - the statistic is a non-negative value
    - the farther from zero, the greater the deviation from a normal
      distribution

    The function only reports the numbers; it applies no significance
    threshold itself and returns ``None``.
    """
    # Evaluate the test once and read both fields from the same result.
    outcome = jarque_bera(arr)
    value = outcome[0]
    p_value = outcome[1]
    print("The Jarque-Bera test statistic value is", value, "with probability of", p_value)
def verificar_distribuicao_normal(self, arr, p_value=0.05):
    """Check a sample for normality with the Jarque-Bera test.

    Parameters
    ----------
    arr : list or array
        Sample to test.
    p_value : float, optional
        Significance level (default 0.05).

    Returns
    -------
    tuple
        ``(p, is_normal)`` where ``p`` is the test's p-value and
        ``is_normal`` is a ``bool`` that is True when ``p >= p_value``,
        i.e. when H0 (normality) is *not* rejected.
    """
    # Run the test once; both tuple members derive from the same p-value.
    observed_p = sct.jarque_bera(arr)[1]
    return (observed_p, bool(observed_p >= p_value))
def test_calcula_pvalue():
    """Print hand-computed and SciPy Jarque-Bera results for three datasets.

    Notes carried over from the original comments: normality cannot be
    asserted for FRECUENCIAS nor for PESOS, while SINTETICA can be considered
    normal (the original states "with a significance level of 0.95").
    """
    # Statistic from calcula_jarque_bera, p-value from calcula_pvalue.
    manual_cases = (
        ("Pvalue - Frecuencias: ", FRECUENCIAS),
        ("Pvalue - Pesos: ", PESOS),
        ("Pvalue - Sintética: ", SINTETICA),
    )
    for label, sample in manual_cases:
        jb_stat = calcula_jarque_bera(sample)
        print(label, calcula_pvalue(jb_stat, valores_chi2_2, pvalues_chi2_2))

    # Jarque-Bera statistic and p-value from the SciPy implementation.
    scipy_cases = (
        ("JarqueBera/Pvalue (Scipy) - Frecuencias: ", FRECUENCIAS),
        ("JarqueBera/Pvalue (Scipy) - Pesos: ", PESOS),
        ("JarqueBera/Pvalue (Scipy) - Sintética: ", SINTETICA),
    )
    for label, sample in scipy_cases:
        print(label, jarque_bera(sample))
def jb_calculation(symbolIdx, filesIdx):
    """Compute and pickle Jarque-Bera statistics for four bar types.

    For the symbol at ``symbols[symbolIdx]`` the ``filesIdx``-th file (in
    sorted order) of its folder under ``elements`` is loaded. For each bar
    type in ``bars[0:4]`` (volume, calendar, USD volume, tick) the returns
    of ``micro_price_close`` are computed and the Jarque-Bera statistic is
    stored in a one-column DataFrame, which is pickled to
    ``experimentsLocation`` as ``<symbol>_<filesIdx>_jb_stats.pkl``.

    Parameters
    ----------
    symbolIdx : int
        Index into the module-level ``symbols`` sequence.
    filesIdx : int
        Index into the sorted file listing of the symbol folder.

    Returns
    -------
    None
        Results are written to disk; progress is printed.
    """

    def _first_value(mapping):
        # Each bar dict is only ever read through its first key.
        return mapping[list(mapping.keys())[0]]

    # add location variables. these i have to figure how to abstract away
    symbol = symbols[symbolIdx]
    print(symbol)
    procsdSymbolFolder = os.path.join(elements, symbol)
    print(procsdSymbolFolder)
    files = sorted(os.listdir(procsdSymbolFolder))
    fileLocation = os.path.join(procsdSymbolFolder, files[filesIdx])
    print(fileLocation)

    # Load the pickle once rather than once per bar type.
    bar_data = open_pickle_filepath(fileLocation)

    # get the dataframes
    volume_bar_df = _first_value(bar_data[bars[0]])
    calendar_bar_df = _first_value(bar_data[bars[1]])
    usd_volume_df = _first_value(bar_data[bars[2]])
    # The tick frame is now looked up with the tick dict's own first key;
    # previously the USD-volume dict's key was used here.
    tick_bar_df = _first_value(bar_data[bars[3]])

    # returns
    # NOTE(review): the volume and tick returns are not passed through
    # dropna() while the other two are - confirm whether that is intended.
    vb_ret = returns(volume_bar_df.micro_price_close).replace(
        [np.inf, -np.inf], 0)  # volume
    tb_ret = returns(tick_bar_df.micro_price_close).replace(
        [np.inf, -np.inf], 0)  # tick
    usdvb_ret = returns(usd_volume_df.micro_price_close).dropna().replace(
        [np.inf, -np.inf], 0)  # usd volume
    cb_ret = returns(calendar_bar_df.micro_price_close).dropna().replace(
        [np.inf, -np.inf], 0)  # calendar

    # calculating JB statistic
    jb_value_tick, _ = jarque_bera(tb_ret)
    jb_value_vol, _ = jarque_bera(vb_ret)
    jb_value_dollar, _ = jarque_bera(usdvb_ret)
    jb_value_calendar, _ = jarque_bera(cb_ret)

    jb_test_df = pd.DataFrame(
        data={
            'jarque_bera_results': [
                jb_value_tick,
                jb_value_vol,
                jb_value_dollar,
                jb_value_calendar,
            ]
        },
        index=['tick', 'vol', 'dollar', 'calendar'])

    pickle_out_returns = os.path.join(
        experimentsLocation,
        "".join((str(symbol), "_" + str(filesIdx) + "_jb_stats.pkl")))
    # Close the output file deterministically instead of leaking the handle.
    with open(pickle_out_returns, 'wb') as out_file:
        pickle.dump(jb_test_df, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("produced and saved JB stats for :", symbol, filesIdx)
def decompose_stl(self, keyword, robust=False):
    """Decompose a keyword's series with STL and keep the better fit.

    Two STL fits are made: one on ``self.time_series(keyword)`` and one on
    ``self.time_series_box_cox(keyword)``. The fit whose residuals have the
    strictly larger Jarque-Bera p-value is stored in ``self.decomposition``;
    on a tie the Box-Cox fit is kept.

    :param keyword: keyword whose time series is decomposed
    :param robust: whether STL uses robust estimation
    :return: the selected fit (also stored in ``self.decomposition``)
    """
    raw_series = self.time_series(keyword)
    box_cox_series = self.time_series_box_cox(keyword)

    fit_raw = seasonal.STL(raw_series, robust=robust).fit()
    fit_box_cox = seasonal.STL(box_cox_series, robust=robust).fit()

    p_raw = stats.jarque_bera(fit_raw.resid).pvalue
    p_box_cox = stats.jarque_bera(fit_box_cox.resid).pvalue

    self.decomposition = fit_raw if p_raw > p_box_cox else fit_box_cox
    return self.decomposition
def q2():
    """Answer to question 2: Jarque-Bera normality check on athlete heights.

    Draws a sample of 3000 ``'height'`` values via ``get_sample``, prints the
    raw test result and returns ``True`` when the p-value is above 0.05,
    ``False`` otherwise.
    """
    alpha = 0.05
    amostra = get_sample(athletes, 'height', 3000)
    # Evaluate the test once and reuse it for both the print and the verdict.
    resultado = sct.jarque_bera(amostra)
    print('Retorno: ', resultado)
    return bool(resultado[1] > alpha)
def normal(x):
    """Print two normality-test p-values for *x* and show a QQ plot.

    The Shapiro-Wilk and Jarque-Bera p-values are printed, then ``qqplot`` is
    called with ``line='s'`` and the figure is shown. Always returns 0.
    """
    shapiro_p = stats.shapiro(x)[1]
    print('Shapiro-Wilk p =', shapiro_p)

    jarque_bera_p = stats.jarque_bera(x)[1]
    print('Jarque-Bera p =', jarque_bera_p)

    print('QQ plot')
    qqplot(x, line='s')
    pyplot.show()
    return 0
def JBtest(x):
    """Compute the Jarque-Bera normality test by hand and tabulate it.

    Parameters
    ----------
    x : numpy.ndarray
        Sample; it must provide ``size`` and ``mean()``.

    Returns
    -------
    pandas.DataFrame
        Two rows: a label row (``'skew'``, ``'krut'``, ``'统计量'``,
        ``'Sig'``) and a ``'JB test'`` row holding the skewness, kurtosis,
        JB statistic and p-value.

    The SciPy skewness, kurtosis and Jarque-Bera results are printed next to
    the hand-computed ones for comparison.
    """
    # Sample size n.
    n = x.size
    x_ = x - x.mean()

    # M2: second central moment.
    # skew: third central moment divided by M2 ** 1.5.
    # krut: fourth central moment divided by M2 ** 2.
    M2 = np.mean(x_**2)
    skew = np.mean(x_**3) / M2**1.5
    krut = np.mean(x_**4) / M2**2

    # Compute the JB statistic and its p-value under chi-square with 2 df.
    JB_s = n * (skew**2 / 6 + (krut - 3)**2 / 24)
    # The survival function keeps precision in the far tail, where
    # ``1 - cdf`` rounds to exactly 0.
    JB_p = stats.chi2.sf(JB_s, df=2)

    print("偏度:", stats.skew(x), skew)
    print("峰值:", stats.kurtosis(x) + 3, krut)
    print("JB检验:", stats.jarque_bera(x))

    res = pd.DataFrame(
        [['skew', 'krut', '统计量', 'Sig'], [skew, krut, JB_s, JB_p]],
        index=["正态性检验", 'JB test'])
    return res
def q2():
    """Return False when Jarque-Bera rejects normality of ``height_sample``.

    Like ``sct.shapiro()``, ``sct.jarque_bera()`` returns a two-value result
    (test statistic, p-value); only the p-value is compared with 0.05.
    """
    alpha = 0.05
    p_value = sct.jarque_bera(height_sample)[1]
    # ``not (p < alpha)`` rather than ``p >= alpha`` so a NaN p-value still
    # yields True.
    return not p_value < alpha
def q2():
    """Jarque-Bera normality check on ``sample_height`` at the 5% level.

    H0: the sample is normally distributed.
    If p-value < alpha, H0 is rejected.
    If p-value > alpha, H0 cannot be rejected and the sample is treated as
    normal.
    """
    p_value = sct.jarque_bera(sample_height)[1]
    return p_value > 0.05
def q2():
    """Answer to question 2: Jarque-Bera test on ``altura``.

    Returns ``True`` when the p-value is above the 5% significance level.
    """
    p_value = sct.jarque_bera(altura)[1]
    return bool(p_value > 0.05)
def main():
    """Print Jarque-Bera and Anderson normality tests per credit-card feature.

    Reads ``CC GENERAL.csv``, fills missing ``MINIMUM_PAYMENTS`` and
    ``CREDIT_LIMIT`` values with the column median, drops ``CUST_ID``,
    min-max scales every remaining column and prints both test results for
    each feature.
    """
    # read the data
    data = pd.read_csv("CC GENERAL.csv")
    for column in ('MINIMUM_PAYMENTS', 'CREDIT_LIMIT'):
        data.loc[data[column].isnull(), column] = data[column].median()
    # ``columns=`` replaces the positional axis argument, which current
    # pandas no longer accepts.
    data = data.drop(columns=['CUST_ID'])
    names = data.columns.tolist()

    # normalize the data
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)
    data_scaled = pd.DataFrame(data_scaled, columns=names)

    # apply Jarque-Bera test (printed labels are kept exactly as before)
    print("Jurque-Bera Test:")
    for position, name in enumerate(names, start=1):
        jb_value, p_value = jarque_bera(data_scaled[name])
        print("{} for {} feature, test value is {} and p-value is {}".format(
            position, name, jb_value, p_value))
    print("\n")

    # apply Anderson test
    print("Anderson Test:")
    for name in names:
        a = anderson(data_scaled[name], dist='norm')
        print("for {} feature, test value is {}".format(name, a))
    print("\n")
def __analysis_index(self):
    """Plot each index's transformed returns and export summary statistics.

    The index data is fetched through ``self.get_env().query_data``. Every
    column except ``COM_DATE`` is transformed to
    ``log(pct_change() / 100 + 1)``, and the frame is indexed by the parsed
    ``COM_DATE``. For each column a line chart (``<name>.png``) and a 25-bin
    histogram (``<name>bar.png``) are saved under ``RESULTS``, and a row of
    statistics is collected: mean, standard deviation, skewness, kurtosis,
    Jarque-Bera statistic, the ``'5%'`` entry of the fifth element returned
    by ``adfuller`` and the last element of the second value returned by
    ``q_stat`` for autocorrelations 1..12. The table is written to
    ``index_info.csv`` under ``RESULTS``.
    """
    index = self.get_env().query_data(Index_Data).get_data_serise()
    index_name = list(index.columns)
    index_name.remove(COM_DATE)
    index[index_name] = index[index_name].pct_change()/100
    index[index_name] = np.log(index[index_name]+1)
    index = index.set_index(COM_DATE)
    index.index = pd.to_datetime(index.index)

    res = pd.DataFrame(columns=['mean', 'std', 'skew', 'kurt', 'jarque-Bera', 'adf', 'lm'])
    for index_name_ in index_name:
        # Drop missing values once per column instead of on every use.
        series = index[index_name_].dropna()

        fig, ax = plt.subplots()
        ax.plot(series, label=index_name_)
        ax.set_xlabel('时间')
        ax.set_ylabel('收益率的对数')
        ax.set_title(index_name_+'收益率图')
        ax.legend()
        plt.savefig(os.path.join(RESULTS, index_name_+'.png'))
        plt.close()

        fig, ax = plt.subplots()
        ax.hist(series, bins=25)
        ax.set_xlabel('收益率范围')
        ax.set_ylabel('收益率的对数')
        ax.set_title(index_name_+'收益率图')
        plt.savefig(os.path.join(RESULTS, index_name_+'bar.png'))
        plt.close()

        res.loc[index_name_] = [
            np.nanmean(series),
            np.nanstd(series),
            series.skew(),
            series.kurt(),
            stats.jarque_bera(series)[0],
            adfuller(series)[4]['5%'],
            q_stat(acf(series)[1:13], len(series))[1][-1],
        ]
    res.to_csv(os.path.join(RESULTS, 'index_info.csv'))
def get_rf(frequency='daily', descriptives=False):
    """Load the ``rf`` and ``mktrf`` columns from ``./data/RF_<frequency>.csv``.

    Parameters
    ----------
    frequency : str, optional
        Suffix of the CSV file name (default ``'daily'``).
    descriptives : bool, optional
        When truthy, print min, max, mean, variance, skewness, kurtosis and
        the Jarque-Bera result of ``rf``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mktrf, rf)`` - note the order - each a single-column frame indexed
        by the parsed first CSV column.
    """
    # Re-enters the current working directory, as the original did; the
    # relative path below is resolved against it.
    os.chdir(os.getcwd())

    csv_path = './data/RF_' + frequency + '.csv'
    table = pd.read_csv(csv_path, header=0, index_col=0)
    table.index = pd.to_datetime(table.index)

    rf = table.filter(items=['rf'])
    mktrf = table.filter(items=['mktrf'])

    # Compute descriptive statistics if desired
    if descriptives:
        for describe in (rf.min, rf.max, rf.mean, rf.var, rf.skew, rf.kurtosis):
            print(describe())
        print(stats.jarque_bera(rf))

    return mktrf, rf
def q2():
    """Jarque-Bera normality check on z-scored athlete heights.

    Draws 3000 ``'height'`` values via ``get_sample``, standardises them with
    ``sct.zscore`` and returns False when the p-value is at or below 0.05,
    True otherwise.
    """
    alpha = 0.05
    heights = get_sample(athletes, 'height', n=3000)
    p_value = sct.jarque_bera(sct.zscore(heights))[1]
    # Negating ``<=`` (instead of using ``>``) keeps True for a NaN p-value.
    return not p_value <= alpha
def normalityCheckJB(sampleList):
    """Label a sample as normal or not using the Jarque-Bera test.

    Returns "符合常態分配" (consistent with a normal distribution) when the
    p-value is above 0.05 and "不符合常態分配" (not consistent) otherwise.
    """
    p_value = stats.jarque_bera(sampleList)[1]
    return "符合常態分配" if p_value > 0.05 else "不符合常態分配"
def normality_of_residuals_test(model):
    '''
    Draw the normal Q-Q plot of a model's residuals and print the results of
    four normality tests: Jarque-Bera, Shapiro-Wilk, Kolmogorov-Smirnov and
    Anderson-Darling.

    Arg:
    * model - fitted OLS model from statsmodels; only ``model.resid`` is read
    '''
    residuals = model.resid

    sm.ProbPlot(residuals).qqplot(line='s')
    plt.title('Q-Q plot')

    # Run every test before printing anything.
    jb_stat, jb_p = stats.jarque_bera(residuals)
    sw_stat, sw_p = stats.shapiro(residuals)
    ad = stats.anderson(residuals, dist='norm')
    # NOTE(review): 'norm' with no extra arguments compares against the
    # standard normal; the residuals are not standardised first - confirm
    # this is intended.
    ks = stats.kstest(residuals, 'norm')

    print(f'Jarque-Bera test ---- statistic: {jb_stat:.4f}, p-value: {jb_p}')
    print(
        f'Shapiro-Wilk test ---- statistic: {sw_stat:.4f}, p-value: {sw_p:.4f}')
    print(
        f'Kolmogorov-Smirnov test ---- statistic: {ks.statistic:.4f}, p-value: {ks.pvalue:.4f}'
    )
    print(
        f'Anderson-Darling test ---- statistic: {ad.statistic:.4f}, 5% critical value: {ad.critical_values[2]:.4f}'
    )
    print(
        'If the returned AD statistic is larger than the critical value, then for the 5% significance level, the null hypothesis that the data come from the Normal distribution should be rejected. '
    )
def q2():
    """Jarque-Bera normality check on 3000 sampled athlete heights.

    Returns True when the p-value is above 0.05, False otherwise.
    """
    alpha = 0.05
    heights = get_sample(athletes, 'height', n=3000)
    p_value = sct.jarque_bera(heights)[1]
    return bool(p_value > alpha)
def q2():
    """Answer to question 2: Jarque-Bera test on 3000 sampled heights.

    Returns True when the p-value is at least 0.05, False otherwise.
    """
    threshold = 0.05
    heights = get_sample(athletes, 'height', n=3000)
    p_value = sct.jarque_bera(heights)[1]
    return bool(p_value >= threshold)
def q2():
    """Jarque-Bera normality check on 3000 sampled athlete heights.

    Returns True when the p-value is above 0.05, False otherwise.
    """
    heights = get_sample(athletes, 'height', 3000)
    _, p_value = sct.jarque_bera(heights)
    return bool(p_value > 0.05)
def normality_of_residuals_test(model):
    '''
    Draw the normal Q-Q plot of a model's residuals and print the results of
    four normality tests: Jarque-Bera, Shapiro-Wilk, Kolmogorov-Smirnov and
    Anderson-Darling.

    Arg:
    * model - fitted OLS model from statsmodels; only ``model.resid`` is read
    '''
    residuals = model.resid

    sm.ProbPlot(residuals).qqplot(line='s')
    plt.title('Q-Q Plot')

    # Run every test before printing anything.
    jb_stat, jb_p = stats.jarque_bera(residuals)
    sw_stat, sw_p = stats.shapiro(residuals)
    ad = stats.anderson(residuals, dist='norm')
    # NOTE(review): 'norm' with no extra arguments compares against the
    # standard normal; the residuals are not standardised first - confirm
    # this is intended.
    ks = stats.kstest(residuals, 'norm')

    print(f'Jarque_Bera test ---- statistic: {jb_stat:.4f}, p-value: {jb_p}')
    print(
        f'Shapiro_Wilk test ---- statistic: {sw_stat:.4f}, p-value: {sw_p:.4f}')
    print(
        f'Kolmogorov_Smirnov test ---- statistic: {ks.statistic:.4f}, p-value: {ks.pvalue:.4f}'
    )
    print(
        f'Anderson_Darling test ---- statistic: {ad.statistic:.4f}, 5% critical value: {ad.critical_values[2]:.4f}'
    )
def check_lr_assumptions(df, data_fe):
    """
    Print several statistical tests on the residuals and return the frame.

    A ``residuals`` column, computed as ``pred - truth``, is added to ``df``
    in place.

    arguments
    ---------
    df: dataframe of truth and prediction columns labeled "truth" and "pred"
    data_fe: prepared features for prediction, passed to the Breusch-Pagan
        test

    return
    ------
    dataframe: the same ``df``, now including the ``residuals`` column
    """
    df['residuals'] = df['pred'] - df['truth']
    residuals = df['residuals']

    # Each value is computed right before its line is printed, so output
    # produced before a failing test is still shown.
    report = (
        ("mean of residuals:", lambda: residuals.mean()),
        ("variance of residuals:", lambda: residuals.var()),
        ("skewness of residuals:", lambda: stats.skew(residuals)),
        ("kurtosis of residuals:", lambda: stats.kurtosis(residuals)),
        ("kurtosis test of residuals:", lambda: stats.kurtosistest(residuals)),
        ("normal test of residuals (scipy stats):",
         lambda: stats.normaltest(residuals)),
        ("Jarque Bera test for normality of residuals:",
         lambda: stats.jarque_bera(residuals)),
        ("Breusch Pagan test for heteroscedasticity:",
         lambda: het_breuschpagan(residuals, data_fe)),
    )
    for label, compute in report:
        print(label, compute())

    return df
def normality_tests(data_values):
    """
    Print the outcome of three normality tests on *data_values*.

    Shapiro-Wilk, ``normaltest`` and Jarque-Bera are run in that order and
    each statistic/p-value pair is printed. A "prob gaussian" /
    "non gaussian" verdict (p-value above 0.05 or not) is printed for
    Shapiro-Wilk and Jarque-Bera only.

    :param data_values: values of returns in our case
    :return: None; results are printed
    """
    report_line = 'stat = %.3f, p = %.3f\n '

    def _verdict(p_value):
        # Verdict at the fixed 5% level.
        return 'prob gaussian' if p_value > 0.05 else 'non gaussian'

    sw_stat, sw_p = shapiro(data_values)
    print(report_line % (sw_stat, sw_p))
    print(_verdict(sw_p))

    nt_stat, nt_p = normaltest(data_values)
    print(report_line % (nt_stat, nt_p))

    jb_stat, jb_p = jarque_bera(data_values)
    print(report_line % (jb_stat, jb_p))
    print(_verdict(jb_p))
def q2():
    """Answer to question 2: Jarque-Bera test on 3000 sampled heights.

    Returns True when the p-value is above 0.05, False otherwise.
    """
    alpha = 0.05
    heights = get_sample(athletes, 'height', n=3000)
    _, p_value = sct.jarque_bera(heights)
    return bool(p_value > alpha)
def q2():
    """Jarque-Bera normality check on 3000 sampled athlete heights.

    Returns True when the p-value is above 0.05, False otherwise.
    """
    sample = get_sample(athletes, 'height', 3000)
    test_result = sct.jarque_bera(sample)
    significance = 0.05
    return bool(test_result[1] > significance)
def jb_test(self, r: pd.Series, mode="stat"):
    """Return the Jarque-Bera statistic or p-value of *r*, ignoring nulls.

    Parameters
    ----------
    r : pd.Series
        Sample; it is first validated through ``self.check_instance`` and
        null entries are removed before testing.
    mode : str, optional
        ``"stat"`` (default) returns the test statistic; any other value
        returns the p-value.

    Returns
    -------
    float
        The requested value, or ``np.nan`` when the test raises.
    """
    self.check_instance(r, "pd.Series", "jb_test")
    r = r[~pd.isnull(r)]
    try:
        stat, p = sps.jarque_bera(r)
    except Exception:
        # Best-effort: a failing test yields NaN, but KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        stat, p = (np.nan, np.nan)
    return stat if mode == "stat" else p