def comparing_stat_patch_list(pred, y):
    """Build legend patches that compare summary statistics of two samples.

    Args:
        pred: Sample whose statistics are shown on red patches.
        y: Sample whose statistics are shown on blue patches.

    Returns:
        A list of nine ``mpatches.Patch`` objects: mean, std, skewness and
        kurtosis of ``y`` (blue), the same four for ``pred`` (red), and a
        final black patch carrying ``stats.rmse(pred, y)``.
    """

    def _moment_patches(values, colour):
        # One patch per statistic, in the fixed order mean/std/skew/kurtosis.
        return [
            mpatches.Patch(color=colour,
                           label='mean:' + ('%03.6f' % np.mean(values))),
            mpatches.Patch(color=colour,
                           label='std:' + ('%03.6f' % np.std(values))),
            mpatches.Patch(color=colour,
                           label='skewness:' + ('%03.3f' % stats.skewness(values))),
            mpatches.Patch(color=colour,
                           label='kurtosis:' + ('%03.3f' % stats.kurtosis(values))),
        ]

    pred_patches = _moment_patches(pred, 'red')
    y_patches = _moment_patches(y, 'blue')
    # A MAPE patch existed here once but was disabled; only RMSE is reported.
    rmse_patch = mpatches.Patch(
        color='black', label='RMSE:' + ('%03.6f' % stats.rmse(pred, y)))

    # The legend lists the reference sample first, then the prediction.
    return y_patches + pred_patches + [rmse_patch]
def data_description(index, start, end):
    """Print descriptive statistics of a return series and plot its histogram.

    The series comes from ``download_data.get_returns(index, start, end)``;
    the three arguments are passed through unchanged.

    Args:
        index: Identifier forwarded to ``download_data.get_returns``.
        start: Start bound forwarded to ``download_data.get_returns``.
        end: End bound forwarded to ``download_data.get_returns``.

    Returns:
        None. Everything is written to stdout or shown as a plot.
    """
    returns = download_data.get_returns(index, start, end)

    # Measures of central tendency.
    print('个数:', len(returns))
    print('平均值:', np.mean(returns))
    print('中位数:', np.median(returns))
    # Fix: the upper quartile is the 0.75 quantile and the lower quartile the
    # 0.25 quantile; the two labels used to be attached to the wrong values.
    print('上四分位数', sts.quantile(returns, p=0.75))
    print('下四分位数', sts.quantile(returns, p=0.25))

    # Measures of dispersion.
    print('最大值:', np.max(returns))
    print('最小值:', np.min(returns))
    print('极差:', np.max(returns) - np.min(returns))
    print('四分位差',
          sts.quantile(returns, p=0.75) - sts.quantile(returns, p=0.25))
    print('标准差:', np.std(returns))
    print('方差:', np.var(returns))
    print('离散系数:', np.std(returns) / np.mean(returns))

    # Measures of skewness and kurtosis.
    print('偏度:', sts.skewness(returns))
    print('峰度:', sts.kurtosis(returns))

    # Kolmogorov-Smirnov test of the series against the 'norm' distribution.
    print(st.kstest(returns, 'norm'))

    # NOTE(review): `sns.plt` and `sns.distplot` are not available in recent
    # seaborn releases; left untouched because matplotlib is not known to be
    # imported by this file. Confirm the pinned seaborn version.
    sns.distplot(returns, bins=100, label='Empirical')
    sns.plt.legend()
    sns.plt.title('Empirical')
    sns.plt.show()
def get_seq_feature(seq, seq_name, user_id):
    """Summarise a numeric sequence as a one-row feature DataFrame.

    Args:
        seq: Sequence of numbers. A falsy value (e.g. an empty list) is
            reported on stdout and produces no frame.
        seq_name: Prefix for every generated column name.
        user_id: Value stored in the ``user_id`` column.

    Returns:
        ``None`` when ``seq`` is empty, otherwise a single-row DataFrame with
        the columns ``<seq_name>_mean``, ``_median``, ``_max``, ``_min``,
        ``_var``, ``_std``, ``_discrete`` (std / mean, NaN when the mean is
        zero), ``_skew``, ``_kurt`` and ``user_id``.
    """
    if not seq:
        print('seq is empty! : %s'%seq_name)
        return None

    mean = np.mean(seq)
    std = np.std(seq)

    df = pd.DataFrame()
    df[seq_name + '_mean'] = [mean]
    df[seq_name + '_median'] = [np.median(seq)]
    df[seq_name + '_max'] = [np.max(seq)]
    df[seq_name + '_min'] = [np.min(seq)]
    df[seq_name + '_var'] = [np.var(seq)]
    df[seq_name + '_std'] = [std]

    # Coefficient of variation is undefined for a zero mean.
    df[seq_name + '_discrete'] = [std / mean if mean != 0 else np.nan]

    # Higher moments are best-effort: any failure in the stats helper yields
    # NaN. `except Exception` (rather than a bare `except`) keeps that
    # behaviour without swallowing KeyboardInterrupt / SystemExit.
    # `np.nan` replaces the `np.NaN` alias, which NumPy 2.0 removed.
    try:
        skew = sts.skewness(seq)
    except Exception:
        skew = np.nan
    df[seq_name + '_skew'] = [skew]

    try:
        kurt = sts.kurtosis(seq)
    except Exception:
        kurt = np.nan
    df[seq_name + '_kurt'] = [kurt]

    df['user_id'] = [user_id]
    return df
def print_stats(proofs, sizes, search_sizes, total, successes, timeouts):
    """Print success/timeout rates and distribution statistics of ``sizes``.

    ``proofs`` and ``search_sizes`` are accepted but not read by the active
    code. ``ABBREV`` and ``implications`` are resolved from the enclosing
    module scope, not from the arguments.

    Args:
        proofs: Unused here.
        sizes: Numeric sample that is summarised.
        search_sizes: Unused here.
        total: Denominator for the two percentage lines.
        successes: Number of successes.
        timeouts: Number of timeouts.

    Returns:
        ``successes / len(implications)``.
    """
    success_pct = int(100 * (successes / total))
    print(f'Successes: {successes}/{total}: {success_pct}%')
    timeout_pct = int(100 * (timeouts / total))
    print(f'Timeouts: {timeouts}/{total}: {timeout_pct}%')

    avg = statistics.mean(sizes)
    mid = statistics.median_low(sizes)
    top_three = collections.Counter(sizes).most_common(3)
    largest = max(sizes)
    smallest = min(sizes)

    # The mean is handed to the moment helpers so it is not recomputed.
    skew = stats.skewness(sizes, avg)
    var = statistics.variance(sizes, avg)
    kurt = stats.excess_kurtosis(sizes, avg)

    if not ABBREV:
        verbose_report = (
            ('mean', avg),
            ('median', mid),
            ('most_common', top_three),
            ('shortest', smallest),
            ('longest', largest),
            ('skewness', skew),
            ('variance', var),
            ('excess kurtosis', kurt),
        )
        for label, value in verbose_report:
            print(f'{label}: {value}')
    else:
        # Compact single-line form; shortest/longest are omitted here.
        head = f'{avg:.2f}, {mid}, {top_three},'
        tail = f'{skew:.2f}, {var:.2f}, {kurt:.2f}'
        print(head, tail)

    return successes / len(implications)
def get_seq_feature(seq, seq_name, user_id):
    """Summarise a numeric sequence as a one-row DataFrame of 11 features.

    Args:
        seq: Sequence of numbers. A falsy value (e.g. an empty list) is
            reported on stdout and produces no frame.
        seq_name: Prefix for every generated column name.
        user_id: Value stored in the ``user_id`` column.

    Returns:
        ``None`` when ``seq`` is empty, otherwise a single-row DataFrame with
        ``<seq_name>_mean``, ``_median``, ``_max``, ``_min``, ``_var``,
        ``_std``, ``_upquantile``, ``_downquantile``, ``_discrete``,
        ``<seq_name>skew``, ``<seq_name>kurt`` and ``user_id``.
    """
    if not seq:
        print('seq is empty!')
        return None

    mean = np.mean(seq)
    std = np.std(seq)

    df = pd.DataFrame()
    df[seq_name + '_mean'] = [mean]
    df[seq_name + '_median'] = [np.median(seq)]
    df[seq_name + '_max'] = [np.max(seq)]
    df[seq_name + '_min'] = [np.min(seq)]
    df[seq_name + '_var'] = [np.var(seq)]
    df[seq_name + '_std'] = [std]

    if len(seq) == 1:
        # A single observation has no spread: the element itself is used as
        # the upper quartile and 0 as the lower one. Values are wrapped in
        # lists so every column is assigned the same way.
        df[seq_name + '_upquantile'] = [seq[0]]
        df[seq_name + '_downquantile'] = [0]
    else:
        df[seq_name + '_upquantile'] = [sts.quantile(seq, p=0.75)]
        df[seq_name + '_downquantile'] = [sts.quantile(seq, p=0.25)]

    # Coefficient of variation is undefined for a zero mean.
    df[seq_name + '_discrete'] = [std / mean if mean != 0 else np.nan]

    # Higher moments are best-effort: any failure in the stats helper yields
    # NaN. `except Exception` (rather than a bare `except`) keeps that
    # behaviour without swallowing KeyboardInterrupt / SystemExit.
    # `np.nan` replaces the `np.NaN` alias, which NumPy 2.0 removed.
    # NOTE(review): these two column names have no underscore separator,
    # unlike the others; kept as-is because downstream code may rely on them.
    try:
        skew = sts.skewness(seq)
    except Exception:
        skew = np.nan
    df[seq_name + 'skew'] = [skew]

    try:
        kurt = sts.kurtosis(seq)
    except Exception:
        kurt = np.nan
    df[seq_name + 'kurt'] = [kurt]

    df['user_id'] = [user_id]
    return df
def extend_feature(scores):
    """Append summary statistics to a window of sliding-window features.

    Every statistic is computed from the original window only. The previous
    implementation aliased ``scores`` and appended to it while computing, so
    each statistic after the first was taken over a list that already
    contained the earlier statistics, and the caller's list was mutated.

    Args:
        scores: Raw features obtained from the sliding window.

    Returns:
        A new list holding the original features followed by, in order: sum,
        mean, median, 0.25 quantile, 0.75 quantile, max, min, range,
        inter-quartile range, variance, coefficient of variation (std / mean),
        skewness and kurtosis. ``scores`` itself is left unchanged.
    """
    window = list(scores)  # snapshot so the statistics see only raw features

    mean = np.mean(window)
    q_low = sts.quantile(window, p=0.25)
    q_high = sts.quantile(window, p=0.75)
    highest = np.max(window)
    lowest = np.min(window)

    statistics_block = [
        np.sum(window),           # sum
        mean,                     # mean
        np.median(window),        # median
        q_low,                    # lower quartile (p=0.25)
        q_high,                   # upper quartile (p=0.75)
        highest,                  # maximum
        lowest,                   # minimum
        highest - lowest,         # range
        q_high - q_low,           # inter-quartile range
        np.var(window),           # variance
        np.std(window) / mean,    # coefficient of variation
        sts.skewness(window),     # skewness
        sts.kurtosis(window),     # kurtosis
    ]
    return window + statistics_block
def stat_patch_list(x):
    """Build legend patches describing one sample.

    Args:
        x: Sample to describe; its ``.size`` attribute is read for the count.

    Returns:
        Five ``mpatches.Patch`` objects labelled with the element count,
        mean, standard deviation, skewness and kurtosis of ``x``, in that
        order.
    """
    legend_texts = (
        'data num:' + str((x.size)),
        'mean:' + ('%03.6f' % np.mean(x)),
        'std:' + ('%03.6f' % np.std(x)),
        'skewness:' + ('%03.3f' % stats.skewness(x)),
        'kurtosis:' + ('%03.3f' % stats.kurtosis(x)),
    )
    return [mpatches.Patch(label=text) for text in legend_texts]
def kusk(t_win, stockdata, indexes):
    """Compute sliding-window moments, volatility and return of a series.

    For every offset ``j`` in ``indexes`` the window
    ``stockdata.iloc[-j - 2 * t_win:-j - t_win]`` is summarised.

    :param t_win: window length
    :param stockdata: stock series (sliced with ``.iloc``)
    :param indexes: offsets to iterate over
    :return: DataFrame with the columns "时间" (time), "峰度" (kurtosis),
        "偏度" (skewness), "波动率" (volatility) and "收益率" (return)
    """
    sk = []   # skewness per window
    ku = []   # kurtosis per window
    std = []  # standard deviation (volatility) per window
    res = []  # simple return over the window
    for j in indexes:
        s = stockdata.iloc[-j - 2 * t_win:-j - t_win]
        print("s_length: ", len(s))
        sk.append(sts.skewness(s))
        ku.append(sts.kurtosis(s))
        std.append(np.std(s))
        res.append((s.iloc[-1] - s.iloc[0]) / s.iloc[0])

    # NOTE(review): the time column is taken from the LAST window only, and
    # `s` is unbound when `indexes` is empty; confirm that is intended.
    df = pd.DataFrame({
        "时间": pd.Series(s.date),
        # Fix: "峰度" is kurtosis and "偏度" is skewness; the two lists were
        # previously stored under each other's column.
        "峰度": pd.Series(ku),
        "偏度": pd.Series(sk),
        "波动率": pd.Series(std),
        "收益率": pd.Series(res)
    })
    return df
def _first_adjacent_pair(values, predicate):
    """Return the index of the first element that satisfies ``predicate``
    together with its immediate successor, or ``None`` if no such pair exists.
    """
    for k in range(len(values) - 1):
        if predicate(values[k]) and predicate(values[k + 1]):
            return k
    return None


def diff(file):
    """Locate skewness-based interval bounds for growing prefixes of a series.

    Reads ``../filter1/filter_euclidean/<file>_ee1.txt`` (space-separated;
    the first token of each line is skipped), sums every remaining column
    over all lines, and takes first differences of those column sums. For
    each prefix length ``i >= 3`` of the differences it computes

    * the skewness of every suffix ``y[:i][s:]`` and
    * the skewness of every prefix ``y[:i][:e]`` (``e >= 2``),

    then reports ``a`` (1-based position of the first two adjacent suffix
    skewness values below 1) and ``b`` (position + 2 of the first two
    adjacent prefix skewness values above 1). Either bound is ``''`` when no
    such pair exists. Results go to stdout and to
    ``../filter1/filter_euclidean_skwess/<file>.txt``.

    Changes relative to the earlier version:

    * both files are closed deterministically via ``with``;
    * the unreachable ``i == length`` branches were removed;
    * a trailing value that matched the threshold but had no successor used
      to raise ``IndexError``; it now counts as "no pair" (``''``).

    NOTE(review): the source arrived with its indentation collapsed; the
    nesting below (all per-prefix work inside the outer loop) is the most
    plausible reading and should be confirmed against the original file.
    NOTE(review): the removed dead branch suggests a single trailing value
    below/above the threshold may have been meant to count as a match.

    Args:
        file: Base name used to build the input and output paths.
    """
    src_path = '../filter1/filter_euclidean/' + file + '_ee1.txt'
    dst_path = '../filter1/filter_euclidean_skwess/' + file + '.txt'

    with open(src_path, 'r') as f, open(dst_path, 'w') as f_new:
        # Column-wise sums over all lines (first token of each line skipped).
        dicty = []
        for line_raw in f:
            line = line_raw.replace('\n', '').split(' ')
            if len(dicty) == 0:
                dicty = [0 for _ in range(len(line) - 1)]
            for i in range(1, len(line)):
                if dicty[i - 1] == 0:
                    dicty[i - 1] = float(line[i])
                else:
                    dicty[i - 1] += float(line[i])

        # First differences of the accumulated columns. (A tangent/angle
        # based variant used to live here as commented-out code.)
        y = [dicty[i + 1] - dicty[i] for i in range(len(dicty) - 1)]

        for i in range(3, len(y)):
            f_new.write(str(i) + '\n')
            y_new = y[:i]
            print(str(i))

            # Skewness of every suffix that has at least three elements.
            suffix_skew = [sts.skewness(y_new[s:])
                           for s in range(len(y_new) - 2)]
            print(suffix_skew)
            hit = _first_adjacent_pair(suffix_skew, lambda v: v < 1)
            a = hit + 1 if hit is not None else ''

            # Skewness of every proper prefix that has at least two elements.
            prefix_skew = [sts.skewness(y_new[:e])
                           for e in range(2, len(y_new))]
            print(prefix_skew)
            hit = _first_adjacent_pair(prefix_skew, lambda v: v > 1)
            b = hit + 2 if hit is not None else ''

            print(file + ' [' + str(a) + ' , ' + str(b) + ']')
            f_new.write(str(suffix_skew) + '\n')
            f_new.write(str(prefix_skew) + '\n')
            f_new.write(' [' + str(a) + ' , ' + str(b) + ']' + '\n')
import numpy as np
import stats as sts

# Demo: descriptive statistics of a fixed sample of 40 scores.
a = [31, 24, 23, 25, 14, 25, 13, 12, 14, 23,
     32, 34, 43, 41, 21, 23, 26, 26, 34, 42,
     43, 25, 24, 23, 24, 44, 23, 14, 52, 32,
     42, 44, 35, 28, 17, 21, 32, 42, 12, 34]
scores = np.array(a)

# Central tendency.
print('總合為:', np.sum(scores))
print('筆數為:', len(scores))
print('平均值為:', np.mean(scores))
print('中位數為:', np.median(scores))
print('眾數為:', sts.mode(scores))
# Fix: the upper quartile is the 0.75 quantile and the lower quartile the
# 0.25 quantile; the labels were previously attached to the wrong values.
print('上四分位數為', sts.quantile(scores, p=0.75))
print('下四分位數為', sts.quantile(scores, p=0.25))

# Dispersion.
print('最大值:', np.max(scores))
print('最小值:', np.min(scores))
print('全距:', np.ptp(scores))
print('標準差:', np.std(scores))
print('變異數:', np.var(scores))
print('離散係數:', np.std(scores) / np.mean(scores))

# Shape.
print('偏態係數:', sts.skewness(scores))
print('峰態係數:', sts.kurtosis(scores))
import numpy as np
import stats as sts

# Demo: descriptive statistics of a fixed sample of 40 scores.
scares = [
    31, 24, 23, 25, 14, 25, 13, 12, 14, 23,
    32, 34, 43, 41, 21, 23, 26, 26, 34, 42,
    43, 25, 24, 23, 24, 44, 23, 14, 52, 32,
    42, 44, 35, 28, 17, 21, 32, 42, 12, 34
]

# Central tendency.
print('求和:', np.sum(scares))
print('個數:', len(scares))
print('平均值:', np.mean(scares))
print('中位數:', np.median(scares))
print('眾數:', sts.mode(scares))
# Fix: the upper quartile is the 0.75 quantile and the lower quartile the
# 0.25 quantile; the labels were previously attached to the wrong values.
print('上四分位數:', sts.quantile(scares, p=0.75))
print('下四分位數:', sts.quantile(scares, p=0.25))

# Dispersion.
print('最大值:', np.max(scares))
print('最小值:', np.min(scares))
# Fix: the range is max - min; this line used to print the standard deviation.
print('極差:', np.max(scares) - np.min(scares))
print('四分位數:', sts.quantile(scares, p=0.75), sts.quantile(scares, p=0.25))
print('標準差:', np.std(scares))
print('方差', np.var(scares))
print('離散係數', np.std(scares) / np.mean(scares))

# Shape. Fix: the skewness label was misspelled as '遍度'.
print('偏度:', sts.skewness(scares))
print('峰度:', sts.kurtosis(scares))
# Spread and shape of the '身高' column; `df` is defined earlier in the file.
print('方差', df['身高'].var())
print('标准差', df['身高'].std())
print('极差', df['身高'].max() - df['身高'].min())
print('偏度', df['身高'].skew())
print('峰度', df['身高'].kurt())

import numpy as np
import stats as sts

scores = [1, 2, 2, 2, 5]

# Measures of central tendency.
print('求和:', np.sum(scores))
print('个数:', len(scores))
print('平均值:', np.mean(scores))
print('中位数:', np.median(scores))
print('众数:', sts.mode(scores))
# Fix: the upper quartile is the 0.75 quantile and the lower quartile the
# 0.25 quantile; the labels were previously attached to the wrong values.
print('上四分位数', sts.quantile(scores, p=0.75))
print('下四分位数', sts.quantile(scores, p=0.25))

# Measures of dispersion.
print('最大值:', np.max(scores))
print('最小值:', np.min(scores))
print('极差:', np.max(scores) - np.min(scores))
print('四分位差', sts.quantile(scores, p=0.75) - sts.quantile(scores, p=0.25))
print('标准差:', np.std(scores))
print('方差:', np.var(scores))
print('离散系数:', np.std(scores) / np.mean(scores))

# Measures of skewness and kurtosis.
print('偏度:', sts.skewness(scores))
print('峰度:', sts.kurtosis(scores))
# Measures of central tendency; `data` is defined earlier in the file.
print('个数:', len(data))
print('平均值:', np.mean(data))
print('中位数:', np.median(data))
print('众数:', sts.mode(data))
# Fix: the upper quartile is the 0.75 quantile and the lower quartile the
# 0.25 quantile; the labels were previously attached to the wrong values.
print('上四分位数', sts.quantile(data, p=0.75))
print('下四分位数', sts.quantile(data, p=0.25))

# Measures of dispersion.
print('最大值:', np.max(data))
print('最小值:', np.min(data))
print('极差:', np.max(data) - np.min(data))
print('四分位差', sts.quantile(data, p=0.75) - sts.quantile(data, p=0.25))
print('标准差:', np.std(data))
print('方差:', np.var(data))
print('变异系数:', np.std(data) / np.mean(data))

# Measures of skewness and kurtosis.
print('偏度:', sts.skewness(data))
print('峰度:', sts.kurtosis(data))

# Randomly generate two samples of 1000 integers drawn from [0, 9).
x = np.random.randint(0, 9, 1000)
y = np.random.randint(0, 9, 1000)

# Means.
mx = x.mean()
my = y.mean()

# Standard deviations.
stdx = x.std()
stdy = y.std()

# Compute the covariance matrix
def skewness(self, data):
    """Print the skewness of ``data`` as computed by ``sts.skewness``.

    Nothing is returned; the value is written to stdout after the label
    '偏度:' (skewness).
    """
    value = sts.skewness(data)
    print('偏度:', value)
import pandas as pd
import numpy as np
import stats as sts

# Two 15x5 feature tables, one row per CSV file. Columns are:
# [coefficient of variation, range, skewness, kurtosis, class label].
post = np.zeros((15, 5))
pre = np.zeros((15, 5))

# Both groups go through the same extraction; "post" files carry the label
# '1' and "pre" files the label '0'.
for table, folder, label in ((post, 'post', '1'), (pre, 'pre', '0')):
    for i in range(15):
        path = 'data/' + folder + '/' + str(i + 1) + '.csv'
        # Skip the first row of the file, then keep positions 0..20003 of
        # column 'V' (the older comment said 15,000 rows; the slice is 20004).
        V = pd.read_csv(path, skiprows=1)['V'][0:20004]
        table[i, 0] = np.std(V) / np.mean(V)  # coefficient of variation
        table[i, 1] = np.max(V) - np.min(V)   # range
        table[i, 2] = sts.skewness(V)         # skewness
        table[i, 3] = sts.kurtosis(V)         # kurtosis
        table[i, 4] = label                   # class label
def Ts_skewness(x, n):
    """Return ``stats.skewness`` of the trailing slice ``x[-n:]``.

    Args:
        x: Sliceable sequence of observations.
        n: Number of trailing observations to use. Standard slicing rules
            apply, so ``n == 0`` selects the whole of ``x``.
    """
    tail = x[-n:]
    return stats.skewness(tail)
# -*- coding: utf-8 -*-
"""
Minimal demonstration of ``stats.skewness`` on a small hard-coded sample.

@author: Daniel
@contact: [email protected]
@file: skew_learn.py
@time: 2017/7/25 9:10
"""
import stats

# Eight sample observations handed to the skewness routine.
sample = [1.25, 1.5, 1.5, 1.75, 1.75, 2.5, 2.75, 4.5]
test = stats.skewness(sample)
print(test)
# Descriptive statistics of two samples; `list1` and `list2` are defined
# earlier in the file.
total1 = stats.total(list1)
total2 = stats.total(list2)
mean1 = stats.mean(list1)
mean2 = stats.mean(list2)
mode1 = stats.mode(list1)
mode2 = stats.mode(list2)
median1 = stats.median(list1)
median2 = stats.median(list2)
variance1 = stats.variance(list1)
variance2 = stats.variance(list2)
standard_deviation1 = stats.SD(list1)
standard_deviation2 = stats.SD(list2)

# Joint statistics. Going by the variable names, the extra positional `True`
# presumably selects the sample (rather than population) estimator -- confirm
# against the `stats` module in use.
covariance_pop = stats.covariance(list1, list2)
covariance_sample = stats.covariance(list1, list2, True)
correlation = stats.correlation(list1, list2)

# Shape statistics, in both variants.
skewness_pop1 = stats.skewness(list1)
skewness_pop2 = stats.skewness(list2)
skewness_sample1 = stats.skewness(list1, True)
skewness_sample2 = stats.skewness(list2, True)
kurtosis_pop1 = stats.kurtosis(list1)
kurtosis_pop2 = stats.kurtosis(list2)
kurtosis_sample1 = stats.kurtosis(list1, True)
kurtosis_sample2 = stats.kurtosis(list2, True)

print("Total1:", total1)
print("Total2:", total2)
print("Mean1:", mean1)
# Fix: this label was missing the colon that every sibling label carries.
print("Mean2:", mean2)
print("Mode1:", mode1)
print("Mode2:", mode2)
print("Median1:", median1)