def genListPAA(instances_nor, windowSize, timestamp):
    """Sketch a normalized series with PAA and bundle it with its timestamps.

    :param instances_nor: normalized time-series dataset (tslearn layout)
    :param windowSize: number of PAA segments to compress the series into
    :param timestamp: timestamps associated with the series, passed through
    :return: dict with the PAA-reconstructed values and the timestamps
    """
    transformer = PiecewiseAggregateApproximation(n_segments=windowSize)
    reconstructed = transformer.inverse_transform(
        transformer.fit_transform(instances_nor))
    return {
        "sketchInstances": list(reconstructed[0].ravel()),
        "timestamp": timestamp,
    }
def saa_pax(dataset, title):
    """Plot a series next to its PAA, SAX and 1d-SAX reconstructions.

    Show the graph of PAA and SAX of time series data.

    :param dataset: time series of a stock
    :param title: label appended to the raw/PAA subplot titles
    :return: None (displays a matplotlib figure)
    """
    n_ts, sz, d = 1, 100, 1  # kept from the original; not used below

    # Z-normalize before any symbolic transform.
    normalizer = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    dataset = normalizer.fit_transform(dataset)

    # PAA transform (and inverse transform) of the data.
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

    # SAX transform.
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))

    # 1d-SAX transform (separate alphabets for segment mean and slope).
    n_sax_symbols_avg = 8
    n_sax_symbols_slope = 8
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
    one_d_sax_dataset_inv = one_d_sax.inverse_transform(
        one_d_sax.fit_transform(dataset))

    raw = dataset[0].ravel()

    plt.figure()

    plt.subplot(2, 2, 1)  # first, the raw series
    plt.plot(raw, "b-")
    plt.title("Raw time series " + title)

    plt.subplot(2, 2, 2)  # PAA overlaid on the raw series
    plt.plot(raw, "b-", alpha=0.4)
    plt.plot(paa_dataset_inv[0].ravel(), "b-")
    plt.title("PAA " + title)

    plt.subplot(2, 2, 3)  # SAX reconstruction
    plt.plot(raw, "b-", alpha=0.4)
    plt.plot(sax_dataset_inv[0].ravel(), "b-")
    plt.title("SAX, %d symbols" % n_sax_symbols)

    plt.subplot(2, 2, 4)  # finally, 1d-SAX
    plt.plot(raw, "b-", alpha=0.4)
    plt.plot(one_d_sax_dataset_inv[0].ravel(), "b-")
    plt.title("1d-SAX, %d symbols (%dx%d)" % (n_sax_symbols_avg * n_sax_symbols_slope,
                                              n_sax_symbols_avg,
                                              n_sax_symbols_slope))

    plt.tight_layout()
    plt.show()
def ApplyPaa(n_paa_segments, df, ckt):
    """Apply PAA (fit + inverse transform) to every series in `df`.

    Each series ``df[i]`` is z-normalized, compressed to ``n_paa_segments``
    segments, reconstructed, and written back; the transposed container is
    returned.

    :param n_paa_segments: number of PAA segments used for data reduction
    :param df: indexable collection of series. WARNING: mutated in place —
        the reconstructed series overwrite the caller's data.
    :param ckt: circuit identifier, kept for interface compatibility
        (not used by this function)
    :return: transposed container of the PAA-reconstructed series
    """
    # Fix: the original assigned `circuito = ckt` and never used it; the
    # dead local has been removed (the parameter stays for compatibility).
    print("Quantidade de segmentos de PAA: {}".format(n_paa_segments))
    paa = PiecewiseAggregateApproximation(n_paa_segments)
    scaler = TimeSeriesScalerMeanVariance()
    # NOTE: `dadosPaa` aliases `df`, so the loop below overwrites the
    # caller's data in place — behavior preserved from the original.
    dadosPaa = df
    for i in range(len(df)):
        dataset = scaler.fit_transform(df[i])
        dadosPaa[i] = paa.inverse_transform(paa.fit_transform(dataset))[0]
    return dadosPaa.T
def ApplyPaa(n_paa_segments, df):
    """Apply PAA to the supplied dataframe.

    :param n_paa_segments: number of PAA segments for data reduction
    :param df: dataframe with the data to which PAA should be applied
    :return: dataframe after applying PAA
    """
    # Work on the column series: one list per column.
    series_list = df.values.T.tolist()
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    scaled = scaler.fit_transform(series_list)
    print("Quantidade de segmentos de PAA: {}".format(n_paa_segments))
    paa = PiecewiseAggregateApproximation(n_paa_segments)
    reconstructed = paa.inverse_transform(paa.fit_transform(scaled))

    # Rebuild a dataframe from the transposed reconstruction.
    # NOTE(review): every pass of the outer loop reassigns the same column
    # keys, so only the last `i` survives — preserved as-is; confirm intent.
    result = pd.DataFrame()
    transposed = reconstructed.T
    for i in range(len(transposed)):
        for j in range(len(transposed[0])):
            result[j] = transposed[i][j]
    return result
# Plotting Graph plt.figure() graph_idx = 0 # Transform PAA, SAX, 1d-SAX, for stockCode in pos_relatedStock: dataset = dfpivot['v_updownpercent'][stockCode] scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.) # Rescale time series dataset = scaler.fit_transform(dataset) # PAA transform (and inverse transform) of the data n_paa_segments = 10 paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments) paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset)) # SAX transform n_sax_symbols = 8 sax = SymbolicAggregateApproximation(n_segments=n_paa_segments, alphabet_size_avg=n_sax_symbols) sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset)) # 1d-SAX transform n_sax_symbols_avg = 8 n_sax_symbols_slope = 8 one_d_sax = OneD_SymbolicAggregateApproximation( n_segments=n_paa_segments, alphabet_size_avg=n_sax_symbols_avg, alphabet_size_slope=n_sax_symbols_slope) one_d_sax_dataset_inv = one_d_sax.inverse_transform(
# 然后再进行层次聚类 可以考虑自己实现 # 问题:没有本质区别 # 加速想法二:使用rank-base对一天的数据进行处理后直接Kmeans分100类,再对每一类进行区间聚类 ################################################################# # 初始,根据原始数据计算新数据 # 从paa这里 # 需要在聚类前将训练数据划分完毕。ratio必须要精心选择使得paa的值能够成为整数 ratio = 0.9 n_paa_segments = 18 paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments) originData = stdData[:, :int(ratio * stdData.shape[1])] #训练部分已经知道的原始数据 paa_mid = paa.fit_transform(originData) paa_mid = paa_mid.reshape(paa_mid.shape[0], paa_mid.shape[1]) baseData = paa.inverse_transform(paa_mid) #提取基线数据 restData = originData - baseData # 计算得到残差数据 # 模式提取(直接加和取平均后求rank-base处理,或者再做标准化进行SAX处理) # 初步想法:将每天24小时的流量重复叠加取平均,进行rank-base处理,然后跑MSE用Kmeans进行100聚类 # 想法二:在自己 # 对于100聚类中的每个聚类,再跑层次聚类进行细分,最小调到1。聚类结果衡量用类内最大相似度来进行衡量(有多不相近) # 使用SAX的dayPattern # 做法01:使用SAX提取前三天的残差信息,进行20聚类。对每个聚类内部跑complete,0.5的层次聚类。考虑到500量级要跑3分钟,平均大约是一个小时。 from sklearn.cluster import AgglomerativeClustering import time dayPattern = [] for index in range(restData.shape[0]): cuData = restData[index].ravel()
# Pairwise Euclidean distances between all pairs of training samples.
EDist_train = []
for i in range(len(y_train)):
    for j in range(len(y_train)):
        dist1 = np.sqrt(
            np.sum((np.array(X_train[i, :]) - np.array(X_train[j, :]))**2))
        EDist_train.append(dist1)
EDist_train = np.array(EDist_train)
# Reshape the flat distance lists into one row per sample.
EDist_train.resize(y_train.shape[0], int(len(EDist_train) / y_train.shape[0]))
# NOTE(review): EDist_test is populated outside this excerpt — confirm it
# is defined before this point.
EDist_test = np.array(EDist_test)
EDist_test.resize(y_test.shape[0], int(len(EDist_test) / y_test.shape[0]))
#PAA transform + PAA feature extraction
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
Xtrain_paa = paa.inverse_transform(paa.fit_transform(X_train))
Xtest_paa = paa.inverse_transform(paa.fit_transform(X_test))
# Drop the trailing axis of the reconstructions to get 2-D feature arrays.
PAA_test = Xtest_paa[:, :, 0]
PAA_train = Xtrain_paa[:, :, 0]
# Unterminated triple-quoted block (commented-out code) preserved exactly;
# it continues beyond this excerpt.
''' #PAA distance calculation PAADist_train = [] PAADist_test = [] for i in range(len(y_train)): for j in range(len(y_train)): dist3 = paa.distance(Xtrain_paa[i,:],Xtest_paa[j,:]) PAADist_train.append(dist3)
print(list_new)
# Index by (name, day) and keep only the non-null `dif` values.
df_red = df_new.set_index(['name', 'day']).dif.dropna()
print(df_red)
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
# Segment / alphabet sizes shared by the PAA, SAX and 1d-SAX transforms.
n_paa_segments = 10
n_sax_symbols = 10
n_sax_symbols_avg = 10
n_sax_symbols_slope = 6
# NOTE(review): the loop iterates `listnew` while the print above uses
# `list_new` — one of the two is probably a typo; confirm which is defined.
for i in listnew:
    records = len(df_red[[i]])
    print("stockname" + str(i))
    # Z-normalize this stock's series before the symbolic transforms.
    scaleddata = scaler.fit_transform(df_red[[i]])
    #print(scaleddata)
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(scaleddata))
    # SAX transform
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(scaleddata))
    # 1d-SAX transform
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
    one_d_sax_dataset_inv = one_d_sax.inverse_transform(
        one_d_sax.fit_transform(scaleddata))
    plt.figure()
    # First, raw time series
    plt.subplot(2, 2, 1)
    plt.plot(scaleddata[0].ravel(), "b-")
# add columns' name df_price = pd.DataFrame(df_price, columns = day_features) dataset = df_price.values print("price feature sample: ") print(df_price.head()) # PAA transformation # PAA transform (and inverse transform) of the data n_paa_segments = 3 paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments) paa_list = [] for item in df_price.values: item = item.reshape((1,5,1)) paa_price_inv = paa.inverse_transform(paa.fit_transform(item)) paa_list.append(paa_price_inv) paa_array = np.array(paa_list) paa_data = paa_array.reshape(1904, 5) paa_df = pd.DataFrame(paa_data, columns = day_features) print("save time series data after PAA") paa_df.to_csv("./paa_stock_data_time_series.csv", sep=',', encoding='utf-8') print("PAA sample: ") print(paa_df.head()) n_sax_symbols = 3 sax = SymbolicAggregateApproximation(n_segments=n_paa_segments, alphabet_size_avg=n_sax_symbols) sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))
stdData[index][vi] = maxNum # 2.对去除后的数据进行归一化处理 #再进行一次归一化 from tslearn.preprocessing import TimeSeriesScalerMinMax scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.) originStdData = stdData # 保存,为了日后恢复 stdData = scaler.fit_transform(stdData) # 3.然后进行PAA处理,得到基线和残余值 from tslearn.piecewise import PiecewiseAggregateApproximation n_paa_segments = 20 paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments) paa_mid = paa.fit_transform(stdData) paa_inv = paa.inverse_transform(paa_mid) paa_inv = paa_inv.reshape(paa_inv.shape[0],paa_inv.shape[1]) # 4.对PAA后的数据进行简单k-means,聚类数量不超过10,分数按照CH分数判断,选出最大的 # 再进行rank-base处理,然后做简单聚类 from sklearn.cluster import MiniBatchKMeans,KMeans,DBSCAN,SpectralClustering,Birch from sklearn.metrics import calinski_harabasz_score,davies_bouldin_score n_cluster = 1000 s = time.time() km = KMeans(n_clusters = n_cluster,random_state = 0) y_pre = km.fit_predict(paa_inv) e = time.time() print(e-s,"s") print(calinski_harabasz_score(paa_inv,y_pre))
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tslearn.piecewise import PiecewiseAggregateApproximation
from tslearn.piecewise import SymbolicAggregateApproximation

# Load the raw vehicle-sales series from the thesis data set.
url ="C:/Users/Βασίλης/IdeaProjects/MyThesisApp/Data sets/Total_Vehicle_Sales.csv"
sales = pd.read_csv(url)
series = np.array(sales.Value)
print(series)

# PAA: compress the series to 4 segments, reconstruct it, and overlay the
# reconstruction (red) on the raw curve (faded blue).
n_paa_segments = 4
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_dataset_inv = paa.inverse_transform(paa.fit_transform(series))
plt.plot(series.ravel(), "b-", alpha=0.4)
plt.plot(paa_dataset_inv.ravel(), "r-")

# SAX over the same segmentation with a 4-letter alphabet; plot the
# symbolic reconstruction in yellow on the same axes.
n_sax_symbols = 4
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
print(sax)
sax_dataset_inv = sax.inverse_transform(sax.fit_transform(series))
print(sax_dataset_inv.ravel())
plt.plot(sax_dataset_inv.ravel(), "y-")
plt.title("SAX, %d symbols" % n_sax_symbols)
plt.show()
duration = 60
# Fetch the light-curve file list and load entry #339 for this
# height/duration combination.
listFile = ut_lc.getListLight(height=height,duration=duration)
data = ut_lc.getDataFromFile(fileName=listFile[339],height=height,duration=duration)
# Z-normalize the instances; the list wrapper makes a 1-series dataset.
lc_nor = TimeSeriesScalerMeanVariance(mu=0.,std=1.).fit_transform([data['instances']])
# data = ut_mdf.getDataFromFile("light_curve_Gaia-DR2_51856511715955968_date20191130")
# data = ut_mdf.getDataFromFile("light_curve_Gaia-DR2_602712283908074752_date20200130")
# lc_nor = TimeSeriesScalerMeanVariance(mu=0.,std=1.).fit_transform([data['instances']])
timestamps = data["timestamp"]
# PAA transform (and inverse transform) of the data
n_paa_segments = 8
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_dataset_inv = paa.inverse_transform(paa.fit_transform(lc_nor))
# SAX transform
n_sax_symbols = 25
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
sax_dataset_inv = sax.inverse_transform(sax.fit_transform(lc_nor))
# 1d-SAX transform (separate alphabets for segment mean and slope).
n_sax_symbols_avg = 5
n_sax_symbols_slope = 5
one_d_sax = OneD_SymbolicAggregateApproximation(
    n_segments=n_paa_segments,
    alphabet_size_avg=n_sax_symbols_avg,
    alphabet_size_slope=n_sax_symbols_slope)
transformed_data = one_d_sax.fit_transform(lc_nor)