def reshape_data(self):
    ts_value = self.input_df.T.values
    ts_value = ts_value.reshape(ts_value.shape[0], ts_value.shape[1], 1)
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    data_scaled = scaler.fit_transform(ts_value)
    data_scaled = np.nan_to_num(data_scaled)
    self.data_scaled = data_scaled
def _get_random_walk():
    numpy.random.seed(0)
    # Generate a random walk time series
    n_ts, sz, d = 1, 100, 1
    dataset = random_walks(n_ts=n_ts, sz=sz, d=d)
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    return scaler.fit_transform(dataset)
def perform_sax(dataset, gram_number, symbols, segments):
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=np.std(dataset))  # Rescale time series
    dataset = scaler.fit_transform(dataset)

    # SAX transform
    sax = SymbolicAggregateApproximation(n_segments=segments,
                                         alphabet_size_avg=symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))
    # print(pd.DataFrame(sax_dataset_inv[0])[0].value_counts())
    # sax_dataset_inv = sax.fit_transform(dataset)
    # print(len(sax_dataset_inv[0]))

    # Convert result to strings
    df_sax = pd.DataFrame(sax_dataset_inv[0])
    sax_series = df_sax[0]

    # Convert sax from numeric to characters
    sax_values = sax_series.unique()
    alphabet = 'abcdefghijklmnopqrstuvw'
    sax_dict = {x: alphabet[i] for i, x in enumerate(sax_values)}
    sax_list = [sax_dict[x] for x in sax_series]

    # Convert the list of characters to n_grams based on input parameter
    tri = n_grams(gram_number, sax_list)
    # print(Counter(tri))
    return tri
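A minimal, self-contained sketch of the same SAX-to-n-gram idea. It avoids the module's `n_grams` helper (not shown above) by using a plain `zip()`-based `make_ngrams`; that helper name, the random-walk input, and the segment/alphabet sizes are illustrative assumptions, not part of the original code.

import numpy as np
import pandas as pd
from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.piecewise import SymbolicAggregateApproximation

def make_ngrams(n, tokens):
    # Build overlapping n-grams from a list of symbols
    return list(zip(*(tokens[i:] for i in range(n))))

dataset = random_walks(n_ts=1, sz=100, d=1)
dataset = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(dataset)

sax = SymbolicAggregateApproximation(n_segments=20, alphabet_size_avg=6)
sax_inv = sax.inverse_transform(sax.fit_transform(dataset))

# Map the reconstructed (repeated) values to letters, then build 3-grams
series = pd.Series(sax_inv[0].ravel())
alphabet = 'abcdefghijklmnopqrstuvw'
mapping = {v: alphabet[i] for i, v in enumerate(series.unique())}
symbols = [mapping[v] for v in series]
print(make_ngrams(3, symbols)[:5])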
def test_single_value_ts_no_nan():
    X = to_time_series_dataset([[1, 1, 1, 1]])

    standard_scaler = TimeSeriesScalerMeanVariance()
    assert np.sum(np.isnan(standard_scaler.fit_transform(X))) == 0

    minmax_scaler = TimeSeriesScalerMinMax()
    assert np.sum(np.isnan(minmax_scaler.fit_transform(X))) == 0
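A quick interactive check of the property this test relies on: scaling a constant series must not produce NaNs. The inputs are the same as in the test; the printed values are left unasserted here.

import numpy as np
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesScalerMinMax
from tslearn.utils import to_time_series_dataset

X = to_time_series_dataset([[1, 1, 1, 1]])
print(TimeSeriesScalerMeanVariance().fit_transform(X).ravel())
print(TimeSeriesScalerMinMax().fit_transform(X).ravel())
# Neither transform should emit NaN even though the series has zero variance
print(np.isnan(TimeSeriesScalerMeanVariance().fit_transform(X)).any())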
def normalize(df):
    df_normalized = df.copy()
    normalize = TimeSeriesScalerMeanVariance(mu=0, std=1)
    for col in df:
        df_normalized[col] = normalize.fit_transform(df_normalized[col])[0]
    return df_normalized
def getStdData(originData):
    n_paa_segments = 120  # split each day into 4 parts: every 6 hours is aggregated into one segment
    paa_data = PiecewiseAggregateApproximation(
        n_segments=n_paa_segments).fit_transform(originData)
    # Mean-variance normalization
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    dataset = scaler.fit_transform(paa_data)
    dataset = dataset.reshape(dataset.shape[0], dataset.shape[1])
    return dataset
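A hedged usage sketch for getStdData: the three 480-sample random walks below stand in for the daily series the comment implies (the exact sampling rate is an assumption); the only hard requirement is that each input series has at least 120 points.

from tslearn.generators import random_walks

# Three synthetic series, long enough for 120 PAA segments
origin = random_walks(n_ts=3, sz=480, d=1)
std_data = getStdData(origin)
print(std_data.shape)  # (3, 120)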
def _transform(self, X, y=None):
    n_ts, sz, d = X.shape
    if d > 1:
        raise NotImplementedError("We currently don't support using "
                                  "multi-dimensional matrix profiles "
                                  "from the stumpy library.")
    output_size = sz - self.subsequence_length + 1
    X_transformed = np.empty((n_ts, output_size, 1))

    if self.implementation == "stump":
        if not STUMPY_INSTALLED:
            raise ImportError(stumpy_msg)
        for i_ts in range(n_ts):
            result = stumpy.stump(T_A=X[i_ts, :, 0].ravel(),
                                  m=self.subsequence_length)
            X_transformed[i_ts, :, 0] = result[:, 0].astype(float)
    elif self.implementation == "gpu_stump":
        if not STUMPY_INSTALLED:
            raise ImportError(stumpy_msg)
        for i_ts in range(n_ts):
            result = stumpy.gpu_stump(T_A=X[i_ts, :, 0].ravel(),
                                      m=self.subsequence_length)
            X_transformed[i_ts, :, 0] = result[:, 0].astype(float)
    elif self.implementation == "numpy":
        scaler = TimeSeriesScalerMeanVariance()
        band_width = int(np.ceil(self.subsequence_length / 4))
        for i_ts in range(n_ts):
            segments = _series_to_segments(X[i_ts], self.subsequence_length)
            if self.scale:
                segments = scaler.fit_transform(segments)
            n_segments = segments.shape[0]
            segments_2d = segments.reshape((-1, self.subsequence_length * d))
            dists = squareform(pdist(segments_2d, "euclidean"))
            # Exclude trivial matches inside the diagonal band
            band = (np.tri(n_segments, n_segments, band_width, dtype=bool)
                    & ~np.tri(n_segments, n_segments, -(band_width + 1),
                              dtype=bool))
            dists[band] = np.inf
            X_transformed[i_ts] = dists.min(axis=1, keepdims=True)
    else:
        available_implementations = ["numpy", "stump", "gpu_stump"]
        raise ValueError(
            'This "{}" matrix profile implementation is not'
            ' recognized. Available implementations are {}.'.format(
                self.implementation, available_implementations))
    return X_transformed
def cor(x, y):
    """Correlation-based distance (COR) between two multivariate time series
    given as arrays of shape (timesteps, dim).
    """
    scaler = TimeSeriesScalerMeanVariance()
    x_norm = scaler.fit_transform(x)
    y_norm = scaler.fit_transform(y)
    pcc = np.mean(x_norm * y_norm)  # Pearson correlation coefficient
    d = np.sqrt(2.0 * (1.0 - pcc + 1e-9))  # correlation-based distance
    return np.sum(d)
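A minimal usage sketch under the docstring's shape convention; the synthetic inputs and seed are illustrative only and assume the cor function above is in scope.

import numpy as np
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

rng = np.random.default_rng(0)
x = rng.normal(size=(50, 3))   # 50 timesteps, 3 dimensions
y = rng.normal(size=(50, 3))

# Smaller values indicate more strongly correlated series
print(cor(x, y))
print(cor(x, x))  # self-distance is close to 0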
def saa_pax(dataset, title):
    """Show the graph of PAA and SAX of time series data.

    :param dataset: time series of a stock
    :param title: label appended to each subplot title
    :return:
    """
    n_ts, sz, d = 1, 100, 1
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
    dataset = scaler.fit_transform(dataset)

    # PAA transform (and inverse transform) of the data
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

    # SAX transform
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))

    # 1d-SAX transform
    n_sax_symbols_avg = 8
    n_sax_symbols_slope = 8
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
    one_d_sax_dataset_inv = one_d_sax.inverse_transform(
        one_d_sax.fit_transform(dataset))

    plt.figure()
    plt.subplot(2, 2, 1)  # First, raw time series
    plt.plot(dataset[0].ravel(), "b-")
    plt.title("Raw time series " + title)

    plt.subplot(2, 2, 2)  # Second, PAA
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(paa_dataset_inv[0].ravel(), "b-")
    plt.title("PAA " + title)

    plt.subplot(2, 2, 3)  # Then SAX
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(sax_dataset_inv[0].ravel(), "b-")
    plt.title("SAX, %d symbols" % n_sax_symbols)

    plt.subplot(2, 2, 4)  # Finally, 1d-SAX
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(one_d_sax_dataset_inv[0].ravel(), "b-")
    plt.title("1d-SAX, %d symbols (%dx%d)" % (n_sax_symbols_avg * n_sax_symbols_slope,
                                              n_sax_symbols_avg,
                                              n_sax_symbols_slope))
    plt.tight_layout()
    plt.show()
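A hedged usage sketch: the random-walk input and title string are placeholders, and it assumes the matplotlib and tslearn imports the function above relies on are already in scope.

from tslearn.generators import random_walks

dataset = random_walks(n_ts=1, sz=100, d=1)
saa_pax(dataset, "(random walk)")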
def ApplyPaa(n_paa_segments, df, ckt):
    circuito = ckt
    print("Number of PAA segments: {}".format(n_paa_segments))
    paa = PiecewiseAggregateApproximation(n_paa_segments)
    scaler = TimeSeriesScalerMeanVariance()
    dadosPaa = df
    for i in range(0, len(df)):
        dataset = scaler.fit_transform(df[i])
        dadosPaa[i] = paa.inverse_transform(paa.fit_transform(dataset))[0]
    dadosPaa = dadosPaa.T
    return dadosPaa
def check_classifiers_classes(name, classifier_orig):
    # Case of shapelet models
    if name == 'SerializableShapeletModel':
        raise SkipTest('Skipping check_classifiers_classes for shapelets'
                       ' due to convergence issues...')
    elif name == 'ShapeletModel':
        X_multiclass, y_multiclass = _create_large_ts_dataset()
        classifier_orig = clone(classifier_orig)
        classifier_orig.max_iter = 1000
    else:
        X_multiclass, y_multiclass = _create_small_ts_dataset()

    X_multiclass, y_multiclass = shuffle(X_multiclass, y_multiclass,
                                         random_state=7)

    scaler = TimeSeriesScalerMeanVariance()
    X_multiclass = scaler.fit_transform(X_multiclass)
    X_multiclass = np.reshape(X_multiclass, (X_multiclass.shape[0],
                                             X_multiclass.shape[1]))

    X_binary = X_multiclass[y_multiclass != 2]
    y_binary = y_multiclass[y_multiclass != 2]

    X_multiclass = pairwise_estimator_convert_X(X_multiclass, classifier_orig)
    X_binary = pairwise_estimator_convert_X(X_binary, classifier_orig)

    labels_multiclass = ["one", "two", "three"]
    labels_binary = ["one", "two"]

    y_names_multiclass = np.take(labels_multiclass, y_multiclass)
    y_names_binary = np.take(labels_binary, y_binary)

    problems = [(X_binary, y_binary, y_names_binary)]
    if not classifier_orig._get_tags()['binary_only']:
        problems.append((X_multiclass, y_multiclass, y_names_multiclass))

    for X, y, y_names in problems:
        for y_names_i in [y_names, y_names.astype('O')]:
            y_ = choose_check_classifiers_labels(name, y, y_names_i)
            check_classifiers_predictions(X, y_, name, classifier_orig)

    labels_binary = [-1, 1]
    y_names_binary = np.take(labels_binary, y_binary)
    y_binary = choose_check_classifiers_labels(name, y_binary, y_names_binary)
    check_classifiers_predictions(X_binary, y_binary, name, classifier_orig)
def test_variable_length_knn():
    X = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3], [9, 8, 7, 6, 5, 2],
                                [8, 7, 6, 5, 3]])
    y = [0, 0, 1, 1]

    clf = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    clf = KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    scaler = TimeSeriesScalerMeanVariance()
    clf = KNeighborsTimeSeriesClassifier(metric="sax", n_neighbors=1,
                                         metric_params={'n_segments': 2})
    X_transf = scaler.fit_transform(X)
    clf.fit(X_transf, y)
    assert_allclose(clf.predict(X_transf), [0, 0, 1, 1])
def _transform(self, X, y=None):
    n_ts, sz, d = X.shape
    output_size = sz - self.subsequence_length + 1
    X_transformed = numpy.empty((n_ts, output_size, 1))
    scaler = TimeSeriesScalerMeanVariance()
    for i_ts in range(n_ts):
        Xi = X[i_ts]
        elem_size = Xi.strides[0]
        # Build a read-only sliding-window view of every subsequence without copying
        segments = as_strided(
            Xi, strides=(elem_size, elem_size, Xi.strides[1]),
            shape=(Xi.shape[0] - self.subsequence_length + 1,
                   self.subsequence_length, d),
            writeable=False)
        if self.scale:
            segments = scaler.fit_transform(segments)
        segments_2d = segments.reshape((-1, self.subsequence_length * d))
        dists = squareform(pdist(segments_2d, "euclidean"))
        numpy.fill_diagonal(dists, numpy.inf)
        X_transformed[i_ts] = dists.min(axis=1, keepdims=True)
    return X_transformed
def get_distance_matrix(numpy_array):
    sc = TimeSeriesScalerMeanVariance()
    X_s = sc.fit_transform(to_time_series_dataset(numpy_array))
    size = len(X_s)
    idx = [(i, j) for i in range(0, size) for j in range(i + 1, size)]

    def calc_dtw(my_idx):
        i, j = my_idx
        return dtw(X_s[i], X_s[j])

    with mp.Pool(mp.cpu_count() - 1) as p:
        distances = p.map(calc_dtw, idx)

    dm = np.zeros(shape=(size, size))
    for (i, j), v in zip(idx, distances):
        dm[i, j] = v
        dm[j, i] = v
    return dm
def approximate(self, series: np.ndarray, window: int = 1,
                should_fit: bool = True) -> np.ndarray:
    # series is already in batches
    debug('TSLearnApproximatorWrapper.approximate: series shape {}'.format(
        series.shape))
    debug('TSLearnApproximatorWrapper.approximate: to_time_series shape {}'.format(
        series.shape))
    ts_representation = list()
    debug(f'TSLearnApproximatorWrapper.approximate: param series \n{series} ')

    for segment in series:
        if isinstance(self.transformer, SymbolicAggregateApproximation) or \
                isinstance(self.transformer, OneD_SymbolicAggregateApproximation):
            logger.info(
                "Scaling the data so that they follow a standard normal distribution.")
            scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
            segment = scaler.fit_transform(segment)
        ts_representation.append(self.transformer.fit_transform(segment))

    # debug('TSLearnApproximatorWrapper.approximate: ts_representation \n{}'.format(ts_representation))
    debug('TSLearnApproximatorWrapper.approximate: ts_representation shape {}'.format(
        np.shape(ts_representation)))
    ts_representation = np.reshape(
        ts_representation,
        (np.shape(ts_representation)[0],
         np.shape(ts_representation)[1] * np.shape(ts_representation)[2]))
    debug('TSLearnApproximatorWrapper.approximate: ts_representation \n{}'.format(
        ts_representation))
    debug('TSLearnApproximatorWrapper.approximate: ts_representation shape {}'.format(
        ts_representation.shape))
    return ts_representation
def ApplyPaa(n_paa_segments, df):
    '''
    Applies PAA to the given dataframe.
    :param n_paa_segments: number of PAA segments used for data reduction
    :param df: dataframe holding the data to which PAA should be applied
    :return: df after PAA has been applied
    '''
    df = df.values.T.tolist()
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    dadosPaa = scaler.fit_transform(df)
    print("Number of PAA segments: {}".format(n_paa_segments))
    paa = PiecewiseAggregateApproximation(n_paa_segments)
    dadosPaa = paa.inverse_transform(paa.fit_transform(dadosPaa))
    df = pd.DataFrame()
    for i in range(len(dadosPaa.T)):
        for j in range(len(dadosPaa.T[0])):
            df[j] = dadosPaa.T[i][j]
    return df
def train_nn(
    dataset: str, batch_size: int, depth: int, epochs: int
) -> Tuple[CNN,
           Tuple[Union[np.ndarray, np.ndarray], Union[np.ndarray, np.ndarray]],
           Tuple[Union[np.ndarray, np.ndarray], Union[np.ndarray, np.ndarray]]]:
    experiment = Experiment(project_name="cphap", auto_output_logging=False)
    experiment.add_tag(dataset)
    experiment.add_tag("NN-depth-{}".format(depth))
    (x_train, y_train), (x_test, y_test) = fetch_dataset(dataset)
    scaler = TimeSeriesScalerMeanVariance()
    x_train: np.ndarray = scaler.fit_transform(x_train)
    x_test: np.ndarray = scaler.transform(x_test)
    x_train = x_train.transpose((0, 2, 1)).astype(np.float32)
    x_test = x_test.transpose((0, 2, 1)).astype(np.float32)
    n_features = x_train.shape[1]
    n_targets = len(np.unique(y_train))
    train_ds = get_dataset(x_train, y_train)
    test_ds = get_dataset(x_test, y_test)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
    model = CNN(n_features, 32, n_targets, depth=depth)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()
    runner = ClassificationRunner(model, optimizer, criterion, experiment)
    runner.add_loader("train", train_loader)
    runner.add_loader("test", test_loader)
    runner.train_config(epochs=epochs)
    runner.run()
    runner.quite()

    return runner.model.eval(), (x_train, x_test), (y_train, y_test)
def normalize_series(series):
    # Rescale series to mean 0 and unit variance
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    return scaler.fit_transform(series)
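A short usage sketch for the helper above; the input values are arbitrary. Note that tslearn scalers return a 3D dataset of shape (n_ts, sz, 1), so a single 1D input comes back as (1, sz, 1).

import numpy as np

series = np.array([10., 12., 14., 16.])
scaled = normalize_series(series)
print(scaled.shape)    # (1, 4, 1)
print(scaled.ravel())  # zero mean, unit variance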
import numpy as np

from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.piecewise import PiecewiseAggregateApproximation, _paa_to_symbols
from tslearn.piecewise import SymbolicAggregateApproximation, \
    OneD_SymbolicAggregateApproximation

np.random.seed(0)
# Generate a random walk time series
# n_ts, sz, d = 1, 100, 1
# dataset = random_walks(n_ts=n_ts, sz=sz, d=d)
# scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
# dataset = scaler.fit_transform(dataset)

# load txt
data = np.loadtxt('sorted/Beef.txt', delimiter=',')
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
dataset = scaler.fit_transform(data[:, 1:])
rows, cols = data.shape
print(rows, cols)

# PAA transform (and inverse transform) of the data
n_paa_segments = 10
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

# SAX transform
n_sax_symbols = 8
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))
sax_data = sax.fit_transform(data)
print(sax_data)
# @Time   : 2018/5/21 17:05
# @Author : Inkky
# @Email  : [email protected]
'''
PAA DRAW FIG
'''
from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.piecewise import PiecewiseAggregateApproximation
import numpy as np
import matplotlib.pyplot as plt

# Draw ecg200
data = np.loadtxt('data/ecg200.txt', delimiter=',')
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
data_score = scaler.fit_transform(data)
rows, cols = data.shape

# PAA transform (and inverse transform) of the data
n_paa_segments = 1
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_dataset_inv = paa.inverse_transform(paa.fit_transform(data))
a = np.mean(paa_dataset_inv.ravel())
print(a)

plt.figure(1)
fig = plt.gcf()
fig.set_size_inches(6, 3)
plt.plot(data_score[2].ravel(), "b-", label='Raw', linewidth=2.5, alpha=0.6)
plt.plot(paa_dataset_inv[2].ravel(), 'r-', label='PAA', linewidth=2.5)
# print(data_score[2].ravel())
x_new = np.linspace(0, 50)
                             key=lambda x: -x[1]))

    # Re-sample the test and train sets with the same sizes, stratified
    if X_test is None or len(X_test) == 0:
        continue
    nr_test_samples = len(X_test)
    X = np.vstack((X_train, X_test))
    y = np.vstack((np.reshape(y_train, (-1, 1)),
                   np.reshape(y_test, (-1, 1))))
    y = pd.Series(np.reshape(y, (-1, )))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=nr_test_samples)
    test_idx = y_test.index
    train_idx = y_train.index

    scaler = TimeSeriesScalerMeanVariance()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1]))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1]))

    # Map labels to [0, .., C-1]
    map_dict = {}
    for j, c in enumerate(sorted(set(y_train))):
        map_dict[c] = j
    y_train = y_train.map(map_dict).values
    y_test = y_test.map(map_dict).values

    timestamp = int(time.time())
    pd.DataFrame(
            ('Plane', 64),
            ('Car', 256),
            ('Beef', 128),
            ('Coffee', 128),
            ('OliveOil', 256)]

# We will compare the accuracies & execution times of 1-NN using:
# (i) MINDIST on SAX representations, and
# (ii) euclidean distance on raw values
knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='sax')
knn_eucl = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='euclidean')

accuracies = {}
times = {}
for dataset, w in datasets:
    X_train, y_train, X_test, y_test = data_loader.load_dataset(dataset)

    ts_scaler = TimeSeriesScalerMeanVariance()
    X_train = ts_scaler.fit_transform(X_train)
    X_test = ts_scaler.fit_transform(X_test)

    # Fit 1-NN using SAX representation & MINDIST
    metric_params = {'n_segments': w, 'alphabet_size_avg': 10}
    knn_sax = clone(knn_sax).set_params(metric_params=metric_params)
    start = time.time()
    knn_sax.fit(X_train, y_train)
    acc_sax = accuracy_score(y_test, knn_sax.predict(X_test))
    time_sax = time.time() - start

    # Fit 1-NN using euclidean distance on raw values
    start = time.time()
    knn_eucl.fit(X_train, y_train)
    acc_euclidean = accuracy_score(y_test, knn_eucl.predict(X_test))
    time_euclidean = time.time() - start
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy
from scipy.signal import find_peaks

from tslearn import metrics
from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

numpy.random.seed(0)
n_ts, sz, d = 2, 100, 1
n_repeat = 5
dataset = random_walks(n_ts=n_ts, sz=sz, d=d)
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
dataset_scaled = scaler.fit_transform(dataset)

# We repeat the long sequence multiple times to generate multiple possible
# matches
long_sequence = numpy.tile(dataset_scaled[1], (n_repeat, 1))
short_sequence = dataset_scaled[0]

sz1 = len(long_sequence)
sz2 = len(short_sequence)

print('Shape long sequence: {}'.format(long_sequence.shape))
print('Shape short sequence: {}'.format(short_sequence.shape))

# Calculate the accumulated cost matrix
mat = metrics.subsequence_cost_matrix(short_sequence, long_sequence)
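A possible continuation, sketched under the assumption that tslearn's subsequence_path helper is available alongside subsequence_cost_matrix: it picks the lowest-cost end position in the last row of the accumulated cost matrix and traces the alignment path back from there.

# Best match ends where the last row of the accumulated cost matrix is minimal
best_end = mat[-1, :].argmin()
path = metrics.subsequence_path(mat, best_end)
print('Best match ends at index {} with cost {:.3f}'.format(best_end, mat[-1, best_end]))
print('First / last alignment pairs: {} ... {}'.format(path[0], path[-1]))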
# 3. Run a first, simple clustering with an algorithm such as k-means
# 4. Run a more sophisticated clustering on top of the simple one
# 5. Predict with the baseline; use each residual series to predict the oscillation amplitude

# 1. Outlier removal on the raw data: sort the raw data and drop the lowest 1% and the
#    highest 1%, i.e. roughly 2% in total. If a removed value has no neighbouring point,
#    delete it and fill the gap by linear interpolation; otherwise clip it with
#    min/max suppression.
# Time series outlier removal, method 1: the approach above.
# Time series outlier removal, method 2: normalize the raw data, sort by the absolute
# deviation from the mean, drop the top 5% and fill the gaps by linear interpolation.
ratio = 0.05  # fraction of points treated as outliers

# Normalization
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
stdData = scaler.fit_transform(formatted_dataset)

# Rescale every series to a common mean and variance
# (extreme values may cause small deviations from the targets)
DistAvg = 2000
VarAvg = 15000
for i in range(len(formatted_dataset)):
    repres = stdData[i]
    formatted_dataset[i] = repres * np.sqrt(VarAvg) + DistAvg

# For each row, sort the absolute values and drop the largest 5%,
# then fill the gaps by linear interpolation
for index, row in enumerate(stdData):
    element = abs(row)
    element = element.ravel()
    element.sort()
    maxNum = element[-1 * int(ratio * len(element))]
def main():
    # FOR NOAA DB
    influx_url = "http://localhost:8086/query?db=" + dbname + \
        "&epoch=ms&q=SELECT+%22water_level%22+FROM+%22h2o_feet%22+WHERE+time+%3E%3D+1440658277944ms+and+time+%3C%3D+1441435694328ms"
    r = requests.get(influx_url)
    json_dict = json.loads(r.content)
    data = json_dict["results"][0]["series"][0]["values"]
    print(data[0:5])

    # NOTE: just for NOAA h2o_feet
    time_interval = data[2][0] - data[0][0]
    print("time interval:", time_interval)

    lst2 = [item[1] for item in data]
    n_segments = len(lst2)
    print(max(lst2), min(lst2))
    original_data_size = len(lst2)
    print("original data size:", original_data_size)
    alphabet_size_avg = math.ceil(max(lst2) - min(lst2))
    print("alphabet size avg:", alphabet_size_avg)

    # A list of sample ratios.
    # We want to select the smallest ratio that stays within the similarity range.
    ratiolist = [0.025, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6]
    sizelist = []
    distlist = []
    for ratio in ratiolist:
        print()
        print("ratio:", ratio)

        # Generate sample data
        sample_size = math.floor(original_data_size * ratio)
        sizelist.append(sample_size)
        print("sample_size:", sample_size)

        # NOAA DB: h2o_feet
        sample_url = "http://localhost:8086/query?db=" + dbname + \
            "&epoch=ms&q=SELECT+sample%28%22water_level%22%2C" + str(sample_size) + \
            "%29+FROM+%22h2o_feet%22+WHERE+time+%3E%3D+1440658277944ms+and+time+%3C%3D+1441435694328ms"
        r2 = requests.get(sample_url)
        json_dict2 = json.loads(r2.content)
        sampled_data = json_dict2["results"][0]["series"][0]["values"]  # [[time, value], ...]
        sample = [item[1] for item in sampled_data]  # [value, ...]

        # Fill the sample data with a linear model
        start_x = data[0][0]
        end_x = data[-1][0]
        current_x = start_x
        current_loc = 0
        slope = (sampled_data[current_loc][1] - sampled_data[current_loc + 2][1]) \
            / (sampled_data[current_loc][0] - sampled_data[current_loc + 2][0])  # NOTE!
        intersection = sampled_data[current_loc][1] - slope * sampled_data[current_loc][0]
        sample_fit = []
        end_sample_x = sampled_data[-1][0]
        while current_x <= end_sample_x:
            if current_x >= sampled_data[current_loc + 1][0] and \
                    current_loc + 1 < len(sampled_data) - 2:  # NOTE: -2 !! CHANGE TO -1 LATER
                current_loc += 1
                # NOTE: +2 was just for h2o_feet
                if (sampled_data[current_loc][0] - sampled_data[current_loc + 1][0]) == 0:
                    slope = (sampled_data[current_loc][1] - sampled_data[current_loc + 1][1]) \
                        / (sampled_data[current_loc][0] - sampled_data[current_loc + 2][0])
                else:
                    slope = (sampled_data[current_loc][1] - sampled_data[current_loc + 1][1]) \
                        / (sampled_data[current_loc][0] - sampled_data[current_loc + 1][0])
                intersection = sampled_data[current_loc][1] - slope * sampled_data[current_loc][0]
            sample_fit.append([current_x, slope * current_x + intersection])
            current_x += time_interval  # 1000 ms

        # Chop the original data to match the linearly interpolated sample data
        chopped_data = []
        for item in data:
            if item[0] >= sample_fit[0][0] and item[0] <= sample_fit[-1][0]:
                chopped_data.append(item)
        print("size of chopped_data:", len(chopped_data))
        chopped_lst2 = [item[1] for item in chopped_data]
        chopped_len = len(chopped_lst2)

        # Build a SAX model for the chopped original data
        sax = SymbolicAggregateApproximation(chopped_len, alphabet_size_avg)
        scalar = TimeSeriesScalerMeanVariance(mu=0., std=1.)
        sdb = scalar.fit_transform(chopped_lst2)
        sax_data = sax.transform(sdb)
        s3 = sax.fit_transform(sax_data)

        # Build a SAX model for the linearly interpolated sampled data
        sample_fit_extract = [item[1] for item in sample_fit]
        fit_sample_data = scalar.fit_transform(sample_fit_extract)
        sax_sample_data = sax.transform(fit_sample_data)
        s4 = sax.fit_transform(sax_sample_data)

        # Compute the distance between the two datasets to measure their similarity
        dist = sax.distance_sax(s3[0], s4[0])
        print("distance:", dist)
        norm_dist = 1000 * dist / chopped_len
        distlist.append(norm_dist)
        print("normalized distance: {:.4f}".format(norm_dist))

    plotdist(ratiolist, distlist)
import numpy
import matplotlib.pyplot as plt

from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.matrix_profile import MatrixProfile

import warnings
warnings.filterwarnings('ignore')

# Set a seed to ensure determinism
numpy.random.seed(42)

# Load the Trace dataset
X_train, y_train, _, _ = CachedDatasets().load_dataset("Trace")

# Normalize the time series
scaler = TimeSeriesScalerMeanVariance()
X_train = scaler.fit_transform(X_train)

# Take the first time series
ts = X_train[0, :, :]

# We will take the spike as a segment
subseq_len = 20
start = 45
segment = ts[start:start + subseq_len]

# Create our matrix profile
matrix_profiler = MatrixProfile(subsequence_length=subseq_len, scale=True)
mp = matrix_profiler.fit_transform([ts]).flatten()

# Create a grid for our plots
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True)
# 3. Run a first, simple clustering with an algorithm such as k-means
# 4. Run a more sophisticated clustering on top of the simple one
# 5. Predict with the baseline; use each residual series to predict the oscillation amplitude

# 1. Outlier removal on the raw data: sort the raw data and drop the lowest 1% and the
#    highest 1%, i.e. roughly 2% in total. If a removed value has no neighbouring point,
#    delete it and fill the gap by linear interpolation; otherwise clip it with
#    min/max suppression.
# Time series outlier removal, method 1: the approach above.
# Time series outlier removal, method 2: normalize the raw data, sort by the absolute
# deviation from the mean, drop the top 5% and fill the gaps by linear interpolation.
ratio = 0.05  # fraction of points treated as outliers

# Normalization
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
stdData = scaler.fit_transform(formatted_dataset)

# For each row, sort the absolute values and drop the largest 5%,
# then fill the gaps by linear interpolation
for index, row in enumerate(stdData):
    element = abs(row)
    element = element.ravel()
    element.sort()
    maxNum = element[-1 * int(ratio * len(element))]
    del element

# For special cases, apply extreme-value suppression.
# Whenever a linear interpolation is possible, prefer linear interpolation.
# Concretely: abnormal points at the very beginning or end are clipped with the maximum value.
# Use an auxiliary array in one pass to mark the deleted middle section.
# For the deleted parts, find the nearest points on both sides and interpolate linearly.
# previous is the first abnormal point
previous = -1
def arc_length(angle_1, angle_2, r=1.):
    """Length of the arc between two angles (in rad) on a circle of
    radius r.
    """
    # Compute the angle between the two inputs between 0 and 2*pi.
    theta = np.mod(angle_2 - angle_1, 2 * pi)
    if theta > pi:
        theta = theta - 2 * pi

    # Return the length of the arc
    L = r * np.abs(theta)
    return L


dataset_1 = random_walks(n_ts=n_ts, sz=sz, d=1)
scaler = TimeSeriesScalerMeanVariance(mu=0., std=pi)  # Rescale the time series
dataset_scaled_1 = scaler.fit_transform(dataset_1)

# DTW using a function as the metric argument
path_1, sim_1 = metrics.dtw_path_from_metric(dataset_scaled_1[0],
                                             dataset_scaled_1[1],
                                             metric=arc_length)

# Example 2 : Hamming distance between 2 multi-dimensional boolean time series
rw = random_walks(n_ts=n_ts, sz=sz, d=15, std=.3)
dataset_2 = np.mod(np.floor(rw), 4) == 0

# DTW using one of the options of sklearn.metrics.pairwise_distances
path_2, sim_2 = metrics.dtw_path_from_metric(dataset_2[0],
                                             dataset_2[1],
                                             metric="hamming")
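A small sanity check of arc_length with numbers that are easy to verify by hand; it assumes numpy and math.pi are imported as in the snippet above.

from math import pi
import numpy as np

# The arc between angles just below and just above 0 is short,
# even though their numeric difference is close to 2*pi.
print(arc_length(0.1, 2 * pi - 0.1))   # ~0.2
print(arc_length(0.0, pi))             # ~3.1416 (half the unit circle)
print(arc_length(0.0, pi, r=2.))       # ~6.2832 (radius scales the arc)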
for idxname in Stock_target.iloc[0:3].index.tolist():
    pos_relatedStock.append(idxname[1])
print("Positive cov: ", pos_relatedStock)
print("Num Stock: ", len(pos_relatedStock))

# Plotting Graph
plt.figure()
graph_idx = 0

# Transform PAA, SAX, 1d-SAX
for stockCode in pos_relatedStock:
    dataset = dfpivot['v_updownpercent'][stockCode]
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
    dataset = scaler.fit_transform(dataset)

    # PAA transform (and inverse transform) of the data
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

    # SAX transform
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))

    # 1d-SAX transform
    n_sax_symbols_avg = 8
    n_sax_symbols_slope = 8
# No. of companies with >60 records
listnew = df_new["name"].unique().tolist()
len(listnew)
print(listnew)

df_red = df_new.set_index(['name', 'day']).dif.dropna()
print(df_red)

scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
n_paa_segments = 10
n_sax_symbols = 10
n_sax_symbols_avg = 10
n_sax_symbols_slope = 6

for i in listnew:
    records = len(df_red[[i]])
    print("stockname" + str(i))
    scaleddata = scaler.fit_transform(df_red[[i]])
    # print(scaleddata)

    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(scaleddata))

    # SAX transform
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(scaleddata))

    # 1d-SAX transform
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
    one_d_sax_dataset_inv = one_d_sax.inverse_transform(
        one_d_sax.fit_transform(scaleddata))

    plt.figure()