def perform_sax(dataset, gram_number, symbols, segments):
    # Rescale time series
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=np.std(dataset))
    dataset = scaler.fit_transform(dataset)

    # SAX transform
    sax = SymbolicAggregateApproximation(n_segments=segments,
                                         alphabet_size_avg=symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))
    # print(pd.DataFrame(sax_dataset_inv[0])[0].value_counts())
    # sax_dataset_inv = sax.fit_transform(dataset)
    # print(len(sax_dataset_inv[0]))

    # Convert result to strings
    df_sax = pd.DataFrame(sax_dataset_inv[0])
    sax_series = df_sax[0]

    # Convert sax from numeric to characters
    sax_values = sax_series.unique()
    alphabet = 'abcdefghijklmnopqrstuvw'
    sax_dict = {x: alphabet[i] for i, x in enumerate(sax_values)}
    sax_list = [sax_dict[x] for x in sax_series]

    # Convert the list of characters to n-grams based on the input parameter
    tri = n_grams(gram_number, sax_list)
    # print(Counter(tri))
    return tri
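# A minimal usage sketch for perform_sax, assuming the imports below match the
# snippet's environment. The n_grams helper referenced above is not shown in
# the original, so a hypothetical stand-in is defined here.
import numpy as np
import pandas as pd
from tslearn.generators import random_walks
from tslearn.piecewise import SymbolicAggregateApproximation
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

def n_grams(n, seq):
    # Hypothetical stand-in for the n_grams helper used by perform_sax:
    # joins each window of n consecutive symbols into one string.
    return [''.join(seq[i:i + n]) for i in range(len(seq) - n + 1)]

X = random_walks(n_ts=1, sz=256, d=1)  # one synthetic univariate series
trigrams = perform_sax(X, gram_number=3, symbols=8, segments=32)
print(trigrams[:5])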
def transform(self, data=None):
    sax = SymbolicAggregateApproximation(n_segments=self.n_paa,
                                         alphabet_size_avg=self.n_sax)
    self.trans_dataset = sax.fit_transform(self.norm_dataset)
    if data is None:
        self.invTrans_dataset = sax.inverse_transform(self.trans_dataset)
    else:
        self.invTrans_dataset = sax.inverse_transform(data)
def genListSAX(instances_nor, windowSize, timestamp, n_sax_symbols=25):
    sax = SymbolicAggregateApproximation(n_segments=windowSize,
                                         alphabet_size_avg=n_sax_symbols)
    sax_result = sax.fit_transform(instances_nor)
    sax_dataset_inv = sax.inverse_transform(sax_result)
    return {
        "sketchInstances": list(sax_dataset_inv[0].ravel()),
        "timestamp": timestamp
    }
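# A minimal sketch of calling genListSAX on an already-normalized series,
# assuming instances_nor follows tslearn's (n_ts, sz, d) dataset layout; the
# timestamp value is arbitrary.
import numpy as np
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

raw = np.sin(np.linspace(0, 6 * np.pi, 200)).reshape(1, -1, 1)
instances_nor = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(raw)
sketch = genListSAX(instances_nor, windowSize=50,
                    timestamp="2019-01-01T00:00:00Z")
print(len(sketch["sketchInstances"]), sketch["timestamp"])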
def _sax_preprocess(self, X, n_segments=10, alphabet_size_avg=4):
    # SAX-transform the time series, reusing a cached estimator if present
    if not hasattr(self, '_sax') or self._sax is None:
        self._sax = SymbolicAggregateApproximation(
            n_segments=n_segments, alphabet_size_avg=alphabet_size_avg)
    X = to_time_series_dataset(X)
    X = self._sax.fit_transform(X)
    return X
def saa_pax(dataset, title):
    """
    Show the graphs of the PAA and SAX transforms of time series data.

    :param dataset: time series of a stock
    :param title: title suffix for each subplot
    :return:
    """
    # Rescale time series
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    dataset = scaler.fit_transform(dataset)

    # PAA transform (and inverse transform) of the data
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

    # SAX transform
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))

    # 1d-SAX transform
    n_sax_symbols_avg = 8
    n_sax_symbols_slope = 8
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
    one_d_sax_dataset_inv = one_d_sax.inverse_transform(
        one_d_sax.fit_transform(dataset))

    plt.figure()
    plt.subplot(2, 2, 1)  # First, raw time series
    plt.plot(dataset[0].ravel(), "b-")
    plt.title("Raw time series " + title)

    plt.subplot(2, 2, 2)  # Second, PAA
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(paa_dataset_inv[0].ravel(), "b-")
    plt.title("PAA " + title)

    plt.subplot(2, 2, 3)  # Then SAX
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(sax_dataset_inv[0].ravel(), "b-")
    plt.title("SAX, %d symbols" % n_sax_symbols)

    plt.subplot(2, 2, 4)  # Finally, 1d-SAX
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(one_d_sax_dataset_inv[0].ravel(), "b-")
    plt.title("1d-SAX, %d symbols (%dx%d)" % (n_sax_symbols_avg * n_sax_symbols_slope,
                                              n_sax_symbols_avg,
                                              n_sax_symbols_slope))

    plt.tight_layout()
    plt.show()
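# A minimal sketch of driving saa_pax with synthetic data, assuming the
# matplotlib and tslearn imports used inside the function are in scope.
from tslearn.generators import random_walks

dataset = random_walks(n_ts=1, sz=100, d=1)  # stand-in for a stock series
saa_pax(dataset, "(synthetic walk)")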
def discretize(raw_signal, window_size, paa_segments, alphabet_size):
    sax = SymbolicAggregateApproximation(n_segments=paa_segments,
                                         alphabet_size_avg=alphabet_size)
    discrete_signal = []
    num = len(raw_signal) // window_size
    for i in range(num):
        raw_data = raw_signal[i * window_size:(i + 1) * window_size]
        disc = sax.inverse_transform(sax.fit_transform(raw_data))
        discrete_signal.append(np.squeeze(disc))
    # Flatten the per-window results into a single list
    discrete_signal = [x for sublist in discrete_signal for x in sublist]
    return discrete_signal
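# A minimal sketch of window-wise discretization with discretize, assuming a
# 1-D numpy signal; each window is SAX-quantized independently.
import numpy as np

signal = np.cumsum(np.random.RandomState(0).randn(300))
quantized = discretize(signal, window_size=100, paa_segments=10,
                       alphabet_size=4)
print(len(quantized))  # 3 windows x 100 reconstructed samples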
def test_serialize_sax():
    n_paa_segments = 10
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)

    _check_not_fitted(sax)

    X = _get_random_walk()
    sax.fit(X)
    _check_params_predict(sax, X, ['transform'])
def test_sax_scale():
    n, sz, d = 10, 10, 3
    rng = np.random.RandomState(0)
    X = rng.rand(n, sz, d)
    y = rng.choice([0, 1], size=n)

    sax = SymbolicAggregateApproximation(n_segments=3, alphabet_size_avg=2,
                                         scale=True)
    sax.fit(X)
    np.testing.assert_array_almost_equal(X, sax._unscale(sax._scale(X)))
    np.testing.assert_array_almost_equal(np.zeros((d, )),
                                         sax._scale(X).reshape((-1, d)).mean())
    np.testing.assert_array_almost_equal(np.ones((d, )),
                                         sax._scale(X).reshape((-1, d)).std())

    # Case of kNN-SAX
    knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1,
                                             metric="sax",
                                             metric_params={"scale": True})
    knn_sax.fit(X, y)
    X_scale_unscale = knn_sax._sax._unscale(knn_sax._sax._scale(X))
    np.testing.assert_array_almost_equal(X, X_scale_unscale)
    knn_sax.predict(X)
def build_tslearn_sax(n_paa_segments=50, n_sax_symbols=50,
                      supports_approximation=True):
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    return TSLearnTransformerWrapper(
        sax, supports_approximation=supports_approximation)
def sax_sim_matrix(df: np.ndarray, word_len, alphabet_len):
    '''
    Computes the pairwise SAX distance matrix for a set of series, with the
    specified word length and alphabet length.
    '''
    sax = SymbolicAggregateApproximation(word_len, alphabet_len)
    sax.fit(df)
    n_series = df.shape[0]
    sim_matrix = np.zeros((n_series, n_series))
    for i in range(n_series):
        for j in range(n_series):
            sim_matrix[i][j] = sax.distance(df[i], df[j])
    return sim_matrix
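# A minimal sketch of sax_sim_matrix on a small synthetic dataset; note that
# SAX distance is symmetric, so the double loop above does twice the needed
# work.
import numpy as np

rng = np.random.RandomState(0)
df = rng.randn(5, 40)  # 5 univariate series of length 40
D = sax_sim_matrix(df, word_len=8, alphabet_len=4)
print(D.shape, np.allclose(D, D.T))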
def sax_similarity(data, seq_len):
    from tslearn.piecewise import SymbolicAggregateApproximation

    print('|--- Calculating the pairwise distance!')

    paa_segment = int(data.shape[0] / seq_len)
    sax_ins = SymbolicAggregateApproximation(n_segments=paa_segment,
                                             alphabet_size_avg=10)
    sax_repre = sax_ins.fit_transform(np.transpose(data))

    sax_mx_dist = np.zeros(shape=(data.shape[1], data.shape[1]))
    for i in range(data.shape[1]):
        for j in range(data.shape[1]):
            sax_mx_dist[i, j] = sax_ins.distance_sax(sax_repre[i],
                                                     sax_repre[j])
    return sax_mx_dist
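# A minimal sketch of sax_similarity; data is assumed to have shape
# (n_timesteps, n_series), as implied by the transpose inside the function.
import numpy as np

data = np.random.RandomState(0).rand(288, 6)  # 288 time steps, 6 series
D = sax_similarity(data, seq_len=12)          # 288 / 12 = 24 PAA segments
print(D.shape)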
class SAXStateRecognition(BaseMLModelTemplate):
    def build_model(self, **kwargs):
        self.his_len = kwargs['his_len']
        self.segment_dim = kwargs['segment_dim']
        self.model_obj = SymbolicAggregateApproximation(
            n_segments=self.his_len, alphabet_size_avg=self.param.n_state)

    def fit(self, x, y=None):
        self.store(self.param.model_save_path)

    def predict(self, x):
        self.restore(self.param.model_save_path)
        sax_dataset_inv = self.model_obj.inverse_transform(
            self.model_obj.fit_transform(x))
        uniques = sorted(np.unique(sax_dataset_inv))
        print('sax numbers:', len(uniques))

        state_pattern = np.eye(len(uniques))
        state_proba = np.zeros([x.shape[0], self.his_len, len(uniques)],
                               dtype=float)
        tmpstates = np.reshape(sax_dataset_inv,
                               [-1, self.his_len, self.segment_dim])
        for i in range(tmpstates.shape[0]):
            for j in range(tmpstates.shape[1]):
                index = uniques.index(tmpstates[i, j, 0])
                state_proba[i, j, index] = tmpstates[i, j, 0]

        return np.reshape(state_proba,
                          [-1, self.his_len, self.param.n_state]).astype(
                              np.float32), \
               np.array(state_pattern, dtype=np.float32)

    def store(self, path, **kwargs):
        save_model_name = "sax_{}_{}.state_model".format(
            self.param.data_name, self.param.n_state)
        joblib.dump(self.model_obj, os.path.join(path, save_model_name))

    def restore(self, path, **kwargs):
        save_model_name = "sax_{}_{}.state_model".format(
            self.param.data_name, self.param.n_state)
        self.model_obj = joblib.load(os.path.join(path, save_model_name))
def discretize_series(series_raw, n_sax_symbols=6, hours_in_segment=4,
                      inverse_transform=False):
    # Copy series
    series = series_raw.copy()

    # Normalize / rescale series
    data = normalize_series(series)

    # Determine the PAA segment count for the number of hours per segment
    n_paa_segments = __get_n_paa_segments(len(series), hours_in_segment)

    # SAX (and PAA) transform
    sax = SymbolicAggregateApproximation(alphabet_size_avg=n_sax_symbols,
                                         n_segments=n_paa_segments)
    model = sax.fit(data)
    transformed = model.transform(data)

    # (Optional) Transform discrete samples back to time series
    if inverse_transform:
        transformed = model.inverse_transform(transformed)

    return transformed, model
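# A minimal sketch of the helpers discretize_series relies on; both bodies are
# assumptions standing in for the project's own normalize_series and
# __get_n_paa_segments, here for hourly-sampled data.
import numpy as np

def normalize_series(series):
    # z-normalize to zero mean and unit variance
    values = np.asarray(series, dtype=float)
    return (values - values.mean()) / values.std()

def __get_n_paa_segments(series_len, hours_in_segment):
    # one PAA segment per block of hours_in_segment hourly samples
    return max(1, series_len // hours_in_segment)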
def test_sax():
    unfitted_sax = SymbolicAggregateApproximation(n_segments=3,
                                                  alphabet_size_avg=2)
    data = [[-1., 2., 0.1, -1., 1., -1.],
            [1., 3.2, -1., -3., 1., -1.]]
    np.testing.assert_raises(NotFittedError, unfitted_sax.distance,
                             data[0], data[1])

    sax_est_no_scale = unfitted_sax
    sax_est_scale = clone(sax_est_no_scale)
    sax_est_scale.set_params(scale=True)

    n, sz, d = 2, 10, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, sz, d)

    for sax_est in [sax_est_no_scale, sax_est_scale]:
        sax_repr = sax_est.fit_transform(X)
        np.testing.assert_allclose(
            sax_est.distance(X[0], X[1]),
            sax_est.distance_sax(sax_repr[0], sax_repr[1]))
class KNeighborsTimeSeriesMixin():
    """Mixin for k-neighbors searches on Time Series."""

    def _sax_preprocess(self, X, n_segments=10, alphabet_size_avg=4,
                        scale=False):
        # SAX-transform the time series, reusing a cached estimator if present
        if not hasattr(self, '_sax') or self._sax is None:
            self._sax = SymbolicAggregateApproximation(
                n_segments=n_segments,
                alphabet_size_avg=alphabet_size_avg,
                scale=scale
            )
        X = to_time_series_dataset(X)
        X_sax = self._sax.fit_transform(X)
        return X_sax

    def _get_metric_params(self):
        if self.metric_params is None:
            metric_params = {}
        else:
            metric_params = self.metric_params.copy()
        if "gamma_sdtw" in metric_params.keys():
            metric_params["gamma"] = metric_params["gamma_sdtw"]
            del metric_params["gamma_sdtw"]
        if "n_jobs" in metric_params.keys():
            del metric_params["n_jobs"]
        if "verbose" in metric_params.keys():
            del metric_params["verbose"]
        return metric_params

    def _precompute_cross_dist(self, X, other_X=None):
        if other_X is None:
            other_X = self._ts_fit

        self._ts_metric = self.metric
        self.metric = "precomputed"

        metric_params = self._get_metric_params()

        X = check_array(X, allow_nd=True, force_all_finite=False)
        X = to_time_series_dataset(X)

        if self._ts_metric == "dtw":
            X_ = cdist_dtw(X, other_X, n_jobs=self.n_jobs, **metric_params)
        elif self._ts_metric == "ctw":
            X_ = cdist_ctw(X, other_X, **metric_params)
        elif self._ts_metric == "softdtw":
            X_ = cdist_soft_dtw(X, other_X, **metric_params)
        elif self._ts_metric == "sax":
            X = self._sax_preprocess(X, **metric_params)
            X_ = cdist_sax(X, self._sax.breakpoints_avg_,
                           self._sax._X_fit_dims_[1], other_X,
                           n_jobs=self.n_jobs)
        else:
            raise ValueError("Invalid metric recorded: %s" % self._ts_metric)

        return X_

    def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
        """Finds the K-neighbors of a point.

        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : array-like, shape (n_ts, sz, d)
            The query time series.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.
        n_neighbors : int
            Number of neighbors to get (default is the value passed to the
            constructor).
        return_distance : boolean, optional. Defaults to True.
            If False, distances will not be returned.

        Returns
        -------
        dist : array
            Array representing the distance to points, only present if
            return_distance=True
        ind : array
            Indices of the nearest points in the population matrix.
        """
        self_neighbors = False
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        if X is None:
            X = self._X_fit
            self_neighbors = True
        if self.metric == "precomputed":
            full_dist_matrix = X
        else:
            if X.ndim == 2:  # sklearn-format case
                X = X.reshape((X.shape[0], -1, self._d))
                fit_X = self._X_fit.reshape((self._X_fit.shape[0],
                                             -1,
                                             self._d))
            elif hasattr(self, '_ts_fit') and self._ts_fit is not None:
                fit_X = self._ts_fit
            else:
                fit_X = self._X_fit

            if (self.metric in TSLEARN_VALID_METRICS or
                    self.metric in [cdist_dtw, cdist_ctw,
                                    cdist_soft_dtw, cdist_sax]):
                full_dist_matrix = self._precompute_cross_dist(X,
                                                               other_X=fit_X)
            elif self.metric in ["euclidean", "sqeuclidean", "cityblock"]:
                full_dist_matrix = scipy_cdist(X.reshape((X.shape[0], -1)),
                                               fit_X.reshape(
                                                   (fit_X.shape[0], -1)),
                                               metric=self.metric)
            else:
                raise ValueError("Unrecognized time series metric string: %s "
                                 "(should be one of 'dtw', 'softdtw', "
                                 "'sax', 'euclidean', 'sqeuclidean' "
                                 "or 'cityblock')" % self.metric)

        # Code similar to sklearn (sklearn/neighbors/base.py), to make sure
        # that TimeSeriesKNeighbor~(metric='euclidean') has the same results
        # as feeding a distance matrix to sklearn.KNeighbors~(
        # metric='euclidean')
        kbin = min(n_neighbors - 1, full_dist_matrix.shape[1] - 1)
        # argpartition will make sure the first `kbin` entries are the
        # `kbin` smallest ones (but in arbitrary order) --> complexity: O(n)
        ind = numpy.argpartition(full_dist_matrix, kbin, axis=1)

        if self_neighbors:
            ind = ind[:, 1:]
        if n_neighbors > full_dist_matrix.shape[1]:
            n_neighbors = full_dist_matrix.shape[1]
        ind = ind[:, :n_neighbors]

        n_ts = X.shape[0]
        sample_range = numpy.arange(n_ts)[:, None]
        # Sort the `kbin` nearest neighbors according to distance
        ind = ind[sample_range,
                  numpy.argsort(full_dist_matrix[sample_range, ind])]
        dist = full_dist_matrix[sample_range, ind]

        if hasattr(self, '_ts_metric'):
            self.metric = self._ts_metric

        if return_distance:
            return dist, ind
        else:
            return ind
            dist3 = paa.distance(Xtrain_paa[i, :], Xtest_paa[j, :])
            PAADist_train.append(dist3)
    for i in range(len(y_test)):
        for j in range(len(y_train)):
            dist4 = paa.distance(Xtest_paa[i, :], Xtrain_paa[j, :])
            PAADist_test.append(dist4)
    PAADist_train = np.array(PAADist_train)
    PAADist_train.resize(y_train.shape[0],
                         int(len(PAADist_train) / y_train.shape[0]))
    PAADist_test = np.array(PAADist_test)
    PAADist_test.resize(y_test.shape[0],
                        int(len(PAADist_test) / y_test.shape[0]))
    '''

    # SAX transform + SAX feature extraction
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    Xtrain_sax = sax.inverse_transform(sax.fit_transform(X_train))
    Xtest_sax = sax.inverse_transform(sax.fit_transform(X_test))
    SAX_test = Xtest_sax[:, :, 0]
    SAX_train = Xtrain_sax[:, :, 0]

    '''
    # SAX distance calculation
    SAXDist_train = []
    SAXDist_test = []
    for i in range(len(y_train)):
        for j in range(len(y_train)):
            dist3 = sax.distance(Xtrain_sax[i, :], Xtest_sax[j, :])
            SAXDist_train.append(dist3)
def fit(self, x, y=None):
    sax = SymbolicAggregateApproximation(n_segments=self.hislen,
                                         alphabet_size_avg=self.state_num)
    joblib.dump(sax, self.modelpath / 'states.m')
dataset = []
with open('output.ou') as f:  # here we just load the dataset values
    for linha in f:
        linha = linha.strip()
        if linha:
            valores = linha.split(',')
            a, b = int(valores[0]), float(valores[1])
            dataset.append(b)

self.dataset = dataset[:]  # copy the list so it is not passed by reference

# exit = transform(300, 300, dataset)
# print(exit)
# print(len(exit[0]))

np.set_printoptions(threshold=np.inf)

trans = Transform(300, 300)
trans.read()
# trans.norm()
# trans.transform()
# print(trans.dataset)
# print(trans.invTrans_dataset[0])

sax = SymbolicAggregateApproximation(n_segments=300, alphabet_size_avg=300)
trans.norm()
aux = sax.fit_transform(trans.dataset)
aux1 = sax.inverse_transform(aux)

print(trans.dataset)
print(aux)
print(aux1)
# Nearest neighbor search
knn = KNeighborsTimeSeries(n_neighbors=3, metric="dtw")
knn.fit(X_train, y_train)
dists, ind = knn.kneighbors(X_test)
print("1. Nearest neighbour search")
print("Computed nearest neighbor indices (wrt DTW)\n", ind)
# kneighbors returns indices into the training set, so look up y_train
print("First nearest neighbor class:", y_train[ind[:, 0]])

# Nearest neighbor classification
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n2. Nearest neighbor classification using DTW")
print("Correct classification rate:", accuracy_score(y_test,
                                                     predicted_labels))

# Nearest neighbor classification with a different metric (Euclidean)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n3. Nearest neighbor classification using L2")
print("Correct classification rate:", accuracy_score(y_test,
                                                     predicted_labels))

# Nearest neighbor classification based on SAX representation
sax_trans = SymbolicAggregateApproximation(n_segments=10, alphabet_size_avg=5)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
pipeline_model = Pipeline(steps=[('sax', sax_trans), ('knn', knn_clf)])
pipeline_model.fit(X_train, y_train)
predicted_labels = pipeline_model.predict(X_test)
print("\n4. Nearest neighbor classification using SAX+MINDIST")
print("Correct classification rate:", accuracy_score(y_test,
                                                     predicted_labels))
def main():
    # Fetch the original data

    # For the test_quarter db:
    # influx_url = "http://localhost:8086/query?db=" + dbname + \
    #     "&epoch=ms&q=SELECT+%22degrees%22+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546329600000ms+and+time+%3C%3D+1546329900000ms"

    # For the NOAA db:
    # influx_url = "http://localhost:8086/query?db=" + dbname + \
    #     "&epoch=ms&q=SELECT+%22degrees%22+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1439856000000ms+and+time+%3C%3D+1439992520000ms+and%28%22location%22+%3D+%27santa_monica%27%29"

    # For test3
    influx_url = "http://localhost:8086/query?db=" + dbname + \
        "&epoch=ms&q=SELECT+%22degrees%22+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546355705400ms+and+time+%3C%3D+1548969305400ms"

    r = requests.get(influx_url)
    json_dict = json.loads(r.content)
    data = json_dict["results"][0]["series"][0]["values"]
    # print(data[0])
    # print(data[1])

    time_interval = data[1][0] - data[0][0]  # consistent time interval
    print("time interval: ", time_interval)

    lst2 = [item[1] for item in data]
    n_segments = len(lst2)
    print("original data size", len(lst2))
    alphabet_size_avg = 20

    # Generate sample data
    sample_size = 20
    # sample_url = "http://localhost:8086/query?db=" + dbname + \
    #     "&epoch=ms&q=SELECT+sample%28%22degrees%22%2C" + str(sample_size) + \
    #     "%29+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546329600000ms+and+time+%3C%3D+1546329900000ms"

    # test3 sample (sin pattern)
    sample_url = "http://localhost:8086/query?db=" + dbname + \
        "&epoch=ms&q=SELECT+sample%28%22degrees%22%2C" + str(sample_size) + \
        "%29+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546355705400ms+and+time+%3C%3D+1548969305400ms"

    # sample_url = "http://localhost:8086/query?db=" + dbname + \
    #     "&epoch=ms&q=SELECT+sample%28%22degrees%22%2C" + str(sample_size) + \
    #     "%29+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1439856000000ms+and+time+%3C%3D+1442612520000ms+and%28%22location%22+%3D+%27santa_monica%27%29"

    r2 = requests.get(sample_url)
    json_dict2 = json.loads(r2.content)
    sampled_data = json_dict2["results"][0]["series"][0]["values"]  # [[time, value], ...]
    print("sample length")
    print(len(sampled_data))
    sample = [item[1] for item in sampled_data]  # [value, ...]

    # Fill the sample data with a linear model
    start_x = data[0][0]
    end_x = data[-1][0]
    current_x = start_x
    current_loc = 0
    slope = (sampled_data[current_loc][1] - sampled_data[current_loc + 1][1]) \
        / (sampled_data[current_loc][0] - sampled_data[current_loc + 1][0])
    intersection = sampled_data[current_loc][1] - \
        slope * sampled_data[current_loc][0]
    sample_fit = []
    end_sample_x = sampled_data[-1][0]
    while current_x <= end_sample_x:
        if current_x >= sampled_data[current_loc + 1][0] and \
                current_loc + 1 < len(sampled_data) - 1:
            current_loc += 1
            slope = (sampled_data[current_loc][1] -
                     sampled_data[current_loc + 1][1]) \
                / (sampled_data[current_loc][0] -
                   sampled_data[current_loc + 1][0])
            intersection = sampled_data[current_loc][1] - \
                slope * sampled_data[current_loc][0]
        sample_fit.append([current_x, slope * current_x + intersection])
        current_x += time_interval  # 1000 ms

    # Chop the original data to match the linear-fit sample data
    chopped_data = []
    for item in data:
        if item[0] >= sample_fit[0][0] and item[0] <= sample_fit[-1][0]:
            chopped_data.append(item)
    print("len")
    print(len(sample_fit), len(chopped_data))
    chopped_lst2 = [item[1] for item in chopped_data]
    chopped_len = len(chopped_lst2)

    # Build a SAX model for the chopped original data
    sax = SymbolicAggregateApproximation(chopped_len, alphabet_size_avg)
    scalar = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    sdb = scalar.fit_transform(chopped_lst2)
    s3 = sax.fit_transform(sdb)

    # Build a SAX model for the linear-fit sampled data
    sample_fit_extract = [item[1] for item in sample_fit]
    fit_sample_data = scalar.fit_transform(sample_fit_extract)
    s4 = sax.transform(fit_sample_data)

    # Compute the distance between the two datasets to measure similarity
    print("distance")
    dist = sax.distance_sax(s3[0], s4[0])
    print(dist)
    print("normalized distance")
    print(dist / chopped_len)

    # Plot the three datasets
    plot(sample_fit, sampled_data, lst2)
def main():
    # For the NOAA db
    influx_url = "http://localhost:8086/query?db=" + dbname + \
        "&epoch=ms&q=SELECT+%22water_level%22+FROM+%22h2o_feet%22+WHERE+time+%3E%3D+1440658277944ms+and+time+%3C%3D+1441435694328ms"
    r = requests.get(influx_url)
    json_dict = json.loads(r.content)
    data = json_dict["results"][0]["series"][0]["values"]
    print(data[0:5])

    # NOTE: just for NOAA h2o_feet
    time_interval = data[2][0] - data[0][0]
    print("time interval:", time_interval)

    lst2 = [item[1] for item in data]
    n_segments = len(lst2)
    print(max(lst2), min(lst2))
    original_data_size = len(lst2)
    print("original data size:", original_data_size)
    alphabet_size_avg = math.ceil(max(lst2) - min(lst2))
    print("alphabet size avg:", alphabet_size_avg)

    # A list of sample ratios.
    # We want to select the minimum ratio that stays within the similarity
    # range.
    ratiolist = [0.025, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6]
    sizelist = []
    distlist = []
    for ratio in ratiolist:
        print()
        print("ratio:", ratio)

        # Generate sample data
        sample_size = math.floor(original_data_size * ratio)
        sizelist.append(sample_size)
        print("sample_size:", sample_size)

        # NOAA db: h2o_feet
        sample_url = "http://localhost:8086/query?db=" + dbname + \
            "&epoch=ms&q=SELECT+sample%28%22water_level%22%2C" + str(sample_size) + \
            "%29+FROM+%22h2o_feet%22+WHERE+time+%3E%3D+1440658277944ms+and+time+%3C%3D+1441435694328ms"
        r2 = requests.get(sample_url)
        json_dict2 = json.loads(r2.content)
        sampled_data = json_dict2["results"][0]["series"][0]["values"]  # [[time, value], ...]
        sample = [item[1] for item in sampled_data]  # [value, ...]

        # Fill the sample data with a linear model
        start_x = data[0][0]
        end_x = data[-1][0]
        current_x = start_x
        current_loc = 0
        # NOTE: +2 instead of +1 here, just for h2o_feet
        slope = (sampled_data[current_loc][1] -
                 sampled_data[current_loc + 2][1]) \
            / (sampled_data[current_loc][0] - sampled_data[current_loc + 2][0])
        intersection = sampled_data[current_loc][1] - \
            slope * sampled_data[current_loc][0]
        sample_fit = []
        end_sample_x = sampled_data[-1][0]
        while current_x <= end_sample_x:
            # NOTE: -2 here; change to -1 later
            if current_x >= sampled_data[current_loc + 1][0] and \
                    current_loc + 1 < len(sampled_data) - 2:
                current_loc += 1
                # NOTE: the +2 denominators were just for h2o_feet; both
                # branches currently compute the same slope
                if (sampled_data[current_loc][0] -
                        sampled_data[current_loc + 1][0]) == 0:
                    slope = (sampled_data[current_loc][1] -
                             sampled_data[current_loc + 1][1]) \
                        / (sampled_data[current_loc][0] -
                           sampled_data[current_loc + 2][0])
                else:
                    slope = (sampled_data[current_loc][1] -
                             sampled_data[current_loc + 1][1]) \
                        / (sampled_data[current_loc][0] -
                           sampled_data[current_loc + 2][0])
                intersection = sampled_data[current_loc][1] - \
                    slope * sampled_data[current_loc][0]
            sample_fit.append([current_x, slope * current_x + intersection])
            current_x += time_interval  # 1000 ms

        # Chop the original data to match the linear-fit sample data
        chopped_data = []
        for item in data:
            if item[0] >= sample_fit[0][0] and item[0] <= sample_fit[-1][0]:
                chopped_data.append(item)
        print("size of chopped_data:", len(chopped_data))
        chopped_lst2 = [item[1] for item in chopped_data]
        chopped_len = len(chopped_lst2)

        # Build a SAX model for the chopped original data
        sax = SymbolicAggregateApproximation(chopped_len, alphabet_size_avg)
        scalar = TimeSeriesScalerMeanVariance(mu=0., std=1.)
        sdb = scalar.fit_transform(chopped_lst2)
        s3 = sax.fit_transform(sdb)

        # Build a SAX model for the linear-fit sampled data
        sample_fit_extract = [item[1] for item in sample_fit]
        fit_sample_data = scalar.fit_transform(sample_fit_extract)
        s4 = sax.transform(fit_sample_data)

        # Compute the distance between the two datasets to measure similarity
        dist = sax.distance_sax(s3[0], s4[0])
        print("distance:", dist)
        norm_dist = 1000 * dist / chopped_len
        distlist.append(norm_dist)
        print("normalized distance: {:.4f}".format(norm_dist))

    plotdist(ratiolist, distlist)
def step_run(self, data):
    sax = SymbolicAggregateApproximation(n_segments=self.nb_segment,
                                         alphabet_size_avg=self.nb_symbol)
    sax_dataset = sax.fit_transform(data)
    sax_dataset_inv = sax.inverse_transform(sax_dataset)
    return sax_dataset, sax_dataset_inv
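# A minimal sketch of calling step_run; the host class is not shown above, so
# a hypothetical stand-in with nb_segment / nb_symbol attributes is used here.
from tslearn.generators import random_walks

class _SaxStep:
    nb_segment = 10
    nb_symbol = 8
    step_run = step_run  # reuse the function defined above as a method

sax_repr, sax_inv = _SaxStep().step_run(random_walks(n_ts=2, sz=100, d=1))
print(sax_repr.shape, sax_inv.shape)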
def test_sax():
    unfitted_sax = SymbolicAggregateApproximation(n_segments=3,
                                                  alphabet_size_avg=2)
    data = [[-1., 2., 0.1, -1., 1., -1.],
            [1., 3.2, -1., -3., 1., -1.]]
    np.testing.assert_raises(ValueError, unfitted_sax.distance,
                             data[0], data[1])
def main():
    x = []        # x that will be fed to the knn
    y = []        # y that will be fed to the knn
    y_aux = []    # original values of the dataset
    x_aux = []    # original dates of the dataset
    y_saida = []  # 80% of its values are the original stock values; the
                  # remaining 20% will come from the prediction

    with open('output.ou') as f:  # here we just load the dataset values
        for linha in f:
            linha = linha.strip()
            if linha:
                valores = linha.split(',')
                a, b = trataValores(valores)
                x_aux.append(a)
                y_aux.append(b)

    x_aux, y_aux = ndtw.suavizacao(x_aux, y_aux)  # smooths the curves

    # maior = max(y_aux)       # this and the next two lines normalize the
    # y_aux = np.array(y_aux)  # data, since the values must lie between 0 and
    # y_aux = y_aux / maior    # 1 for PAA (and consequently SAX) to work
    y_aux = ndtw.sigmoid(y_aux, 1)

    sax = SymbolicAggregateApproximation(n_segments=N_PAA,
                                         alphabet_size_avg=N_SAX)
    temp = sax.fit_transform(y_aux)
    classes_sax = []
    for i in temp[0]:
        classes_sax.append(i[0])

    count = 0
    cel = []
    # Iterate over the first 80% of the list, building the x and y fed to the
    # knn with a sliding window: x = [val1, val2, ..., valn] (length WIN_SIZE)
    # and y is the value that follows.
    for i in classes_sax[0:int(len(classes_sax) * 0.8)]:
        count += 1
        y_saida.append(i)
        if (count % (WIN_SIZE + 1) == 0 and count != 0):
            cel.append(i)
            # cel = ndtw.sliding_window_normalizations([], cel, 1)  # normalize with mean and standard deviation
            y.append(cel[-1:])        # the last value is y
            x.append(cel[:WIN_SIZE])  # the first WIN_SIZE values are x
            cel = []
        else:
            cel.append(i)

    obj = KNeighborsClassifier(metric=dtw, n_neighbors=1)
    # print("\n")
    # print(y_saida)
    obj.fit(x, y)

    # for i in range(int(len(y_aux) * 0.2) + 1):  # slicing lists like a BALLLSS
    #     passar = np.array(y_saida[-WIN_SIZE:]).reshape(1, -1)  # turn the window into a numpy array and reshape because the knn complains otherwise
    #     volta = np.copy(passar)  # volta is a copy of passar holding the original values before mean/std normalization, so the normalization can later be reverted for presentation
    #     passar = ndtw.sliding_window_normalizations([], passar, 1)  # normalize with mean and standard deviation
    #     pred = obj.predict(passar)[0]  # get the normalized prediction
    #     passar = np.append(passar, pred)  # append it to the values the prediction was made from (values and prediction are normalized)
    #     passar = ndtw.sliding_window_normalizations(volta, passar, 0)  # undo the normalization before pushing to the output list
    #     y_saida.append(passar[-1:])  # put the obtained value in the output list

    for i in range(int(len(classes_sax) * 0.2) + 1):
        # Turn the window into a numpy array and reshape because the knn
        # complains otherwise
        passar = np.array(y_saida[-WIN_SIZE:]).reshape(1, -1)
        pred = obj.predict(passar)[0]  # get the prediction
        # Append it to the values the prediction was made from
        passar = np.append(passar, pred)
        # Put the obtained value in the output list
        y_saida.append(passar[-1:][0])

    saida = []
    saida.append([])
    for i in y_saida:  # workaround for not using reshape
        saida[0].append([i])
    y_saida = sax.inverse_transform(saida)
    y_saida = np.array(y_saida)

    saida = []  # if takashi sees this he will hit me (pray for dave)
    for i in y_saida:  # double workaround for not using reshape
        for j in i:
            for k in j:
                saida.append(k)

    y_aux = ndtw.sigmoid(y_aux, 0)
    return x_aux, y_aux, saida
dayPattern = []
for index in range(restData.shape[0]):
    cuData = restData[index].ravel()
    day = len(cuData) // 24
    total = np.zeros(24)
    for d in range(3):
        total += cuData[d * 24:(d + 1) * 24]
    dayPattern.append(total / day)
dayPattern = np.array(dayPattern)

scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
dayPattern = scaler.fit_transform(dayPattern)

n_paa_segments = 24
n_sax_symbols = 5
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
dayPattern = sax.fit_transform(dayPattern)
dayPattern = dayPattern.reshape(dayPattern.shape[0], dayPattern.shape[1])

# Perform clustering:
# k-means clustering of the SAX-processed daily fluctuation patterns
s = time.time()
y_pre = KMeans(n_clusters=20).fit_predict(dayPattern)

clusNum = np.zeros(len(y_pre))
totalClus = 0
for k in range(max(y_pre) + 1):
    data = restData[y_pre == k]
    data = data.reshape(data.shape[0], data.shape[1])
    distance_matrix = trend_affinity(data)
    model = AgglomerativeClustering(n_clusters=None,
                                    affinity='precomputed',
                                    linkage='complete',
def build_model(self, **kwargs):
    self.his_len = kwargs['his_len']
    self.segment_dim = kwargs['segment_dim']
    self.model_obj = SymbolicAggregateApproximation(
        n_segments=self.his_len, alphabet_size_avg=self.param.n_state)
# Transform: PAA, SAX, 1d-SAX
for stockCode in pos_relatedStock:
    dataset = dfpivot['v_updownpercent'][stockCode]

    # Rescale time series
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    dataset = scaler.fit_transform(dataset)

    # PAA transform (and inverse transform) of the data
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

    # SAX transform
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))

    # 1d-SAX transform
    n_sax_symbols_avg = 8
    n_sax_symbols_slope = 8
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
    one_d_sax_dataset_inv = one_d_sax.inverse_transform(
        one_d_sax.fit_transform(dataset))

    graph_idx = graph_idx + 1
    plt.subplot(len(pos_relatedStock), 4, graph_idx)  # First, raw time series
    plt.plot(dataset[0].ravel(), "b-")
    'Stock Fluctuations of 4 Renowned Telco Companies from Jan to Mar 2019')
plt.legend(loc='upper right')

# In[ ]:

# Performing PAA and SAX.
from tslearn.piecewise import PiecewiseAggregateApproximation
from tslearn.piecewise import SymbolicAggregateApproximation, OneD_SymbolicAggregateApproximation

n_paa_segments = 8
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
Digi_PAA_n8 = paa.inverse_transform(paa.fit_transform(Digi_Scaled))

n_sax_symbols = 8
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
Digi_SAX_n8 = sax.inverse_transform(sax.fit_transform(Digi_Scaled))

n_paa_segments = 16
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
Digi_PAA_n16 = paa.inverse_transform(paa.fit_transform(Digi_Scaled))

n_sax_symbols = 16
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
Digi_SAX_n16 = sax.inverse_transform(sax.fit_transform(Digi_Scaled))

# In[ ]:

# Visualize the PAA and SAX with different segments and symbols.
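# A minimal plotting sketch for the comparison above, assuming Digi_Scaled is
# a single scaled series in tslearn's (n_ts, sz, d) layout.
import matplotlib.pyplot as plt

for title, approx in [("PAA, 8 segments", Digi_PAA_n8),
                      ("SAX, 8 segments / 8 symbols", Digi_SAX_n8),
                      ("PAA, 16 segments", Digi_PAA_n16),
                      ("SAX, 16 segments / 16 symbols", Digi_SAX_n16)]:
    plt.figure()
    plt.plot(Digi_Scaled[0].ravel(), "b-", alpha=0.4)  # original series
    plt.plot(approx[0].ravel(), "b-")                  # approximation
    plt.title(title)
plt.show()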