Example no. 1
def perform_sax(dataset, gram_number, symbols, segments):
    scaler = TimeSeriesScalerMeanVariance(
        mu=0., std=np.std(dataset))  # Rescale time series
    dataset = scaler.fit_transform(dataset)

    # SAX transform
    sax = SymbolicAggregateApproximation(n_segments=segments,
                                         alphabet_size_avg=symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))
    # print(pd.DataFrame(sax_dataset_inv[0])[0].value_counts())
    #     sax_dataset_inv = sax.fit_transform(dataset)
    #     print(len(sax_dataset_inv[0]))

    # Convert result to strings
    df_sax = pd.DataFrame(sax_dataset_inv[0])
    sax_series = df_sax[0]

    # Convert sax from numeric to characters
    sax_values = sax_series.unique()
    alphabet = 'abcdefghijklmnopqrstuvw'
    sax_dict = {x: alphabet[i] for i, x in enumerate(sax_values)}
    sax_list = [sax_dict[x] for x in sax_series]

    # Convert the list of characters to n_grams based on input parameter
    tri = n_grams(gram_number, sax_list)
    #     print(Counter(tri))
    return tri
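A minimal usage sketch for the snippet above (not from the original source): it assumes an `n_grams` helper that returns the overlapping n-grams of the symbol list, since the original does not show that function, and it feeds the function a random walk generated with tslearn.

import numpy as np
import pandas as pd
from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.piecewise import SymbolicAggregateApproximation

def n_grams(n, symbols):
    # hypothetical helper: overlapping n-grams of a list of symbols, e.g. ['abc', 'bcd', ...]
    return [''.join(symbols[i:i + n]) for i in range(len(symbols) - n + 1)]

dataset = random_walks(n_ts=1, sz=256)  # one random walk of length 256
trigrams = perform_sax(dataset, gram_number=3, symbols=8, segments=32)
print(trigrams[:5])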
Example no. 2
 def transform(self, data=None):
     sax = SymbolicAggregateApproximation(n_segments=self.n_paa, alphabet_size_avg=self.n_sax)
     self.trans_dataset = sax.fit_transform(self.norm_dataset)
     if data is None:
         self.invTrans_dataset = sax.inverse_transform(self.trans_dataset)
     else:
         self.invTrans_dataset = sax.inverse_transform(data)
Example no. 3
def genListSAX(instances_nor, windowSize, timestamp, n_sax_symbols=25):
    sax = SymbolicAggregateApproximation(n_segments=windowSize,
                                         alphabet_size_avg=n_sax_symbols)
    sax_result = sax.fit_transform(instances_nor)
    sax_dataset_inv = sax.inverse_transform(sax_result)
    return {
        "sketchInstances": list(sax_dataset_inv[0].ravel()),
        "timestamp": timestamp
    }
Example no. 4
    def _sax_preprocess(self, X, n_segments=10, alphabet_size_avg=4):
        # Now SAX-transform the time series
        if not hasattr(self, '_sax') or self._sax is None:
            self._sax = SymbolicAggregateApproximation(
                n_segments=n_segments, alphabet_size_avg=alphabet_size_avg)

        X = to_time_series_dataset(X)
        X = self._sax.fit_transform(X)

        return X
def saa_pax(dataset, title):
    """
    Plot a time series together with its PAA, SAX and 1d-SAX approximations
    :param dataset: time series of a stock
    :param title: label appended to the raw and PAA subplot titles
    :return:
    """
    n_ts, sz, d = 1, 100, 1
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
    dataset = scaler.fit_transform(dataset)

    # PAA transform (and inverse transform) of the data
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

    # SAX transform
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))

    # 1d-SAX transform
    n_sax_symbols_avg = 8
    n_sax_symbols_slope = 8
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
    one_d_sax_dataset_inv = one_d_sax.inverse_transform(
        one_d_sax.fit_transform(dataset))

    plt.figure()
    plt.subplot(2, 2, 1)  # First, raw time series
    plt.plot(dataset[0].ravel(), "b-")
    plt.title("Raw time series " + title)

    plt.subplot(2, 2, 2)  # Second, PAA
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(paa_dataset_inv[0].ravel(), "b-")
    plt.title("PAA " + title)

    plt.subplot(2, 2, 3)  # Then SAX
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(sax_dataset_inv[0].ravel(), "b-")
    plt.title("SAX, %d symbols" % n_sax_symbols)

    plt.subplot(2, 2, 4)  # Finally, 1d-SAX
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(one_d_sax_dataset_inv[0].ravel(), "b-")
    plt.title("1d-SAX, %d symbols (%dx%d)" %
              (n_sax_symbols_avg * n_sax_symbols_slope, n_sax_symbols_avg,
               n_sax_symbols_slope))

    plt.tight_layout()
    plt.show()
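A hedged call sketch for saa_pax above, assuming the imports the function relies on and using a tslearn random walk as input:

import matplotlib.pyplot as plt
from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.piecewise import (PiecewiseAggregateApproximation,
                               SymbolicAggregateApproximation,
                               OneD_SymbolicAggregateApproximation)

dataset = random_walks(n_ts=1, sz=100)  # one series of length 100; the function rescales it internally
saa_pax(dataset, title="(random walk)")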
Example no. 6
def discretize(raw_signal, window_size, paa_segments, alphabet_size):
    sax = SymbolicAggregateApproximation(n_segments=paa_segments, alphabet_size_avg=alphabet_size)
    discrete_signal = []
    num = len(raw_signal)//window_size

    for i in range(num):
        raw_data = raw_signal[i*window_size : (i+1)*window_size]
        disc = sax.inverse_transform(sax.fit_transform(raw_data))
        discrete_signal.append(np.squeeze(disc))
    discrete_signal = [x for sublist in discrete_signal for x in sublist]

    return discrete_signal
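A short usage sketch for discretize above, assuming a noisy sine wave as the raw signal:

import numpy as np
from tslearn.piecewise import SymbolicAggregateApproximation

raw_signal = np.sin(np.linspace(0, 20 * np.pi, 1000)) + 0.1 * np.random.randn(1000)
# 10 windows of 100 samples each, every window reduced to 20 PAA segments over a 6-symbol alphabet
symbolic = discretize(raw_signal, window_size=100, paa_segments=20, alphabet_size=6)
print(len(symbolic))  # 1000: each window is reconstructed back to its 100 samples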
Example no. 7
def test_serialize_sax():
    n_paa_segments = 10
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)

    _check_not_fitted(sax)

    X = _get_random_walk()

    sax.fit(X)

    _check_params_predict(sax, X, ['transform'])
Example no. 8
def test_sax_scale():
    n, sz, d = 10, 10, 3
    rng = np.random.RandomState(0)
    X = rng.rand(n, sz, d)
    y = rng.choice([0, 1], size=n)

    sax = SymbolicAggregateApproximation(n_segments=3,
                                         alphabet_size_avg=2,
                                         scale=True)
    sax.fit(X)
    np.testing.assert_array_almost_equal(X,
                                         sax._unscale(sax._scale(X)))

    np.testing.assert_array_almost_equal(np.zeros((d, )),
                                         sax._scale(X).reshape((-1, d)).mean())
    np.testing.assert_array_almost_equal(np.ones((d, )),
                                         sax._scale(X).reshape((-1, d)).std())

    # Case of kNN-SAX
    knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="sax",
                                             metric_params={"scale": True})
    knn_sax.fit(X, y)
    X_scale_unscale = knn_sax._sax._unscale(knn_sax._sax._scale(X))
    np.testing.assert_array_almost_equal(X, X_scale_unscale)

    knn_sax.predict(X)
Example no. 9
 def build_tslearn_sax(n_paa_segments=50,
                       n_sax_symbols=50,
                       supports_approximation=True):
     sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                          alphabet_size_avg=n_sax_symbols)
     return TSLearnTransformerWrapper(
         sax, supports_approximation=supports_approximation)
Example no. 10
def sax_sim_matrix(df: np.ndarray, word_len, alphabet_len):
    '''
    Computes the pairwise SAX distance matrix for a set of series,
    with the specified word length and alphabet length
    '''
    sax = SymbolicAggregateApproximation(word_len, alphabet_len)
    sax.fit(df)

    n_series = df.shape[0]
    sim_matrix = np.zeros((n_series, n_series))

    for i in range(n_series):
        for j in range(n_series):
            sim_matrix[i][j] = sax.distance(df[i], df[j])

    return sim_matrix
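A small usage sketch for sax_sim_matrix above, on a toy set of random walks; note that despite the name, the matrix holds SAX distances, so 0 means most similar:

import numpy as np
from tslearn.generators import random_walks
from tslearn.piecewise import SymbolicAggregateApproximation

X = random_walks(n_ts=5, sz=64)  # dataset of shape (5, 64, 1)
dist = sax_sim_matrix(X, word_len=8, alphabet_len=4)
print(dist.shape)  # (5, 5)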
Example no. 11
def sax_similarity(data, seq_len):
    from tslearn.piecewise import SymbolicAggregateApproximation

    print('|--- Calculating the pairwise distance!')

    ppa_segmet = int(data.shape[0] / seq_len)
    sax_ins = SymbolicAggregateApproximation(n_segments=ppa_segmet, alphabet_size_avg=10)

    sax_repre = sax_ins.fit_transform(np.transpose(data))

    sax_mx_dist = np.zeros(shape=(data.shape[1], data.shape[1]))

    for i in range(data.shape[1]):
        for j in range(data.shape[1]):
            sax_mx_dist[i, j] = sax_ins.distance_sax(sax_repre[i], sax_repre[j])

    return sax_mx_dist
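A call sketch for sax_similarity above; it assumes data is shaped (time, n_series), since the function transposes it before the SAX transform:

import numpy as np

data = np.random.randn(200, 5)  # 200 time steps for 5 series
dist_matrix = sax_similarity(data, seq_len=20)  # 200 // 20 = 10 PAA segments per series
print(dist_matrix.shape)  # (5, 5)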
Example no. 12
class SAXStateRecognition(BaseMLModelTemplate):
    def build_model(self, **kwargs):
        self.his_len = kwargs['his_len']
        self.segment_dim = kwargs['segment_dim']
        self.model_obj = SymbolicAggregateApproximation(
            n_segments=self.his_len, alphabet_size_avg=self.param.n_state)

    def fit(self, x, y=None):
        self.store(self.param.model_save_path)

    def predict(self, x):
        self.restore(self.param.model_save_path)

        sax_dataset_inv = self.model_obj.inverse_transform(
            self.model_obj.fit_transform(x))
        uniques = sorted(np.unique(sax_dataset_inv))
        print('sax numbers:', len(uniques))
        state_pattern = np.eye(len(uniques))

        state_proba = np.zeros(
            [x.shape[0], self.his_len, len(uniques)], dtype=float)
        tmpstates = np.reshape(sax_dataset_inv,
                               [-1, self.his_len, self.segment_dim])
        for i in range(tmpstates.shape[0]):
            for j in range(tmpstates.shape[1]):
                index = uniques.index(tmpstates[i, j, 0])
                state_proba[i, j, index] = tmpstates[i, j, 0]

        return np.reshape(state_proba,
                          [-1, self.his_len, self.param.n_state]).astype(
                              np.float32), np.array(state_pattern,
                                                    dtype=np.float32)

    def store(self, path, **kwargs):
        save_model_name = "sax_{}_{}.state_model".format(
            self.param.data_name, self.param.n_state)
        joblib.dump(self.model_obj, os.path.join(path, save_model_name))

    def restore(self, path, **kwargs):
        save_model_name = "sax_{}_{}.state_model".format(
            self.param.data_name, self.param.n_state)
        self.model_obj = joblib.load(os.path.join(path, save_model_name))
Example no. 13
def discretize_series(series_raw,
                      n_sax_symbols=6,
                      hours_in_segment=4,
                      inverse_transform=False):

    # Copy series
    series = series_raw.copy()

    # Normalize / rescale series
    data = normalize_series(series)

    # Determine PAA segment length for # of hours in segment
    n_paa_segments = __get_n_paa_segments(len(series), hours_in_segment)

    # SAX (and PAA) transform
    sax = SymbolicAggregateApproximation(alphabet_size_avg=n_sax_symbols,
                                         n_segments=n_paa_segments)
    model = sax.fit(data)
    transformed = model.transform(data)

    # (Optional) Transform discrete samples back to time series
    if inverse_transform: transformed = model.inverse_transform(transformed)

    return transformed, model
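The snippet above relies on two helpers that are not shown. A plausible sketch of them, assuming hourly samples and tslearn's mean/variance scaler (both names and behaviours are assumptions, not the original implementation):

import numpy as np
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

def normalize_series(series):
    # hypothetical: rescale the series to zero mean and unit variance
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    return scaler.fit_transform(np.asarray(series, dtype=float))

def __get_n_paa_segments(series_length, hours_in_segment):
    # hypothetical: with one sample per hour, one PAA segment covers
    # hours_in_segment consecutive samples
    return max(1, series_length // hours_in_segment)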
Example no. 14
def test_sax():
    unfitted_sax = SymbolicAggregateApproximation(n_segments=3,
                                                  alphabet_size_avg=2)
    data = [[-1., 2., 0.1, -1., 1., -1.], [1., 3.2, -1., -3., 1., -1.]]
    np.testing.assert_raises(NotFittedError, unfitted_sax.distance, data[0],
                             data[1])

    sax_est_no_scale = unfitted_sax
    sax_est_scale = clone(sax_est_no_scale)
    print(sax_est_scale.set_params)
    sax_est_scale.set_params(scale=True)
    n, sz, d = 2, 10, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, sz, d)
    for sax_est in [sax_est_no_scale, sax_est_scale]:
        sax_repr = sax_est.fit_transform(X)
        np.testing.assert_allclose(
            sax_est.distance(X[0], X[1]),
            sax_est.distance_sax(sax_repr[0], sax_repr[1]))
Example no. 15
class KNeighborsTimeSeriesMixin():
    """Mixin for k-neighbors searches on Time Series."""

    def _sax_preprocess(self, X, n_segments=10, alphabet_size_avg=4,
                        scale=False):
        # Now SAX-transform the time series
        if not hasattr(self, '_sax') or self._sax is None:
            self._sax = SymbolicAggregateApproximation(
                n_segments=n_segments,
                alphabet_size_avg=alphabet_size_avg,
                scale=scale
            )

        X = to_time_series_dataset(X)
        X_sax = self._sax.fit_transform(X)

        return X_sax

    def _get_metric_params(self):
        if self.metric_params is None:
            metric_params = {}
        else:
            metric_params = self.metric_params.copy()
        if "gamma_sdtw" in metric_params.keys():
            metric_params["gamma"] = metric_params["gamma_sdtw"]
            del metric_params["gamma_sdtw"]
        if "n_jobs" in metric_params.keys():
            del metric_params["n_jobs"]
        if "verbose" in metric_params.keys():
            del metric_params["verbose"]
        return metric_params

    def _precompute_cross_dist(self, X, other_X=None):
        if other_X is None:
            other_X = self._ts_fit

        self._ts_metric = self.metric
        self.metric = "precomputed"

        metric_params = self._get_metric_params()

        X = check_array(X, allow_nd=True, force_all_finite=False)
        X = to_time_series_dataset(X)

        if self._ts_metric == "dtw":
            X_ = cdist_dtw(X, other_X, n_jobs=self.n_jobs,
                           **metric_params)
        elif self._ts_metric == "ctw":
            X_ = cdist_ctw(X, other_X, **metric_params)
        elif self._ts_metric == "softdtw":
            X_ = cdist_soft_dtw(X, other_X, **metric_params)
        elif self._ts_metric == "sax":
            X = self._sax_preprocess(X, **metric_params)
            X_ = cdist_sax(X, self._sax.breakpoints_avg_,
                           self._sax._X_fit_dims_[1], other_X,
                           n_jobs=self.n_jobs)
        else:
            raise ValueError("Invalid metric recorded: %s" %
                             self._ts_metric)

        return X_

    def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
        """Finds the K-neighbors of a point.

        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : array-like, shape (n_ts, sz, d)
            The query time series.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.
        n_neighbors : int
            Number of neighbors to get (default is the value passed to the
            constructor).
        return_distance : boolean, optional. Defaults to True.
            If False, distances will not be returned

        Returns
        -------
        dist : array
            Array representing the distance to points, only present if
            return_distance=True
        ind : array
            Indices of the nearest points in the population matrix.
        """
        self_neighbors = False
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        if X is None:
            X = self._X_fit
            self_neighbors = True
        if self.metric == "precomputed":
            full_dist_matrix = X
        else:

            if X.ndim == 2:  # sklearn-format case
                X = X.reshape((X.shape[0], -1, self._d))
                fit_X = self._X_fit.reshape((self._X_fit.shape[0],
                                             -1,
                                             self._d))
            elif hasattr(self, '_ts_fit') and self._ts_fit is not None:
                fit_X = self._ts_fit
            else:
                fit_X = self._X_fit

            if (self.metric in TSLEARN_VALID_METRICS or
                    self.metric in [cdist_dtw, cdist_ctw,
                                    cdist_soft_dtw, cdist_sax]):
                full_dist_matrix = self._precompute_cross_dist(X,
                                                               other_X=fit_X)
            elif self.metric in ["euclidean", "sqeuclidean", "cityblock"]:
                full_dist_matrix = scipy_cdist(X.reshape((X.shape[0], -1)),
                                               fit_X.reshape((fit_X.shape[0],
                                                              -1)),
                                               metric=self.metric)
            else:
                raise ValueError("Unrecognized time series metric string: %s "
                                 "(should be one of 'dtw', 'softdtw', "
                                 "'sax', 'euclidean', 'sqeuclidean' "
                                 "or 'cityblock')" % self.metric)

        # Code similar to sklearn (sklearn/neighbors/base.py), to make sure
        # that TimeSeriesKNeighbor~(metric='euclidean') has the same results as
        # feeding a distance matrix to sklearn.KNeighbors~(metric='euclidean')
        kbin = min(n_neighbors - 1, full_dist_matrix.shape[1] - 1)
        # argpartition will make sure the first `kbin` entries are the
        # `kbin` smallest ones (but in arbitrary order) --> complexity: O(n)
        ind = numpy.argpartition(full_dist_matrix, kbin, axis=1)

        if self_neighbors:
            ind = ind[:, 1:]
        if n_neighbors > full_dist_matrix.shape[1]:
            n_neighbors = full_dist_matrix.shape[1]
        ind = ind[:, :n_neighbors]

        n_ts = X.shape[0]
        sample_range = numpy.arange(n_ts)[:, None]
        # Sort the `kbin` nearest neighbors according to distance
        ind = ind[
            sample_range, numpy.argsort(full_dist_matrix[sample_range, ind])]
        dist = full_dist_matrix[sample_range, ind]

        if hasattr(self, '_ts_metric'):
            self.metric = self._ts_metric

        if return_distance:
            return dist, ind
        else:
            return ind
        dist3 = paa.distance(Xtrain_paa[i,:],Xtest_paa[j,:])
        PAADist_train.append(dist3)

for i in range(len(y_test)):
    for j in range(len(y_train)):
        dist4 = paa.distance(Xtest_paa[i,:],Xtrain_paa[j,:])
        PAADist_test.append(dist4)   

PAADist_train = np.array(PAADist_train)
PAADist_train.resize(y_train.shape[0],int(len(PAADist_train)/y_train.shape[0]))
PAADist_test = np.array(PAADist_test)
PAADist_test.resize(y_test.shape[0],int(len(PAADist_test)/y_test.shape[0]))
'''
#SAX Transform + SAX feature extraction

sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
Xtrain_sax = sax.inverse_transform(sax.fit_transform(X_train))
Xtest_sax = sax.inverse_transform(sax.fit_transform(X_test))

SAX_test = Xtest_sax[:, :, 0]
SAX_train = Xtrain_sax[:, :, 0]
'''
#SAX distance calculation
SAXDist_train = []
SAXDist_test = []

for i in range(len(y_train)):
    for j in range(len(y_train)):
        dist3 = sax.distance(Xtrain_sax[i,:],Xtest_sax[j,:])
        SAXDist_train.append(dist3)
Example no. 17
 def fit(self, x, y=None):
     sax = SymbolicAggregateApproximation(n_segments=self.hislen,
                                          alphabet_size_avg=self.state_num)
     joblib.dump(sax, self.modelpath / 'states.m')
        dataset = []
        with open('output.ou') as f:  # here I just load the values of my dataset
            for linha in f:
                linha = linha.strip()
                if linha:
                    valores = linha.split(',')
                    a,b = int(valores[0]), float(valores[1])
                    dataset.append(b)
        self.dataset = dataset[:]  # welsu's hack so it is not passed by reference

        # exit = transform(300,300,dataset)
        # print exit
        # print len(exit[0])
np.set_printoptions(threshold=np.inf)
trans = Transform(300,300)
trans.read()
# trans.norm()
# trans.transform()
# trans.norm-(0)
# print trans.dataset
# print trans.invTrans_dataset[0]

sax = SymbolicAggregateApproximation(n_segments=300, alphabet_size_avg=300) 
trans.norm()   
aux = sax.fit_transform(trans.dataset)
aux1 = sax.inverse_transform(aux)
print(trans.dataset)
print(aux)
print(aux1)

# Nearest neighbor search
knn = KNeighborsTimeSeries(n_neighbors=3, metric="dtw")
knn.fit(X_train, y_train)
dists, ind = knn.kneighbors(X_test)
print("1. Nearest neighbour search")
print("Computed nearest neighbor indices (wrt DTW)\n", ind)
print("First nearest neighbor class:", y_test[ind[:, 0]])

# Nearest neighbor classification
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n2. Nearest neighbor classification using DTW")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification with a different metric (Euclidean distance)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n3. Nearest neighbor classification using L2")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification  based on SAX representation
sax_trans = SymbolicAggregateApproximation(n_segments=10, alphabet_size_avg=5)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
pipeline_model = Pipeline(steps=[('sax', sax_trans), ('knn', knn_clf)])
pipeline_model.fit(X_train, y_train)
predicted_labels = pipeline_model.predict(X_test)
print("\n4. Nearest neighbor classification using SAX+MINDIST")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))
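As an alternative to the Pipeline above, a sketch of the same idea that lets the classifier compute the SAX distance internally via metric="sax" (it assumes the same X_train, y_train, X_test, y_test; the parameter keys follow the metric_params convention shown in the kNN examples elsewhere on this page):

from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from sklearn.metrics import accuracy_score

knn_sax = KNeighborsTimeSeriesClassifier(
    n_neighbors=3, metric="sax",
    metric_params={"n_segments": 10, "alphabet_size_avg": 5})
knn_sax.fit(X_train, y_train)
print("Correct classification rate:",
      accuracy_score(y_test, knn_sax.predict(X_test)))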
Example no. 20
def main():
    # fetch original data
    #for test_quarter db
    ##    influx_url = "http://localhost:8086/query?db=" + dbname + \
    ##                 "&epoch=ms&q=SELECT+%22degrees%22+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546329600000ms+and+time+%3C%3D+1546329900000ms"

    #FOR NOAA DB
    ##    influx_url = "http://localhost:8086/query?db=" + dbname + \
    ##                 "&epoch=ms&q=SELECT+%22degrees%22+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1439856000000ms+and+time+%3C%3D+1439992520000ms+and%28%22location%22+%3D+%27santa_monica%27%29"
    # For test3
    influx_url = "http://localhost:8086/query?db=" + dbname + \
                 "&epoch=ms&q=SELECT+%22degrees%22+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546355705400ms+and+time+%3C%3D+1548969305400ms"

    r = requests.get(influx_url)
    json_dict = json.loads(r.content)

    data = json_dict["results"][0]["series"][0]["values"]
    ##    print(data[0])
    ##    print(data[1])
    time_interval = data[1][0] - data[0][0]  # consistent time interval
    print("time interval: ", time_interval)

    lst2 = [item[1] for item in data]
    n_segments = len(lst2)

    print("original data size", len(lst2))
    alphabet_size_avg = 20

    #generate sample data
    sample_size = 20
    ##    sample_url = "http://localhost:8086/query?db="+dbname+\
    ##                 "&epoch=ms&q=SELECT+sample%28%22degrees%22%2C" + str(sample_size) +\
    ##                 "%29+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546329600000ms+and+time+%3C%3D+1546329900000ms"
    # test3 sample (sin pattern)
    sample_url = "http://localhost:8086/query?db="+dbname+\
             "&epoch=ms&q=SELECT+sample%28%22degrees%22%2C" + str(sample_size) +\
             "%29+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546355705400ms+and+time+%3C%3D+1548969305400ms"

    ##    sample_url = "http://localhost:8086/query?db=" + dbname + \
    ##                 "&epoch=ms&q=SELECT+sample%28%22degrees%22%2C" + str(sample_size) +\
    ##                 "%29+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1439856000000ms+and+time+%3C%3D+1442612520000ms+and%28%22location%22+%3D+%27santa_monica%27%29"

    r2 = requests.get(sample_url)
    json_dict2 = json.loads(r2.content)
    sampled_data = json_dict2["results"][0]["series"][0][
        "values"]  # [[time, value], ...]

    print("sample length")
    print(len(sampled_data))

    sample = [item[1] for item in sampled_data]  #[value,...]

    #fill the sample data with a linear model
    start_x = data[0][0]
    end_x = data[-1][0]
    current_x = start_x
    current_loc = 0

    slope = (sampled_data[current_loc][1]-sampled_data[current_loc+1][1])\
            /(sampled_data[current_loc][0] - sampled_data[current_loc+1][0])
    intersection = sampled_data[current_loc][
        1] - slope * sampled_data[current_loc][0]

    sample_fit = []
    end_sample_x = sampled_data[-1][0]

    while current_x <= end_sample_x:
        if current_x >= sampled_data[
                current_loc +
                1][0] and current_loc + 1 < len(sampled_data) - 1:
            current_loc += 1
            slope = (sampled_data[current_loc] [1]-sampled_data[current_loc+1][1]) \
                    /(sampled_data[current_loc][0] - sampled_data[current_loc+1][0])
            intersection = sampled_data[current_loc][
                1] - slope * sampled_data[current_loc][0]

        sample_fit.append([current_x, slope * current_x + intersection])
        current_x += time_interval  #1000ms

    #chop the original data to match the linear fit sample data.
    chopped_data = []
    for item in data:
        if item[0] >= sample_fit[0][0] and item[0] <= sample_fit[-1][0]:
            chopped_data.append(item)
    print("len")
    print(len(sample_fit), len(chopped_data))
    chopped_lst2 = [item[1] for item in chopped_data]
    chopped_len = len(chopped_lst2)

    #build a sax model for chopped original data
    sax = SymbolicAggregateApproximation(chopped_len, alphabet_size_avg)
    scalar = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    sdb = scalar.fit_transform(chopped_lst2)
    s3 = sax.fit_transform(sdb)

    #build a sax model for linear-fit sampled data
    sample_fit_extract = [item[1] for item in sample_fit]
    fit_sample_data = scalar.fit_transform(sample_fit_extract)
    s4 = sax.fit_transform(fit_sample_data)

    # compute the distance between the two datasets to calculate the similarity
    print("distance")
    dist = sax.distance_sax(s3[0], s4[0])
    print(dist)
    print("normalized distance")
    print(dist / chopped_len)

    #plot the three dataset
    plot(sample_fit, sampled_data, lst2)
Example no. 21
def main():
    #FOR NOAA DB
    influx_url = "http://localhost:8086/query?db=" + dbname + \
                    "&epoch=ms&q=SELECT+%22water_level%22+FROM+%22h2o_feet%22+WHERE+time+%3E%3D+1440658277944ms+and+time+%3C%3D+1441435694328ms"

    r = requests.get(influx_url)
    json_dict = json.loads(r.content)

    data = json_dict["results"][0]["series"][0]["values"]
    print(data[0:5])
    
##    #NOTE:just for NOAA h2o_feet
    time_interval = data[2][0] - data[0][0]
    print("time interval:", time_interval)
   
    lst2 = [item[1] for item in data]
    n_segments = len(lst2)

    print(max(lst2),min(lst2))
    
    original_data_size = len(lst2)
    print("original data size:", original_data_size)
    
    alphabet_size_avg = math.ceil(max(lst2)-min(lst2))
    print("alphabet size avg:", alphabet_size_avg)


    ## a list of sample ratios.
    ## Want to select the min ratio within the similarity range.
    ratiolist = [0.025,0.05,0.1,0.15,0.2,0.3,0.4,0.5,0.6]
    sizelist = []
    distlist = []
    
    for ratio in ratiolist:
        print()
        print("ratio:",ratio)
            
        #generate sample data
        sample_size = math.floor(original_data_size * ratio)
        sizelist.append(sample_size)
        print("sample_size:",sample_size)

       #NOAA DB: h2o_feet
        sample_url = "http://localhost:8086/query?db=" + dbname + \
                    "&epoch=ms&q=SELECT+sample%28%22water_level%22%2C"+str(sample_size) + \
                    "%29+FROM+%22h2o_feet%22+WHERE+time+%3E%3D+1440658277944ms+and+time+%3C%3D+1441435694328ms"
        
        r2 = requests.get(sample_url)
        json_dict2 = json.loads(r2.content)
        sampled_data = json_dict2["results"][0]["series"][0]["values"] # [[time, value], ...]
        
        sample = [item[1] for item in sampled_data] #[value,...]

        #fill the sample data with a linear model
        start_x = data[0][0]
        end_x = data[-1][0]
        current_x = start_x
        current_loc = 0
        
        slope = (sampled_data[current_loc][1]-sampled_data[current_loc+2][1])\
                /(sampled_data[current_loc][0] - sampled_data[current_loc+2][0])      ##NOTE!
        intersection = sampled_data[current_loc][1]-slope*sampled_data[current_loc][0]

        sample_fit = []
        end_sample_x = sampled_data[-1][0]

        while current_x <= end_sample_x:
            if current_x >= sampled_data[current_loc+1][0] and current_loc+1 < len(sampled_data)-2:  ##NOTE: -2 !! CHANGE TO -1 LATER
                current_loc+=1
                ##NOTE: +2 was just for h2o_feet
                if (sampled_data[current_loc][0] - sampled_data[current_loc+1][0]) == 0:
    
                    slope = (sampled_data[current_loc] [1]-sampled_data[current_loc+1][1]) \
                            /(sampled_data[current_loc][0] - sampled_data[current_loc+2][0])
                else:
                    slope = (sampled_data[current_loc] [1]-sampled_data[current_loc+1][1]) \
                            /(sampled_data[current_loc][0] - sampled_data[current_loc+2][0])

                    
                intersection = sampled_data[current_loc][1] - slope*sampled_data[current_loc][0]
            
            
            sample_fit.append([current_x, slope*current_x+intersection])
            current_x += time_interval #1000ms
           
        #chop the original data to match the linear fit sample data.
        chopped_data = []
        for item in data:
            if item[0]>= sample_fit[0][0] and item[0] <= sample_fit[-1][0]:
                chopped_data.append(item)
        print("size of chopped_data:",len(chopped_data))

        chopped_lst2 = [item[1] for item in chopped_data]
        chopped_len = len(chopped_lst2)

        #build a sax model for chopped original data
        sax = SymbolicAggregateApproximation(chopped_len, alphabet_size_avg)
        scalar = TimeSeriesScalerMeanVariance(mu=0., std=1.)
        sdb = scalar.fit_transform(chopped_lst2)
        s3 = sax.fit_transform(sdb)

        #build a sax model for linear-fit sampled data
        sample_fit_extract = [item[1] for item in sample_fit]
        fit_sample_data = scalar.fit_transform(sample_fit_extract)
        s4 = sax.fit_transform(fit_sample_data)

        # compute the distance between the two datasets to calculate the similarity
        dist = sax.distance_sax(s3[0], s4[0])
        print("distance:", dist)
        norm_dist = 1000*dist/chopped_len
        distlist.append(norm_dist)
        print("normalized distance: {:.4f}".format(norm_dist))

    plotdist(ratiolist,distlist)
 def step_run(self, data):
     sax = SymbolicAggregateApproximation(n_segments=self.nb_segment,
                                          alphabet_size_avg=self.nb_symbol)
     sax_dataset = sax.fit_transform(data)
     sax_dataset_inv = sax.inverse_transform(sax_dataset)
     return sax_dataset, sax_dataset_inv
def test_sax():
    unfitted_sax = SymbolicAggregateApproximation(n_segments=3,
                                                  alphabet_size_avg=2)
    data = [[-1., 2., 0.1, -1., 1., -1.], [1., 3.2, -1., -3., 1., -1.]]
    np.testing.assert_raises(ValueError, unfitted_sax.distance, data[0],
                             data[1])
Example no. 24
def main():
    x = []  # x that will be passed to the knn
    y = []  # y that will be passed to the knn
    y_aux = []  # original values of the dataset
    x_aux = []  # original dates of the dataset
    y_saida = []  # 80% of its values are the original stock values; the remaining 20% will come from the prediction
    with open('output.ou') as f:  # here I just load the values of my dataset
        for linha in f:
            linha = linha.strip()
            if linha:
                valores = linha.split(',')
                a, b = trataValores(valores)
                x_aux.append(a)
                y_aux.append(b)

    x_aux, y_aux = ndtw.suavizacao(x_aux, y_aux)  # function that smooths the curves
    # maior = max(y_aux)  # this and the next 2 lines normalize the data because
    # y_aux = np.array(y_aux)  # the values need to be between
    # y_aux = y_aux/maior  # 0 and 1 for PAA, and consequently SAX, to work

    y_aux = ndtw.sigmoid(y_aux, 1)
    sax = SymbolicAggregateApproximation(n_segments=N_PAA,
                                         alphabet_size_avg=N_SAX)
    temp = sax.fit_transform(y_aux)
    classes_sax = []
    for i in temp[0]:
        classes_sax.append(i[0])

    count = 0
    cel = []
    for i in classes_sax[0:int(len(classes_sax) * 0.8)]:
        # loop over 80% of the list, building the x and y that will be passed to the knn
        # here x = [val1, val2, val3,...,valn] and y = val; the sliding window has WIN_SIZE values
        # basically I am building the knn input dataset with a sliding window
        count += 1
        y_saida.append(i)
        if (count % (WIN_SIZE + 1) == 0 and count != 0):
            cel.append(i)
            # cel = ndtw.sliding_window_normalizations([],cel,1)  # do the normalizations with mean and standard deviation
            y.append(cel[-1:])  # the last normalized value is my y
            x.append(cel[:WIN_SIZE])  # the first WIN_SIZE values are my x
            cel = []
        else:
            cel.append(i)

    obj = KNeighborsClassifier(metric=dtw, n_neighbors=1)

    # print "\n"
    # print y_saida

    obj.fit(x, y)

    # for i in range(int(len(y_aux)*0.2)+1):  # slicing lists like a BALLLSS
    #     passar = np.array(y_saida[-WIN_SIZE:]).reshape(1,-1)  # turn the window into a numpy array and reshape it because the knn complains
    #     volta = np.copy(passar)  # volta is a copy of passar that keeps the original values before the mean/std normalization, so the normalization can be reverted later to present the data
    #     passar = ndtw.sliding_window_normalizations([],passar,1)  # normalize with mean and standard deviation
    #     pred = obj.predict(passar)[0]  # get the normalized prediction
    #     passar = np.append(passar,pred)  # append it to the values the prediction was made from (values and prediction are normalized)
    #     passar = ndtw.sliding_window_normalizations(volta,passar,0)  # undo the normalization before adding to the output list
    #     y_saida.append(passar[-1:])  # put the obtained value in the output list

    for i in range(int(len(classes_sax) * 0.2) + 1):  # slicing lists like a BALLLSS
        # turn the window into a numpy array and reshape it because the knn complains
        passar = np.array(y_saida[-WIN_SIZE:]).reshape(1, -1)
        pred = obj.predict(passar)[0]  # get the normalized prediction
        # append it to the values the prediction was made from (values and prediction are normalized)
        passar = np.append(passar, pred)
        y_saida.append(passar[-1:][0])  # put the obtained value in the output list

    saida = []
    saida.append([])

    for i in y_saida:  # hack because I don't know how to use reshape
        saida[0].append([i])

    y_saida = sax.inverse_transform(saida)
    y_saida = np.array(y_saida)

    saida = []  # if takashi sees this he will hit me (pray for dave)
    for i in y_saida:  # double hack because I don't know how to use reshape
        for j in i:
            for k in j:
                saida.append(k)

    y_aux = ndtw.sigmoid(y_aux, 0)
    return x_aux, y_aux, saida
Example no. 25
dayPattern = []
for index in range(restData.shape[0]):
    cuData = restData[index].ravel()
    day = len(cuData) // 24
    total = np.zeros(24)
    for d in range(3):
        total += cuData[d * 24:(d + 1) * 24]
    dayPattern.append(total / day)

dayPattern = np.array(dayPattern)
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
dayPattern = scaler.fit_transform(dayPattern)
n_paa_segments = 24
n_sax_symbols = 5
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
dayPattern = sax.fit_transform(dayPattern)
dayPattern = dayPattern.reshape(dayPattern.shape[0], dayPattern.shape[1])
# Perform clustering
# Cluster the SAX-processed daily fluctuations into 50 clusters
s = time.time()
y_pre = KMeans(n_clusters=20).fit_predict(dayPattern)
clusNum = np.zeros(len(y_pre))
totalClus = 0
for k in range(max(y_pre) + 1):
    data = restData[y_pre == k]
    data = data.reshape(data.shape[0], data.shape[1])
    distance_matrix = trend_affinity(data)
    model = AgglomerativeClustering(n_clusters=None,
                                    affinity='precomputed',
                                    linkage='complete',
Example no. 26
 def build_model(self, **kwargs):
     self.his_len = kwargs['his_len']
     self.segment_dim = kwargs['segment_dim']
     self.model_obj = SymbolicAggregateApproximation(
         n_segments=self.his_len, alphabet_size_avg=self.param.n_state)
# Transform PAA, SAX, 1d-SAX,
for stockCode in pos_relatedStock:

    dataset = dfpivot['v_updownpercent'][stockCode]
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
    dataset = scaler.fit_transform(dataset)

    # PAA transform (and inverse transform) of the data
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

    # SAX transform
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))

    # 1d-SAX transform
    n_sax_symbols_avg = 8
    n_sax_symbols_slope = 8
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
    one_d_sax_dataset_inv = one_d_sax.inverse_transform(
        one_d_sax.fit_transform(dataset))

    graph_idx = graph_idx + 1
    plt.subplot(len(pos_relatedStock), 4, graph_idx)  # First, raw time series
    plt.plot(dataset[0].ravel(), "b-")
Example no. 28
    'Stock Fluctuations of 4 Renowned Telco Companies from Jan to Mar 2019')
plt.legend(loc='upper right')

# In[ ]:

# Performing PAA and SAX.

from tslearn.piecewise import PiecewiseAggregateApproximation
from tslearn.piecewise import SymbolicAggregateApproximation, OneD_SymbolicAggregateApproximation

n_paa_segments = 8
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
Digi_PAA_n8 = paa.inverse_transform(paa.fit_transform(Digi_Scaled))

n_sax_symbols = 8
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
Digi_SAX_n8 = sax.inverse_transform(sax.fit_transform(Digi_Scaled))

n_paa_segments = 16
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
Digi_PAA_n16 = paa.inverse_transform(paa.fit_transform(Digi_Scaled))

n_sax_symbols = 16
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
Digi_SAX_n16 = sax.inverse_transform(sax.fit_transform(Digi_Scaled))

# In[ ]:

# Visualize the PAA and SAX with different segments and symbols.