Code Example #1
File: _isax.py   Project: luk-f/pyCFOFiSAX
    def __init__(self, n_segments, alphabet_size_min=2, mean=0.0, std=1.0):
        """
        Initialization function of the class IndexableSymbolicAggregateApproximation

        :returns: a class of encoding *i*\ SAX
        :rtype: IndexableSymbolicAggregateApproximation
        """

        PiecewiseAggregateApproximation.__init__(self, n_segments)
        self.n_segments = n_segments
        self.alphabet_size_min = alphabet_size_min
        self.alphabet_size_max = alphabet_size_min

        self.mean = mean
        self.std = std

        self.card_to_bkpt_ = dict()
        self.card_to_bkpt_only_ = dict()
        self.card_to_bkpt_middle_ = dict()
        self.card_to_bkpt_[self.alphabet_size_min] = _breakpoints(
            self.alphabet_size_min, scale=self.std) + self.mean
        self.card_to_bkpt_only_[self.alphabet_size_min] = _breakpoints(
            self.alphabet_size_min, scale=self.std) + self.mean
        self.card_to_bkpt_middle_[self.alphabet_size_min] = _bin_medians(
            self.alphabet_size_min, scale=self.std) + self.mean
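The `_breakpoints` and `_bin_medians` helpers used above are not shown here. A minimal sketch of how such helpers are commonly implemented for SAX (a hypothetical re-implementation, not the pyCFOFiSAX source) uses equiprobable quantiles of a Gaussian with the given scale:

# Hypothetical sketch of the breakpoint helpers referenced above (not the pyCFOFiSAX source).
# SAX breakpoints are the quantiles splitting a N(0, scale^2) distribution into `card`
# equiprobable bins; bin "medians" are taken here as the mid-probability points of each bin.
import numpy as np
from scipy.stats import norm

def _breakpoints(card, scale=1.0):
    # card - 1 interior quantiles at probabilities 1/card, 2/card, ..., (card-1)/card
    return norm.ppf(np.arange(1, card) / card, scale=scale)

def _bin_medians(card, scale=1.0):
    # one representative value per bin, at probabilities 1/(2*card), 3/(2*card), ...
    return norm.ppf(np.arange(1, 2 * card, 2) / (2 * card), scale=scale)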
Code Example #2
    def __init__(self,
                 size_word: int,
                 threshold: int,
                 data_ts: np_ndarray,
                 base_cardinality: int = 2,
                 number_tree: int = 1,
                 indices_partition: list = None,
                 max_card_alphabet: int = 128,
                 boolean_card_max: bool = True):
        """
        Initialization function of the TreeISAX class

        :returns: a forest pointing to one or more iSAX trees
        :rtype: ForestISAX
        """

        # Number of cover contained in the SAX word
        self.size_word = size_word
        # threshold of split node
        self.threshold = threshold
        # Cardinality of each letter at level 1 of the tree
        self.base_cardinality = base_cardinality
        # Max cardinality
        self.max_cardinality = base_cardinality

        self._paa = PiecewiseAggregateApproximation(self.size_word)

        self.forest = {}
        self.number_tree = number_tree

        self.indices_partition = indices_partition

        self._init_trees(data_ts, max_card_alphabet, boolean_card_max)
Code Example #3
def genListPAA(instances_nor, windowSize, timestamp):
    paa = PiecewiseAggregateApproximation(n_segments=windowSize)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(instances_nor))

    return {
        "sketchInstances": list(paa_dataset_inv[0].ravel()),
        "timestamp": timestamp
    }
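A small usage sketch for genListPAA above (assuming numpy and the tslearn import shown in the other examples; the timestamp value is arbitrary):

import numpy as np

series = np.sin(np.linspace(0, 6 * np.pi, 120)).reshape(1, -1)   # one series of length 120
sketch = genListPAA(series, windowSize=12, timestamp="2021-01-01T00:00:00Z")
print(len(sketch["sketchInstances"]))   # 120 values reconstructed from 12 PAA segments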
Code Example #4
def saa_pax(dataset, title):
    """
    Show the graph of PAA and SAX of time series data
    :param dataset: time series of a stock
    :return:
    """
    n_ts, sz, d = 1, 100, 1
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
    dataset = scaler.fit_transform(dataset)

    # PAA transform (and inverse transform) of the data
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

    # SAX transform
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))

    # 1d-SAX transform
    n_sax_symbols_avg = 8
    n_sax_symbols_slope = 8
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
    one_d_sax_dataset_inv = one_d_sax.inverse_transform(
        one_d_sax.fit_transform(dataset))

    plt.figure()
    plt.subplot(2, 2, 1)  # First, raw time series
    plt.plot(dataset[0].ravel(), "b-")
    plt.title("Raw time series " + title)

    plt.subplot(2, 2, 2)  # Second, PAA
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(paa_dataset_inv[0].ravel(), "b-")
    plt.title("PAA " + title)

    plt.subplot(2, 2, 3)  # Then SAX
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(sax_dataset_inv[0].ravel(), "b-")
    plt.title("SAX, %d symbols" % n_sax_symbols)

    plt.subplot(2, 2, 4)  # Finally, 1d-SAX
    plt.plot(dataset[0].ravel(), "b-", alpha=0.4)
    plt.plot(one_d_sax_dataset_inv[0].ravel(), "b-")
    plt.title("1d-SAX, %d symbols (%dx%d)" %
              (n_sax_symbols_avg * n_sax_symbols_slope, n_sax_symbols_avg,
               n_sax_symbols_slope))

    plt.tight_layout()
    plt.show()
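A usage sketch for saa_pax (assuming the tslearn and matplotlib imports used above; random_walks is a tslearn generator):

from tslearn.generators import random_walks

dataset = random_walks(n_ts=1, sz=100, d=1)      # one random-walk series of length 100
saa_pax(dataset, title="(random walk)")          # plots raw, PAA, SAX and 1d-SAX panels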
Code Example #5
def test_serialize_paa():
    X = _get_random_walk()
    # PAA transform (and inverse transform) of the data
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)

    _check_not_fitted(paa)

    paa.fit(X)

    _check_params_predict(paa, X, ['transform'])
Code Example #6
def ApplyPaa(n_paa_segments, df, ckt):
    circuito = ckt
    print("Quantidade de segmentos de PAA: {}".format(n_paa_segments))
    paa = PiecewiseAggregateApproximation(n_paa_segments)
    scaler = TimeSeriesScalerMeanVariance()
    dadosPaa = df
    for i in range(0, len(df)):
        dataset = scaler.fit_transform(df[i])
        dadosPaa[i] = paa.inverse_transform(paa.fit_transform(dataset))[0]
    dadosPaa = dadosPaa.T

    return dadosPaa
Code Example #7
    def transform(self, x, y=None):

        x_new = []
        for i, time_series in enumerate(x):
            temp = []
            for j, dim in enumerate(time_series):
                if eval(self.paa):
                    paas_ = []
                    for seg in self.segs_:
                        s = int((dim.shape[0]) * seg)
                        if s < 1:
                            continue
                        #print(f"Compression: {seg}")
                        paa_per_seg = PiecewiseAggregateApproximation(n_segments=s)\
                                .fit_transform(dim).flatten()
                        paas_.extend(extract_stats(paa_per_seg))

                    temp.extend(paas_)
                else:
                    temp.extend(extract_stats(dim))
            x_new.append(temp)

        x_new = np.asarray(x_new)
        imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp_mean.fit(x_new)
        x_new = imp_mean.transform(x_new)

        return np.asarray(x_new)
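The extract_stats helper used above is not shown. A hypothetical stand-in that returns basic summary statistics could look like the following (the original may compute different features):

import numpy as np

def extract_stats(values):
    # hypothetical: mean, std, min, max and median of a 1-D array of values
    values = np.asarray(values, dtype=float)
    return [values.mean(), values.std(), values.min(), values.max(), np.median(values)]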
Code Example #8
File: _isax.py   Project: luk-f/pyCFOFiSAX
    def fit(self, X):
        """
        Prepares the data for *i*\ SAX encoding according to ``PiecewiseAggregateApproximation``

        :returns: Received data for encoding, defined by ``tslearn``
        :rtype: numpy.ndarray of PiecewiseAggregateApproximation
        """

        return PiecewiseAggregateApproximation.fit(self, X)
Code Example #9
def getStdData(originData):
    n_paa_segments = 120  # a day is split into 4 parts, each 6-hour block aggregated into one segment
    paa_data = PiecewiseAggregateApproximation(
        n_segments=n_paa_segments).fit_transform(originData)
    # perform mean-variance normalization
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    dataset = scaler.fit_transform(paa_data)
    dataset = dataset.reshape(dataset.shape[0], dataset.shape[1])
    return dataset
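Usage sketch for getStdData (hypothetical input shape: 30 series of length 480, reduced to 120 PAA segments and z-normalized; assumes the tslearn imports used elsewhere):

import numpy as np

raw = np.random.RandomState(0).randn(30, 480)
std_data = getStdData(raw)
print(std_data.shape)   # (30, 120)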
Code Example #10
def get_paa_transformation(df, features_to_compute='probability', segments=10):
    """
    Re sort dataframe station / ts
    Aggr time serie for each station
    Take the mean of each segment
    If the time serie can't be divide by segment. We add the last mean agg.
    df : DataFrame
    features_to_compute : string - column's name of the features we want to agg
    semgnets : int - number of point we want to agg.
    """
    paa_list_result = []
    df = df.reset_index()
    df = df.sort_values(['station', 'ts'])

    for station in df.station.unique():
        data = df[df.station == station]
        n_paa_segments = round((len(data) * segments / 100) - 0.5)
        paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
        paa_inv_transf = np.repeat(paa.fit_transform(
            data[features_to_compute].values)[0],
                                   segments,
                                   axis=0)

        if len(data) != len(paa_inv_transf):
            nb_to_add = len(data) - len(paa_inv_transf)
            value_to_add = np.repeat(np.mean(
                data[features_to_compute].values[-nb_to_add:]),
                                     nb_to_add,
                                     axis=0)  # Take the last X one and mean it
            result = np.append(
                paa_inv_transf,
                value_to_add)  # Append regular paa and last segment mean
            paa_list_result.extend(result)

        else:
            result = paa_inv_transf
            paa_list_result.extend(result)

    df['paa'] = paa_list_result
    df['paa'] = df['paa'].astype('float')
    df = df.sort_values(['ts', 'station'])
    df = df.set_index('ts')
    return df
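A standalone numpy sketch of the "pad the remainder with the mean of the leftover points" idea described in the docstring above (independent of the dataframe plumbing; sizes are hypothetical):

import numpy as np

values = np.random.RandomState(0).rand(23)     # length not divisible by the segment size
seg_len = 10
n_full = len(values) // seg_len                # 2 full segments of 10 points
means = values[:n_full * seg_len].reshape(n_full, seg_len).mean(axis=1)
approx = np.repeat(means, seg_len)             # piecewise-constant approximation, length 20
remainder = len(values) - len(approx)          # 3 leftover points
padded = np.append(approx, np.repeat(values[-remainder:].mean(), remainder))
assert len(padded) == len(values)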
Code Example #11
File: _isax.py   Project: luk-f/pyCFOFiSAX
    def _transform(self, X, card):
        """
        Transforms the ``X`` data passed as parameter first into PAA and then into SAX with cardinality ``card``.

        :param numpy.ndarray X: Data to transform
        :param int card: Cardinality to use for processing

        :returns: Transformed data in SAX
        :rtype: numpy.ndarray
        """

        X_paa = PiecewiseAggregateApproximation._transform(self, X)
        return self._transform_paa_to_isax(X_paa, card)
Code Example #12
def ApplyPaa(n_paa_segments,df):
    '''
    Applies PAA to the given dataframe.

    :param n_paa_segments: number of PAA segments used for data reduction
    :param df: dataframe with the data to which PAA should be applied
    :return: df after PAA has been applied
    '''
    df = df.values.T.tolist()
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    dadosPaa = scaler.fit_transform(df)
    print("Quantidade de segmentos de PAA: {}".format(n_paa_segments))
    paa = PiecewiseAggregateApproximation(n_paa_segments)
    dadosPaa = paa.inverse_transform(paa.fit_transform(dadosPaa))

    df = pd.DataFrame()

    for i in range(len(dadosPaa.T)):
        for j in range(len(dadosPaa.T[0])):
            df[j] = dadosPaa.T[i][j]

    return df
Code Example #13
File: _isax.py   Project: luk-f/pyCFOFiSAX
    def transform_paa(self, X):
        """
        Prepares the ``X`` data provided as parameter for ``tslearn`` encoding.
        Then transforms the ``X`` data into PAA.

        :param numpy.ndarray X: Data to transform

        :returns: Transformed data in PAA
        :rtype: numpy.ndarray
        """

        X_ = to_time_series_dataset(X)
        return PiecewiseAggregateApproximation._transform(self, X_)
Code Example #14
File: test_piecewise.py   Project: page1/tslearn
def test_paa():
    unfitted_paa = PiecewiseAggregateApproximation(n_segments=3)
    data = [[-1., 2., 0.1, -1., 1., -1.], [1., 3.2, -1., -3., 1., -1.]]
    np.testing.assert_raises(NotFittedError, unfitted_paa.distance, data[0],
                             data[1])

    paa_est = unfitted_paa
    n, sz, d = 2, 10, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, sz, d)
    paa_repr = paa_est.fit_transform(X)
    np.testing.assert_allclose(paa_est.distance(X[0], X[1]),
                               paa_est.distance_paa(paa_repr[0], paa_repr[1]))
Code Example #15
print("Num Stock: ", len(pos_relatedStock))

# Plotting Graph
plt.figure()
graph_idx = 0

# Transform PAA, SAX, 1d-SAX,
for stockCode in pos_relatedStock:

    dataset = dfpivot['v_updownpercent'][stockCode]
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series
    dataset = scaler.fit_transform(dataset)

    # PAA transform (and inverse transform) of the data
    n_paa_segments = 10
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

    # SAX transform
    n_sax_symbols = 8
    sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                         alphabet_size_avg=n_sax_symbols)
    sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))

    # 1d-SAX transform
    n_sax_symbols_avg = 8
    n_sax_symbols_slope = 8
    one_d_sax = OneD_SymbolicAggregateApproximation(
        n_segments=n_paa_segments,
        alphabet_size_avg=n_sax_symbols_avg,
        alphabet_size_slope=n_sax_symbols_slope)
Code Example #16
File: lastSolution.py   Project: wtysos11/k8sPredictor
    ans = origindata.copy()
    for i in range(len(ans)):
        ans[i] = np.where(ele == ans[i])[0][0]
    del ele
    return ans


#################################################################
# Initialization: compute new data from the original data
# starting from the PAA step

# The training data must be split before clustering. ratio must be chosen carefully so that the PAA segment length comes out to an integer

ratio = 0.9
n_paa_segments = 18
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_mid = paa.fit_transform(stdData[:, :int(ratio * stdData.shape[1])])
paa_mid = paa_mid.reshape(paa_mid.shape[0], paa_mid.shape[1])

first_clus = paa_mid.copy()
for i in range(len(first_clus)):
    first_clus[i] = rankbased(paa_mid[i])

#################################################################
# First clustering: run Birch to get an initial result, then refine with KMeans. Uses rank-based data
# Improvement: use the raw data directly and tune Birch's threshold
data = first_clus
s = time.time()
y_pre = Birch(n_clusters=None, threshold=getEpsilon(data,
                                                    0.8)).fit_predict(data)
y_pre = KMeans(n_clusters=max(y_pre) + 1, random_state=0).fit_predict(data)
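The rankbased helper shown at the top of this example is truncated. A hypothetical, self-contained reconstruction of the rank-based transform it appears to implement (each value replaced by its rank among the sorted unique values) could be:

import numpy as np

def rankbased(origindata):
    ele = np.unique(origindata)          # sorted unique values
    ans = origindata.copy()
    for i in range(len(ans)):
        ans[i] = np.where(ele == origindata[i])[0][0]   # rank of the i-th value
    return ans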
Code Example #17
def test_paa():
    unfitted_paa = PiecewiseAggregateApproximation(n_segments=3)
    data = [[-1., 2., 0.1, -1., 1., -1.], [1., 3.2, -1., -3., 1., -1.]]
    np.testing.assert_raises(ValueError, unfitted_paa.distance, data[0],
                             data[1])
Code Example #18
File: OnlyTreand.py   Project: wtysos11/k8sPredictor
# Evaluation criteria
# Speed-up: since hierarchical clustering is very slow (a distance matrix must be built), the hierarchical clustering result can be implemented by hand
# Step 1: preliminary screening. Because of the nature of clustering, a lot of the data does not actually need to be considered.
# Starting from each element, traverse every element and split them into several sets (elements already in a set need not be visited); all reachable elements go into the same set.
# Then run hierarchical clustering again; consider implementing it yourself
# Problem: no essential difference
# Speed-up idea 2: apply rank-based processing to one day of data, run KMeans directly into 100 clusters, then run interval clustering within each cluster
#################################################################
# Initialization: compute new data from the original data
# starting from the PAA step

# The training data must be split before clustering. ratio must be chosen carefully so that the PAA segment length comes out to an integer

ratio = 0.9
n_paa_segments = 18
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
originData = stdData[:, :int(ratio * stdData.shape[1])]  # raw data already known from the training portion
paa_mid = paa.fit_transform(originData)
paa_mid = paa_mid.reshape(paa_mid.shape[0], paa_mid.shape[1])
baseData = paa.inverse_transform(paa_mid)  # extract the baseline data
restData = originData - baseData  # compute the residual data

# Pattern extraction (sum and average directly and apply rank-based processing, or additionally standardize and apply SAX)
# Initial idea: overlay and average each day's 24-hour traffic, apply rank-based processing, then run 100-cluster KMeans on the MSE
# Idea 2: ...
# For each of the 100 clusters, run hierarchical clustering to subdivide further, down to a minimum of 1. Cluster quality is measured by the maximum intra-cluster similarity (how dissimilar the members are)
# dayPattern using SAX

# Approach 01: use SAX to extract the residual information of the first three days and cluster into 20 groups. Within each cluster run complete-linkage hierarchical clustering at 0.5. Given that ~500 items take 3 minutes, the total is roughly one hour.
from sklearn.cluster import AgglomerativeClustering
import time
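A minimal sketch (with hypothetical sizes and parameters) of the "SAX on the residuals, then 20-way clustering" step outlined in the comments above:

import numpy as np
from tslearn.piecewise import SymbolicAggregateApproximation
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
rest_sample = rng.randn(50, 72)                      # stand-in residuals: 50 series, 3 days x 24 h

sax = SymbolicAggregateApproximation(n_segments=18, alphabet_size_avg=8)
sax_words = sax.fit_transform(rest_sample)           # shape (50, 18, 1), integer symbols
labels = KMeans(n_clusters=20, random_state=0).fit_predict(sax_words.reshape(50, 18))
print(np.bincount(labels))                           # cluster sizes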
Code Example #19
class ForestISAX:
    """
    ForestISAX class containing one or more trees and pre-processing functions on the data contained
    in these trees

    :param int size_word: The size of the SAX words
    :param int threshold: The maximum threshold of nodes
    :param numpy.ndarray data_ts: The sequences to be inserted, from which the stats are extracted
    :param int base_cardinality: The smallest cardinality for the *i*\ SAX encoding
    :param int number_tree: The number of TreeISAX trees in the forest
    :param list indices_partition: a list of index lists specifying, for each tree, the indices of the
    sequences to be inserted
    :param int max_card_alphabet: if ``boolean_card_max == True``, the maximum cardinality of the *i*\ SAX
    encoding in each of the trees
    :param boolean boolean_card_max: if ``== True``, defines a maximum cardinality for the *i*\ SAX
    encoding of sequences in each of the trees

    :ivar list length_partition: The length of the SAX words in each tree (``== [size_word]`` if ``number_tree
    == 1``)
    """
    def __init__(self,
                 size_word: int,
                 threshold: int,
                 data_ts: np_ndarray,
                 base_cardinality: int = 2,
                 number_tree: int = 1,
                 indices_partition: list = None,
                 max_card_alphabet: int = 128,
                 boolean_card_max: bool = True):
        """
        Initialization function of the TreeISAX class

        :returns: a forest pointing to one or more iSAX trees
        :rtype: ForestISAX
        """

        # Number of cover contained in the SAX word
        self.size_word = size_word
        # threshold of split node
        self.threshold = threshold
        # Cardinality of each letter at level 1 of the tree
        self.base_cardinality = base_cardinality
        # Max cardinality
        self.max_cardinality = base_cardinality

        self._paa = PiecewiseAggregateApproximation(self.size_word)

        self.forest = {}
        self.number_tree = number_tree

        self.indices_partition = indices_partition

        self._init_trees(data_ts, max_card_alphabet, boolean_card_max)

    def _init_trees(self, data_ts: np_ndarray, max_card_alphabet: int,
                    boolean_card_max: bool):
        """
        Function that initializes the tree(s) when creating a ForestISAX object

        :param numpy.ndarray data_ts: The sequences to be inserted, from which the stats are extracted
        :param int max_card_alphabet: if ``boolean_card_max == True``, the maximum cardinality of the *i*\ SAX
        encoding in each of the trees
        :param boolean boolean_card_max: if ``boolean_card_max == True``, defines a maximum cardinality for the *i*\ SAX
        encoding of sequences in each tree
        """

        if self.number_tree == 1:
            """ if there is only one tree"""

            self.forest[0] = TreeISAX(size_word=self.size_word,
                                      threshold=self.threshold,
                                      data_ts=data_ts,
                                      base_cardinality=self.base_cardinality,
                                      max_card_alphabet=max_card_alphabet,
                                      boolean_card_max=boolean_card_max)
            self.length_partition = [self.size_word]
            self.indices_partition = [list(range(self.size_word))]

        elif self.indices_partition is None:
            """ If there is no tree and the indices are not defined """

            self.length_partition = [int(self.size_word / self.number_tree)
                                     ] * self.number_tree
            for reste in range(self.size_word - sum(self.length_partition)):
                self.length_partition[reste] += 1

            self.indices_partition = []

            for i in range(self.number_tree):
                self.forest[i] = TreeISAX(
                    size_word=self.length_partition[i],
                    threshold=self.threshold,
                    data_ts=data_ts[:, i:self.size_word:self.number_tree],
                    base_cardinality=2,
                    max_card_alphabet=max_card_alphabet,
                    boolean_card_max=boolean_card_max)
                self.indices_partition.append(
                    list(range(i, self.size_word, self.number_tree)))

        else:
            # List of the number of letters in each tree
            self.length_partition = []
            for part_tmp in self.indices_partition:
                self.length_partition.append(len(part_tmp))

            for i in range(self.number_tree):
                self.forest[i] = TreeISAX(
                    size_word=self.length_partition[i],
                    threshold=self.threshold,
                    data_ts=data_ts[:, self.indices_partition[i]],
                    base_cardinality=2,
                    max_card_alphabet=max_card_alphabet,
                    boolean_card_max=boolean_card_max)

    def index_data(self, new_sequences: np_ndarray):
        """
        The index_data function allows you to insert a large number of sequences

        :param numpy.ndarray new_sequences: The sequences to be inserted

        :returns: The number of sequences (sub-sequences) inserted into the tree(s)
        :rtype: numpy.array
        """

        # Ts Conversion to PAA
        if new_sequences.shape[-1] > 1:
            # add dim to avoid tslearn warning
            new_sequences = new_sequences.reshape(new_sequences.shape + (1, ))
        npaa = self._paa.fit_transform(new_sequences)

        # To count the number of objects in each tree
        cmpt_insert = np_zeros(shape=self.number_tree)

        for i, tree in self.forest.items():
            # Retrieves the indices of the tree, in the multi-tree case
            npaa_tmp = npaa[:, self.indices_partition[i]]
            npaa_tmp = npaa_tmp.reshape(npaa_tmp.shape[:-1])

            for npa_tp in npaa_tmp:
                tree.insert_paa(npa_tp)
                cmpt_insert[i] += 1

        # Returns array[tree_index] with the number of inserted objects for each tree
        return cmpt_insert

    def _count_nodes(self, id_tree: int):
        """
        The _count_nodes function returns the number of nodes and leaf nodes for a given tree.
        Uses :func:`~pyCFOFiSAX.tree_iSAX.TreeISAX.count_nodes_by_tree`.

        :param int id_tree: The tree ID to be analyzed

        :returns: the number of internal nodes, the number of leaf nodes
        :rtype: int, int
        """

        tree = self.forest[id_tree]
        return tree.count_nodes_by_tree()

    def list_nodes(self, id_tree: int, bool_print: bool = False):
        """
        Returns the lists of nodes and barycenters of the tree ``id_tree``. Displays statistics on standard output
        if ``bool_print == True``.
        Uses :func:`~pyCFOFiSAX.tree_iSAX.TreeISAX.get_list_nodes_and_barycentre`.

        :param int id_tree: The tree ID to be analyzed
        :param boolean bool_print: Displays the nodes stats on the standard output

        :returns: The list of nodes, the list of internal nodes, the list of barycenters
        :rtype: list, list, list
        """

        tree = self.forest[id_tree]
        node_list, node_list_leaf, node_leaf_ndarray_mean = tree.get_list_nodes_and_barycentre(
        )
        if bool_print:
            print(
                f"{len(node_list)} nodes whose {len(node_list_leaf)} leafs in tree {id_tree}"
            )

        return node_list, node_list_leaf, node_leaf_ndarray_mean

    def preprocessing_forest_for_icfof(self,
                                       ntss: np_ndarray,
                                       bool_print: bool = False,
                                       count_num_node: bool = False):
        """
        Calls, for each tree ``id_tree``, the pre-processing needed for the *i*\ CFOF computation

        :param ntss: Reference sequences
        :param boolean bool_print: if True, displays the times of each pre-treatment step
        :param boolean count_num_node: if True, count the number of nodes

        :returns: if count_num_node, returns the number of nodes contained in each tree
        :rtype: numpy.array
        """

        total_num_node = np_zeros(self.number_tree)
        for id_tree, tmp_tree in self.forest.items():
            ntss_tmp = ntss[:, self.indices_partition[id_tree]]
            total_num_node[id_tree] = tmp_tree.preprocessing_for_icfof(
                ntss_tmp, bool_print=bool_print, count_num_node=count_num_node)

        if count_num_node:
            return total_num_node

    def number_nodes_visited(self, query: np_array, ntss: np_ndarray):
        """
        Counts the average number of nodes visited in each tree when computing the approximation.

        :param numpy.array query: The sequence to be evaluated
        :param numpy.ndarray ntss: Reference sequences

        :returns: Returns the number of nodes visited in each tree for the approximation *i*\ CFOF
        :rtype: numpy.array
        """

        total_num_node = np_zeros(self.number_tree * 2)

        for id_tree, tmp_tree in self.forest.items():

            sub_query = query[self.indices_partition[id_tree]]
            ntss_tmp = np_array(ntss)[:, self.indices_partition[id_tree]]

            total_num_node[id_tree], total_num_node[self.number_tree + id_tree] = \
                tmp_tree.number_nodes_visited(sub_query, ntss_tmp)

        return total_num_node
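A hedged usage sketch for the ForestISAX class above (assumes pyCFOFiSAX is installed and exports ForestISAX at the package level; adjust the import to the actual module path if it differs):

import numpy as np
from pyCFOFiSAX import ForestISAX     # assumption: package-level export

rng = np.random.RandomState(0)
data_ts = rng.randn(200, 16)          # 200 sequences of length 16

forest = ForestISAX(size_word=8, threshold=20, data_ts=data_ts, number_tree=2)
print(forest.index_data(data_ts))     # number of sequences inserted into each of the 2 trees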
Code Example #20
EDist_train = []
for i in range(len(y_train)):
    for j in range(len(y_train)):
        dist1 = np.sqrt(
            np.sum((np.array(X_train[i, :]) - np.array(X_train[j, :]))**2))
        EDist_train.append(dist1)

EDist_train = np.array(EDist_train)
EDist_train.resize(y_train.shape[0], int(len(EDist_train) / y_train.shape[0]))
EDist_test = np.array(EDist_test)
EDist_test.resize(y_test.shape[0], int(len(EDist_test) / y_test.shape[0]))

#PAA transform + PAA feature extraction

paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
Xtrain_paa = paa.inverse_transform(paa.fit_transform(X_train))
Xtest_paa = paa.inverse_transform(paa.fit_transform(X_test))

PAA_test = Xtest_paa[:, :, 0]
PAA_train = Xtrain_paa[:, :, 0]
'''
#PAA distance calculation

PAADist_train = []
PAADist_test = []

for i in range(len(y_train)):
    for j in range(len(y_train)):
        dist3 = paa.distance(Xtrain_paa[i,:],Xtest_paa[j,:])
        PAADist_train.append(dist3)
Code Example #21
                stdData[index][vi] = -1*maxNum
            else:
                stdData[index][vi] = maxNum


# 2. Normalize the data after the removal step
# normalize once more
from tslearn.preprocessing import TimeSeriesScalerMinMax, TimeSeriesScalerMeanVariance
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
originStdData = stdData  # keep a copy for later restoration
stdData = scaler.fit_transform(stdData)

# 3. Then apply PAA to obtain the baseline and the residual values
from tslearn.piecewise import PiecewiseAggregateApproximation
n_paa_segments = 20
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_mid = paa.fit_transform(stdData)
paa_inv = paa.inverse_transform(paa_mid)
paa_inv = paa_inv.reshape(paa_inv.shape[0],paa_inv.shape[1])

# 4. Run a simple k-means on the PAA data, with at most 10 clusters; judge by the CH (Calinski-Harabasz) score and pick the best
# then apply rank-based processing and do a simple clustering
from sklearn.cluster import MiniBatchKMeans,KMeans,DBSCAN,SpectralClustering,Birch
from sklearn.metrics import calinski_harabasz_score,davies_bouldin_score

n_cluster = 1000
s = time.time()
km = KMeans(n_clusters = n_cluster,random_state = 0)
y_pre = km.fit_predict(paa_inv)
e = time.time()
print(e-s,"s")
        
Code Example #22
df_price = df_price.loc[:, day_features].replace(np.nan,0).values
# Standardizing the features
df_price = StandardScaler().fit_transform(df_price)
# add columns' name 
df_price = pd.DataFrame(df_price, columns = day_features)

dataset = df_price.values
print("price feature sample: ")
print(df_price.head())


# PAA transformation
# PAA transform (and inverse transform) of the data
n_paa_segments = 3
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_list = []
for item in df_price.values:
    item = item.reshape((1,5,1))
    paa_price_inv = paa.inverse_transform(paa.fit_transform(item))
    paa_list.append(paa_price_inv)
paa_array = np.array(paa_list)

paa_data = paa_array.reshape(1904, 5)
paa_df = pd.DataFrame(paa_data, columns = day_features)
print("save time series data after PAA")
paa_df.to_csv("./paa_stock_data_time_series.csv", sep=',', encoding='utf-8')
print("PAA sample: ")
print(paa_df.head())

Code Example #23
File: KLSE PAA & SAX.py   Project: coperli/Data-Mining
plt.xlabel("Day")
plt.ylabel('Scaled Price')
plt.title(
    'Stock Fluctuations of 4 Renowned Telco Companies from Jan to Mar 2019')
plt.legend(loc='upper right')

# In[ ]:

# Performing PAA and SAX.

from tslearn.piecewise import PiecewiseAggregateApproximation
from tslearn.piecewise import SymbolicAggregateApproximation, OneD_SymbolicAggregateApproximation

n_paa_segments = 8
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
Digi_PAA_n8 = paa.inverse_transform(paa.fit_transform(Digi_Scaled))

n_sax_symbols = 8
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
Digi_SAX_n8 = sax.inverse_transform(sax.fit_transform(Digi_Scaled))

n_paa_segments = 16
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
Digi_PAA_n16 = paa.inverse_transform(paa.fit_transform(Digi_Scaled))

n_sax_symbols = 16
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
Digi_SAX_n16 = sax.inverse_transform(sax.fit_transform(Digi_Scaled))