Example #1
def ComputeSax(sample_data, sample_data2):

    # Convert the incoming DataFrames to plain NumPy arrays
    sample_data = sample_data.to_numpy()
    sample_data2 = sample_data2.to_numpy()

    #########################################
    # SAX - Symbolic aggregate approximation
    #http://www.cs.ucr.edu/~eamonn/SAX.pdf
    ##########################################
    # PARAMETERS:
    # W: the number of PAA segments representing the time series, i.e. the length
    #    of the string representing the time series - useful for dimensionality reduction
    # Alphabet size: the number of distinct symbols (e.g., the alphabet {a, b, c} has size 3)

    downsample_ratio = 200
    word_length = len(sample_data[:, 1]) // downsample_ratio  # integer word length
    alphabet_size = 7

    s = SAX(word_length, alphabet_size)

    mic_distances = []
    for mic in range(1, 5):
        (x1String, x1Indices) = s.to_letter_rep(sample_data[:, mic])
        (x2String, x2Indices) = s.to_letter_rep(sample_data2[:, mic])

        #print x1String

        x1x2ComparisonScore = s.compare_strings(x1String, x2String)

        mic_distances.append(x1x2ComparisonScore)
        #print "Mic: " + str(mic) + ", distance= " + str(x1x2ComparisonScore)
    return mic_distances
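As a point of reference, here is a minimal, self-contained sketch of the two parameters described in the comments above (the PAA word length W and the alphabet size), using the same saxpy calls as the rest of this page; the sine signal and the values wordSize=5, alphabetSize=7 are purely illustrative.

from saxpy import SAX
import numpy as np

signal = np.sin(np.linspace(0, 15, num=1000))   # any 1-D series will do
s = SAX(wordSize=5, alphabetSize=7)             # W = 5 letters, alphabet a..g
letters = s.to_letter_rep(signal)[0]            # [0]: the word, [1]: segment indices
print(letters, len(letters))                    # a 5-character string over a..g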
Example #2
class TestSAX(object):
    def setUp(self):
        # All tests will be run with 6 letter words
        # and 5 letter alphabet
        self.sax = SAX(6, 5, 1e-6)

    def test_to_letter_rep(self):
        arr = [7, 1, 4, 4, 4, 4]
        (letters, indices) = self.sax.to_letter_rep(arr)
        assert letters == 'eacccc'

    def test_long_to_letter_rep(self):
        long_arr = [
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 10,
            100
        ]
        (letters, indices) = self.sax.to_letter_rep(long_arr)
        assert letters == 'bbbbce'

    def test_compare_strings(self):
        base_string = 'aaabbc'
        similar_string = 'aabbbc'
        dissimilar_string = 'ccddbc'
        similar_score = self.sax.compare_strings(base_string, similar_string)
        dissimilar_score = self.sax.compare_strings(base_string,
                                                    dissimilar_string)
        assert similar_score < dissimilar_score
Example #3
class TestSAX(object):
    def setUp(self):
        # All tests will be run with 6 letter words
        # and 5 letter alphabet
        self.sax = SAX(6, 5, 1e-6)

    def test_to_letter_rep(self):
        arr = [7, 1, 4, 4, 4, 4]
        (letters, indices, letter_boundries) = self.sax.to_letter_rep(arr)
        assert letters == 'eacccc'

    def test_long_to_letter_rep(self):
        long_arr = [
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 10,
            100
        ]
        (letters, indices, letter_boundries) = self.sax.to_letter_rep(long_arr)
        assert letters == 'bbbbce'

    def test_compare_strings(self):
        base_string = 'aaabbc'
        similar_string = 'aabbbc'
        dissimilar_string = 'ccddbc'
        similar_score = self.sax.compare_strings(base_string, similar_string)
        dissimilar_score = self.sax.compare_strings(base_string,
                                                    dissimilar_string)
        assert similar_score < dissimilar_score

    def test_from_letter_rep(self):
        arr = [7, 1, 4, 4, 4, 4]
        (letters, indices, letter_boundries) = self.sax.to_letter_rep(arr)
        reconstructed = self.sax.from_letter_rep(letters, indices,
                                                 letter_boundries)
        assert allclose(reconstructed, [6.21, 1.78, 4.0, 4.0, 4.0, 4.0],
                        atol=0.01)

    def test_breakpoints(self):
        assert allclose(self.sax.breakpoints(3), [-0.43, 0.43], atol=0.01)
        assert allclose(self.sax.breakpoints(2), [0], atol=0.01)
        assert allclose(self.sax.breakpoints(20), [
            -1.64, -1.28, -1.04, -0.84, -0.67, -0.52, -0.39, -0.25, -0.13, 0,
            0.13, 0.25, 0.39, 0.52, 0.67, 0.84, 1.04, 1.28, 1.64
        ],
                        atol=0.01)

    def test_interval_centres(self):
        assert allclose(self.sax.interval_centres(2), [-0.67, 0.67], atol=0.01)
        assert allclose(self.sax.interval_centres(3), [-0.96, 0.0, 0.96],
                        atol=0.01)
        assert allclose(self.sax.interval_centres(30), [
            -2.12, -1.64, -1.38, -1.19, -1.03, -0.90, -0.78, -0.67, -0.57,
            -0.47, -0.38, -0.29, -0.21, -0.12, -0.04, 0.04, 0.12, 0.21, 0.29,
            0.38, 0.47, 0.57, 0.67, 0.78, 0.90, 1.03, 1.19, 1.38, 1.64, 2.12
        ],
                        atol=0.01)
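For reference, the expected values in test_breakpoints are the quantiles that cut the standard normal distribution into equally probable regions, which is how SAX defines its breakpoints; a small sketch (not part of the test suite, and assuming SciPy is available) reproduces them.

from scipy.stats import norm

def gaussian_breakpoints(alphabet_size):
    # alphabet_size - 1 cut points at cumulative probabilities i / alphabet_size
    return [norm.ppf(i / alphabet_size) for i in range(1, alphabet_size)]

print(gaussian_breakpoints(3))   # ~[-0.43, 0.43], matching test_breakpoints above
print(gaussian_breakpoints(2))   # ~[0.0]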
Example #4
class TestSAX(object):
    def setUp(self):
        # All tests will be run with 6 letter words
        # and 5 letter alphabet
        self.sax = SAX(6, 5, 1e-6)

    def test_to_letter_rep(self):
        arr = [7, 1, 4, 4, 4, 4]
        (letters, indices) = self.sax.to_letter_rep(arr)
        assert letters == 'eacccc'

    def test_to_letter_rep_missing(self):
        arr = [7, 1, 4, 4, np.nan, 4]
        (letters, indices) = self.sax.to_letter_rep(arr)
        assert letters == 'eacc-c'

    def test_long_to_letter_rep(self):
        long_arr = [
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 10,
            100
        ]
        (letters, indices) = self.sax.to_letter_rep(long_arr)
        assert letters == 'bbbbce'

    def test_long_to_letter_rep_missing(self):
        long_arr = [
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, np.nan, 1, 1, 6, 6, 6, 6,
            10, 100
        ]
        (letters, indices) = self.sax.to_letter_rep(long_arr)
        assert letters == 'bbb-ce'

    def test_compare_strings(self):
        base_string = 'aaabbc'
        similar_string = 'aabbbc'
        dissimilar_string = 'ccddbc'
        similar_score = self.sax.compare_strings(base_string, similar_string)
        dissimilar_score = self.sax.compare_strings(base_string,
                                                    dissimilar_string)
        assert similar_score < dissimilar_score

    def test_compare_strings_missing(self):
        assert self.sax.compare_strings('a-b-c-', 'b-c-d-') == 0

    def test_normalize_missing(self):
        # two arrays which should normalize to the same result
        # except one should contain a nan value in place of the input nan value
        incomplete_arr_res = self.sax.normalize([1, 0, 0, 0, 0, 1, np.nan])
        complete_arr_res = self.sax.normalize([1, 0, 0, 0, 0, 1])
        assert np.array_equal(incomplete_arr_res[:-1], complete_arr_res)
        assert np.isnan(incomplete_arr_res[-1])

    def test_normalize_under_epsilon(self):
        array_under_epsilon = self.sax.normalize([1e-7, 2e-7, 1.5e-7])
        assert np.array_equal(array_under_epsilon, [0, 0, 0])
    def _get_SAX_spikes(cls, timeseries, timestamps, treshold):
        """
        Returns spikes counting how many times a timestamp is a maximum
        in a SAX conversion
        """

        # Seconds between measurements
        retention = (timestamps[-1] - timestamps[0]) / len(timestamps)

        # Number of entries per window
        entries_per_word = cls.WINDOW_SECONDS_COUNT / retention

        # Whole numbers of windows, entries and symbols
        num_windows = int(len(timeseries) / entries_per_word)
        window_size = int(len(timeseries) / num_windows)

        num_symbols = int(window_size * retention / cls.SECONDS_PER_SYMBOL)

        sax_generator = SAX(wordSize=num_symbols,
                            alphabetSize=cls.ALPHABET_SIZE)

        symbols_per_datapoint = int(
            round(cls.SECONDS_PER_SYMBOL / float(retention)))

        # Convert timeseries into SAX notation
        words, intervals = sax_generator.sliding_window(
            timeseries, num_windows, .8)

        # Number of windows in which index i is a maximal value
        maximum_count = {i: 0 for i in range(len(timeseries))}
        # Number of windows that cover index i
        window_count = {i: 0 for i in range(len(timeseries))}

        # Count in how many windows a timestamp is a local maximum
        for i in range(len(words)):
            word = words[i]
            interval = intervals[i]

            for j in range(len(word)):
                index = j * symbols_per_datapoint + interval[0]
                if word[j] == string.ascii_lowercase[cls.ALPHABET_SIZE - 1]:
                    maximum_count[index] += 1
                window_count[index] += 1

        spikes = {}
        for key, value in maximum_count.items():
            if value == window_count[key] and value and \
               timeseries[key] > treshold:
                val = timeseries[key]
                spikes[timestamps[key]] = cls._get_basic_spike_prio(
                    val, treshold)

        return spikes
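Below is a simplified, self-contained sketch of the spike-counting idea implemented above: slide fixed windows over a series, convert each window to a SAX word, and flag positions whose symbol is the top letter of the alphabet in every window that covers them. The series, window size, step and threshold are illustrative values, not the project's real WINDOW_SECONDS_COUNT / SECONDS_PER_SYMBOL constants.

import string
from saxpy import SAX

ALPHABET_SIZE = 5
series = [1, 1, 1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1]
window_size, step, threshold = 8, 4, 5
sax = SAX(wordSize=window_size, alphabetSize=ALPHABET_SIZE)   # one letter per point

top = string.ascii_lowercase[ALPHABET_SIZE - 1]
max_count = {i: 0 for i in range(len(series))}
win_count = {i: 0 for i in range(len(series))}

for start in range(0, len(series) - window_size + 1, step):
    word = sax.to_letter_rep(series[start:start + window_size])[0]
    for j, letter in enumerate(word):
        win_count[start + j] += 1
        if letter == top:
            max_count[start + j] += 1

spikes = [i for i, c in max_count.items()
          if c and c == win_count[i] and series[i] > threshold]
print(spikes)   # expected: [6]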
def saxify_and_export(df, csvf, alphabet=5):
    nrows, ncols = df.shape
    sample_size = ncols - 1
    sax = SAX(sample_size, alphabet, 1e-6)
    cols = ['label', 'sax']
    nv = []
    for i in range(nrows):
        values = df.iloc[i, 1:].values.tolist()
        v = {}
        v['label'] = int(df.iloc[i, 0])

        letters, _ = sax.to_letter_rep(values)
        v['sax'] = letters
        nv.append(v)
    return pd.DataFrame(nv, columns=cols).to_csv(csvf, index=False)
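A hypothetical usage sketch for saxify_and_export above, with a tiny labelled frame whose first column is the class label and whose remaining columns are the series values; the file name is illustrative.

import pandas as pd

df = pd.DataFrame([
    [0, 7, 1, 4, 4, 4, 4],
    [1, 1, 1, 1, 6, 10, 100],
])
saxify_and_export(df, 'sax_words.csv')   # writes label,sax rows such as 0,eacccc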
Example #8
class TestSAX(object):
    def setUp(self):
        # All tests will be run with 6 letter words
        # and 5 letter alphabet
        self.sax = SAX(6, 5, 1e-6)

    def test_to_letter_rep(self):
        arr = [7,1,4,4,4,4]
        (letters, indices) = self.sax.to_letter_rep(arr)
        assert letters == 'eacccc'

    def test_to_letter_rep_missing(self):
        arr = [7,1,4,4,np.nan,4]
        (letters, indices) = self.sax.to_letter_rep(arr)
        assert letters == 'eacc-c'

    def test_long_to_letter_rep(self):
        long_arr = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,6,6,6,6,10,100]
        (letters, indices) = self.sax.to_letter_rep(long_arr)
        assert letters == 'bbbbce'

    def test_long_to_letter_rep_missing(self):
        long_arr = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,np.nan,1,1,6,6,6,6,10,100]
        (letters, indices) = self.sax.to_letter_rep(long_arr)
        assert letters == 'bbb-ce'

    def test_compare_strings(self):
        base_string = 'aaabbc'
        similar_string = 'aabbbc'
        dissimilar_string = 'ccddbc'
        similar_score = self.sax.compare_strings(base_string, similar_string)
        dissimilar_score = self.sax.compare_strings(base_string, dissimilar_string)
        assert similar_score < dissimilar_score

    def test_compare_strings_missing(self):
        assert self.sax.compare_strings('a-b-c-', 'b-c-d-') == 0

    def test_normalize_missing(self):
        # two arrays which should normalize to the same result
        # except one should contain a nan value in place of the input nan value
        incomplete_arr_res = self.sax.normalize([1,0,0,0,0,1,np.nan])
        complete_arr_res = self.sax.normalize([1,0,0,0,0,1])
        assert np.array_equal(incomplete_arr_res[:-1], complete_arr_res)
        assert np.isnan(incomplete_arr_res[-1])

    def test_normalize_under_epsilon(self):
        array_under_epsilon = self.sax.normalize([1e-7, 2e-7, 1.5e-7])
        assert np.array_equal(array_under_epsilon, [0,0,0])
    def _get_SAX_spikes(cls, timeseries, timestamps, treshold):
        """
        Returns spikes counting how many times a timestamp is a maximum
        in a SAX conversion
        """

        # Seconds between measurements
        retention = (timestamps[-1] - timestamps[0]) / len(timestamps)

        # Number of entries per window
        entries_per_word = cls.WINDOW_SECONDS_COUNT / retention

        # Whole numbers of windows, entries and symbols
        num_windows = int(len(timeseries) / entries_per_word)
        window_size = int(len(timeseries) / num_windows)

        num_symbols = int(window_size * retention / cls.SECONDS_PER_SYMBOL)

        sax_generator = SAX(wordSize=num_symbols, alphabetSize=cls.ALPHABET_SIZE)

        symbols_per_datapoint = int(round(cls.SECONDS_PER_SYMBOL / float(retention)))

        # Convert timeseries into SAX notation
        words, intervals = sax_generator.sliding_window(timeseries, num_windows, 0.8)

        # Number of windows in which index i is a maximal value
        maximum_count = {i: 0 for i in range(len(timeseries))}
        # Number of windows that cover index i
        window_count = {i: 0 for i in range(len(timeseries))}

        # Count in how many windows a timestamp is a local maximum
        for i in range(len(words)):
            word = words[i]
            interval = intervals[i]

            for j in range(len(word)):
                index = j * symbols_per_datapoint + interval[0]
                if word[j] == string.ascii_lowercase[cls.ALPHABET_SIZE - 1]:
                    maximum_count[index] += 1
                window_count[index] += 1

        spikes = {}
        for key, value in maximum_count.items():
            if value == window_count[key] and value and timeseries[key] > treshold:
                val = timeseries[key]
                spikes[timestamps[key]] = cls._get_basic_spike_prio(val, treshold)

        return spikes
 def __init__(self,
              segmentLength=20,
              paaSize=5,
              alphabetSize=3,
              upperBound=100,
              lowerBound=-100):
     self.segmentLength = segmentLength
     self.paaSize = paaSize
     self.alphabetSize = alphabetSize
     self.upperBound = upperBound
     self.lowerBound = lowerBound
     self.sax = SAX(wordSize=paaSize,
                    alphabetSize=alphabetSize,
                    lowerBound=lowerBound,
                    upperBound=upperBound,
                    epsilon=1e-6)
     self.grammar = Grammar()
     self.segmentIndexes = []
     self.rule_set = []
     self.tsCount = 0
def sax_kmeans(X, K, wordSize, alphabetSize): 
    '''Cluster by SAX k-means
    
    Args:
        X: 2D np array of dimension (n_households, time)
        K: Number of clusters
        See https://github.com/nphoff/saxpy

    Returns:
        List of K centroids
        List of SAX k-means cluster assignments for each load in X
    '''
    
    np.random.seed(NUM)

    # Initialize to K random centers
    sax = SAX(wordSize=wordSize, alphabetSize=alphabetSize)
    idx = np.random.randint(X.shape[0], size=K)
    xmu = list(X[idx, :])
    mu = []
    
    for i in range(len(xmu)):
        mu.append(sax.to_letter_rep(xmu[i])[0])   
    oldmu = []

    strX = []
    for i in range(X.shape[0]):
        strX.append(sax.to_letter_rep(X[i])[0])

    #i = 1
    while not has_converged(mu, oldmu):
        oldmu = mu
        # Assign all points in X to clusters
        clusters, mu_ind = cluster_points(X, strX, mu, sax)
        # Reevaluate centers
        mu = reevaluate_centers(oldmu, clusters, sax)

    return mu, mu_ind
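A hypothetical call of sax_kmeans, assuming the module-level pieces it relies on (NUM, has_converged, cluster_points and reevaluate_centers) are defined as in the original project; the data shape and K are illustrative.

import numpy as np

X = np.random.rand(50, 96)    # e.g. 50 households, 96 readings each
centroids, assignments = sax_kmeans(X, K=3, wordSize=20, alphabetSize=5)
print(centroids)              # K centroids as SAX words (strings of letters)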
Example #12
class TestSAX(object):
    def setUp(self):
        # All tests will be run with 6 letter words
        # and 5 letter alphabet
        self.sax = SAX(6, 5, 1e-6)

    def test_to_letter_rep(self):
        arr = [7, 1, 4, 4, 4, 4]
        (letters, indices) = self.sax.to_letter_rep(arr)
        assert letters == "eacccc"

    def test_long_to_letter_rep(self):
        long_arr = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 10, 100]
        (letters, indices) = self.sax.to_letter_rep(long_arr)
        assert letters == "bbbbce"

    def test_compare_strings(self):
        base_string = "aaabbc"
        similar_string = "aabbbc"
        dissimilar_string = "ccddbc"
        similar_score = self.sax.compare_strings(base_string, similar_string)
        dissimilar_score = self.sax.compare_strings(base_string, dissimilar_string)
        assert similar_score < dissimilar_score
class SymbolicClustering(object):
    def __init__(self,
                 segmentLength=20,
                 paaSize=5,
                 alphabetSize=3,
                 upperBound=100,
                 lowerBound=-100):
        self.segmentLength = segmentLength
        self.paaSize = paaSize
        self.alphabetSize = alphabetSize
        self.upperBound = upperBound
        self.lowerBound = lowerBound
        self.sax = SAX(wordSize=paaSize,
                       alphabetSize=alphabetSize,
                       lowerBound=lowerBound,
                       upperBound=upperBound,
                       epsilon=1e-6)
        self.grammar = Grammar()
        self.segmentIndexes = []
        self.rule_set = []
        self.tsCount = 0

    def printSegments(self):
        print("\nCurrent Segments:")
        for segmentIndex in self.segmentIndexes:
            segmentIndex.printContent()

    def discretize(self, s):
        """
        @description  : discretize a single series using the modified PAA method.
        ---------
        @param  : s -- timeseries in array format
        -------
        @Returns  : a list of segments which are discretized from the input.
        -------
        """

        n = len(s)
        segments = []
        if n % self.segmentLength != 0:
            raise SegmentsCanNotBeEquallyDivided()
        nSegment = int(n / self.segmentLength)
        for i in range(0, nSegment):
            start = i * self.segmentLength
            end = (i + 1) * self.segmentLength
            if self.tsCount == 0:
                self.segmentIndexes.append(SegmentIndex((start, end)))
            (letters, indices) = self.sax.to_letter_rep_ori(s[start:end])
            segment = Segment(s[start:end], letters, indices,
                              self.segmentIndexes[i])
            self.segmentIndexes[i].addSegment(segment)
            segments.append(segment)
        self.tsCount += 1
        return segments

    def grammar_induction(self, segments):
        """
        @description  : get grammar from segments.
        ---------
        @param  : segments -- a list of segments
        -------
        @Returns  :
        -------
        """
        self.grammar.train_string(segments)
        self.rule_set = self.grammar.get_rule_set()

    def get_frequency_matrix(self):
        """
        @description  : for each segment, count how often it is covered by each grammar rule.
        ---------
        @param  :
        -------
        @Returns  : a two-dimensional matrix giving, for each segment, how often it is covered by a certain grammar rule.

        -------
        """

        frequencyMatrix = []
        for segmentIndex in self.segmentIndexes:
            rDict = {}
            for j in range(0, self.tsCount):
                segment = segmentIndex.getSegment(j)
                rule = segment.getRule()
                if rDict.get(rule) is None:
                    rDict[rule] = 1
                else:
                    rDict[rule] += 1

            rowFrequency = []
            for j in range(0, self.tsCount):
                rule = segmentIndex.getSegment(j).getRule()
                if rule == self.grammar.root_production:
                    rowFrequency.append(1)
                else:
                    rowFrequency.append(
                        rDict[segmentIndex.getSegment(j).getRule()])

            frequencyMatrix.append(rowFrequency)

        return frequencyMatrix

    def cut_window(self, frequencyMatrix):
        """
        @description  : generate windows from the frequencyMatrix. The points where the frequency pattern changes are the cut lines.
        ---------
        @param  : frequencyMatrix -- a two-dimensional matrix
        -------
        @Returns  : a list of windows
        -------
        """

        start = 0
        windows = []
        for now in range(1, len(frequencyMatrix)):
            if frequencyMatrix[now] != frequencyMatrix[start]:
                windows.append(Window(start, now, self.segmentLength))
                start = now
        windows.append(Window(start, len(frequencyMatrix), self.segmentLength))
        return windows

    def generateInitialClusters(self, startIndex, windows):
        """
        @description  : generate initial clusters in each window. The clusters do not
        overlap each other, but together they cover all the segments.
        ---------
        @param  : startIndex -- the start number of p time series.
                  windows  --  the cut window used to generate clusters
        -------
        @Returns  : new windows that each of which contains the generated clusters
        -------
        """

        for window in windows:
            window.initSubsequences(startIndex, self)
            window.initClusters(self)
            window.clustersCombination()
            window.clustersBreakingTie()
            window.clustersProcessMiss()
            window.computeAllDistancesAndCentroids()
        return windows
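A hypothetical end-to-end sketch of how the SymbolicClustering methods above fit together; the call order and the random input series are illustrative guesses, and Grammar, Segment, SegmentIndex and Window are assumed to come from the same project.

import numpy as np

sc = SymbolicClustering(segmentLength=20, paaSize=5, alphabetSize=3)
series_set = [np.random.rand(100) for _ in range(4)]   # lengths divisible by 20

for s in series_set:
    segments = sc.discretize(s)          # modified PAA + SAX per segment
    sc.grammar_induction(segments)       # learn grammar rules from the segments

freq = sc.get_frequency_matrix()         # rule frequency per segment and series
windows = sc.cut_window(freq)            # cut where the frequencies change
windows = sc.generateInitialClusters(0, windows)   # initial clusters per window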
Example #14
 def setUp(self):
     # All tests will be run with 6 letter words
     # and 5 letter alphabet
     self.sax = SAX(6, 5, 1e-6)
Example #15
def sax_rep(word, letter, ary):
    ary = np.asarray(ary)
    sax = SAX(word, letter)
    return sax.to_letter_rep(ary)
Example #16
def min_dist_sax(t1String, t2String, word, alpha, eps=0.000001):
    s = SAX(word, alpha, eps)
    return s.compare_strings(t1String, t2String)
def convert_sax(ts, word, alpha, eps=0.000001):
    s = SAX(word, alpha, eps)
    (t1String, t1Indices) = s.to_letter_rep(ts)
    return t1String
Example #19
from saxpy import SAX
import os

with open("data.txt") as f:
    data = f.readlines()

s = SAX(32, 10)
for line in data:
    x = [float(p) for p in line.split()]
    print(s.to_letter_rep(x)[0])
from saxpy import SAX
import numpy as np
import matplotlib.pyplot as plt


t = np.linspace(0, 15, num=100)
t2 = np.linspace(1, 16, num=100)

data = np.sin(t)
data2 = np.sin(t2)

sax = SAX(wordSize=20)

rep = sax.to_letter_rep(data)
print(rep)

rep2 = sax.to_letter_rep(data2)
print(rep2)

plt.plot(data, 'b')
plt.plot(data2, 'r')
plt.show()
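As a follow-up, the two words can be scored against each other with compare_strings, the same call used elsewhere on this page; a small distance means the shifted sine still looks similar under SAX. rep and rep2 are tuples, so the word is taken explicitly.

score = sax.compare_strings(rep[0], rep2[0])
print(score)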
Example #21
def convert_sax(ts, word, alpha, eps=0.000001):
    s = SAX(word, alpha, eps)
    (t1String, t1Indices) = s.to_letter_rep(ts)
    return t1String
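A hypothetical usage sketch combining convert_sax with min_dist_sax from the earlier example on this page: convert two short series to SAX words, then score the words against each other.

t1 = convert_sax([7, 1, 4, 4, 4, 4], word=6, alpha=5)
t2 = convert_sax([2, 8, 3, 3, 3, 3], word=6, alpha=5)
print(t1, t2, min_dist_sax(t1, t2, word=6, alpha=5))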
Example #22

# Imports inferred from the calls below; normalize is assumed to be
# sklearn.preprocessing.normalize, while read_para, Time_series, WindowSliding,
# Fitting and DrawPlot come from this project's own modules.
import sys

import numpy as np
from matplotlib.lines import Line2D
from matplotlib.pyplot import figure, gca, show
from sklearn.preprocessing import normalize


def DrawLines(lines):
    ax = gca()
    for line in lines:
        tline = Line2D((line[0], line[2]), (line[1], line[3]))
        ax.add_line(tline)


n, w, a = read_para(sys.argv[1:])

# 1. Represent the series with SAX and calculate the symbol frequency

x = Time_series.Time_series_CAR(n)
data = x.tolist()
sax = SAX(w, a, 1e-6)
(letters, indices) = sax.to_letter_rep(data)
frq = sax.symbol_frequency(data)

# 2. Dimensionality reduction with linear interpolation

a = np.asarray(data, dtype=np.float64)
newdata = (a + np.random.normal(0, 3, n)).tolist()
nordata = normalize(a[:, np.newaxis], axis=0).ravel()

figure()
lines = WindowSliding.WindowSliding(nordata, Fitting.Fitting,
                                    Fitting.SumofSquaredError)
DrawPlot(nordata, 'Piecewise linear approximation with Sliding Window')
DrawLines(lines)
show()
Example #23
 def setUp(self):
     # All tests will be run with 6 letter words
     # and 5 letter alphabet
     self.sax = SAX(6, 5, 1e-6)
Example #24
def sax_rep(word,letter,ary):
    ary = np.asarray(ary)
    sax = SAX(word,letter)
    return sax.to_letter_rep(ary)