Example 1
def sax_similarity(data, seq_len):
    import numpy as np
    from tslearn.piecewise import SymbolicAggregateApproximation

    print('|--- Calculating the pairwise distance!')

    # One PAA segment per seq_len time steps.
    n_paa_segments = int(data.shape[0] / seq_len)
    sax_ins = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                             alphabet_size_avg=10)

    # tslearn expects (n_series, n_timesteps), hence the transpose.
    sax_repre = sax_ins.fit_transform(np.transpose(data))

    # Pairwise SAX (MINDIST) distances between all series.
    sax_mx_dist = np.zeros(shape=(data.shape[1], data.shape[1]))

    for i in range(data.shape[1]):
        for j in range(data.shape[1]):
            sax_mx_dist[i, j] = sax_ins.distance_sax(sax_repre[i], sax_repre[j])

    return sax_mx_dist
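
A minimal usage sketch for sax_similarity (the random input and its shape are illustrative assumptions, not from the original source):

import numpy as np

data = np.random.randn(50, 5)                    # 50 time steps, 5 series
dist_matrix = sax_similarity(data, seq_len=10)   # 50/10 = 5 PAA segments per series
print(dist_matrix.shape)                         # (5, 5), symmetric, zero diagonal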
Example 2
def main():
    # NOAA DB: raw water_level readings. The encoded query decodes to:
    #   SELECT "water_level" FROM "h2o_feet"
    #   WHERE time >= 1440658277944ms and time <= 1441435694328ms
    influx_url = "http://localhost:8086/query?db=" + dbname + \
                    "&epoch=ms&q=SELECT+%22water_level%22+FROM+%22h2o_feet%22+WHERE+time+%3E%3D+1440658277944ms+and+time+%3C%3D+1441435694328ms"

    r = requests.get(influx_url)
    json_dict = json.loads(r.content)

    data = json_dict["results"][0]["series"][0]["values"]
    print(data[0:5])
    
    # NOTE: just for NOAA h2o_feet -- rows come in timestamp pairs
    # (one per location), so step by 2 to get the true sampling interval.
    time_interval = data[2][0] - data[0][0]
    print("time interval:", time_interval)
   
    lst2 = [item[1] for item in data]
    n_segments = len(lst2)

    print("max/min:", max(lst2), min(lst2))
    
    original_data_size = len(lst2)
    print("original data size:", original_data_size)
    
    alphabet_size_avg = math.ceil(max(lst2)-min(lst2))
    print("alphabet size avg:", alphabet_size_avg)


    ## Candidate sample ratios; the goal is to pick the smallest ratio
    ## whose distance still falls within the similarity range.
    ratiolist = [0.025, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6]
    sizelist = []
    distlist = []
    
    for ratio in ratiolist:
        print()
        print("ratio:",ratio)
            
        #generate sample data
        sample_size = math.floor(original_data_size * ratio)
        sizelist.append(sample_size)
        print("sample_size:",sample_size)

        # NOAA DB h2o_feet: SELECT sample("water_level", sample_size) over the same range
        sample_url = "http://localhost:8086/query?db=" + dbname + \
                    "&epoch=ms&q=SELECT+sample%28%22water_level%22%2C"+str(sample_size) + \
                    "%29+FROM+%22h2o_feet%22+WHERE+time+%3E%3D+1440658277944ms+and+time+%3C%3D+1441435694328ms"
        
        r2 = requests.get(sample_url)
        json_dict2 = json.loads(r2.content)
        sampled_data = json_dict2["results"][0]["series"][0]["values"] # [[time, value], ...]
        
        sample = [item[1] for item in sampled_data] #[value,...]

        # Fill the gaps in the sampled data with a piecewise-linear model.
        start_x = data[0][0]
        end_x = data[-1][0]
        current_x = start_x
        current_loc = 0

        # NOTE: step by 2 because h2o_feet rows come in timestamp pairs.
        slope = (sampled_data[current_loc][1] - sampled_data[current_loc+2][1]) \
                / (sampled_data[current_loc][0] - sampled_data[current_loc+2][0])
        intercept = sampled_data[current_loc][1] - slope*sampled_data[current_loc][0]

        sample_fit = []
        end_sample_x = sampled_data[-1][0]

        while current_x <= end_sample_x:
            # Advance to the next sample segment once current_x passes it.
            if current_x >= sampled_data[current_loc+1][0] and current_loc+1 < len(sampled_data)-2:
                current_loc += 1
                # If two consecutive samples share a timestamp (h2o_feet pairs),
                # skip one point ahead to avoid a zero denominator.
                if (sampled_data[current_loc][0] - sampled_data[current_loc+1][0]) == 0:
                    slope = (sampled_data[current_loc][1] - sampled_data[current_loc+2][1]) \
                            / (sampled_data[current_loc][0] - sampled_data[current_loc+2][0])
                else:
                    slope = (sampled_data[current_loc][1] - sampled_data[current_loc+1][1]) \
                            / (sampled_data[current_loc][0] - sampled_data[current_loc+1][0])

                intercept = sampled_data[current_loc][1] - slope*sampled_data[current_loc][0]

            sample_fit.append([current_x, slope*current_x + intercept])
            current_x += time_interval
           
        # Chop the original data to the time span covered by the linear fit.
        chopped_data = []
        for item in data:
            if item[0] >= sample_fit[0][0] and item[0] <= sample_fit[-1][0]:
                chopped_data.append(item)
        print("size of chopped_data:",len(chopped_data))

        chopped_lst2 = [item[1] for item in chopped_data]
        chopped_len = len(chopped_lst2)

        # Build a SAX model and discretize the chopped original data
        # (n_segments = chopped_len, i.e. one symbol per point).
        sax = SymbolicAggregateApproximation(n_segments=chopped_len,
                                             alphabet_size_avg=alphabet_size_avg)
        scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
        sdb = scaler.fit_transform(chopped_lst2)
        s3 = sax.fit_transform(sdb)

        # Discretize the linear-fit sampled data with the same SAX model.
        sample_fit_extract = [item[1] for item in sample_fit]
        fit_sample_data = scaler.fit_transform(sample_fit_extract)
        s4 = sax.transform(fit_sample_data)

        # Compute the SAX distance between the two datasets as the similarity measure.
        dist = sax.distance_sax(s3[0], s4[0])
        print("distance:", dist)
        norm_dist = 1000*dist/chopped_len
        distlist.append(norm_dist)
        print("normalized distance: {:.4f}".format(norm_dist))

    plotdist(ratiolist,distlist)
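
The hand-encoded query strings above are hard to read; a sketch of an equivalent way to build the sampling URL with urllib (the helper name is illustrative; the InfluxQL is decoded from the URL used in the loop):

from urllib.parse import urlencode

def build_sample_url(dbname, sample_size):
    query = ('SELECT sample("water_level",{}) FROM "h2o_feet" '
             'WHERE time >= 1440658277944ms and time <= 1441435694328ms'
             ).format(sample_size)
    # urlencode applies the same percent-encoding as the hand-written
    # string (spaces as '+', quotes as %22, parentheses as %28/%29).
    return 'http://localhost:8086/query?' + urlencode(
        {'db': dbname, 'epoch': 'ms', 'q': query})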
Example 3
rows, cols = data.shape
print(rows, cols)

# PAA transform (and inverse transform) of the data
n_paa_segments = 10
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_dataset_inv = paa.inverse_transform(paa.fit_transform(dataset))

# SAX transform
n_sax_symbols = 8
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments,
                                     alphabet_size_avg=n_sax_symbols)
sax_dataset_inv = sax.inverse_transform(sax.fit_transform(dataset))
sax_data = sax.fit_transform(data)
print(sax_data)
# SAX (MINDIST) distance between the first two series
d = sax.distance_sax(sax_data[0], sax_data[1])
print(d)

# 1d-SAX transform
n_sax_symbols_avg = 8
n_sax_symbols_slope = 8
one_d_sax = OneD_SymbolicAggregateApproximation(
    n_segments=n_paa_segments,
    alphabet_size_avg=n_sax_symbols_avg,
    alphabet_size_slope=n_sax_symbols_slope)
one_d_sax_dataset_inv = one_d_sax.inverse_transform(
    one_d_sax.fit_transform(dataset))

plt.figure()
plt.subplot(2, 2, 1)  # First, raw time series
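
This snippet relies on `dataset` and `data` being defined earlier in the file. A self-contained sketch of the same distance computation, assuming a small synthetic dataset from tslearn's random_walks generator:

from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.piecewise import SymbolicAggregateApproximation

dataset = random_walks(n_ts=2, sz=100)  # two random-walk series of length 100
dataset = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(dataset)

sax = SymbolicAggregateApproximation(n_segments=10, alphabet_size_avg=8)
sax_repr = sax.fit_transform(dataset)   # shape (2, 10, 1): 10 symbols per series
print(sax.distance_sax(sax_repr[0], sax_repr[1]))  # MINDIST between the two walks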
Example 4
def main():
    # fetch original data
    #for test_quarter db
    ##    influx_url = "http://localhost:8086/query?db=" + dbname + \
    ##                 "&epoch=ms&q=SELECT+%22degrees%22+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546329600000ms+and+time+%3C%3D+1546329900000ms"

    #FOR NOAA DB
    ##    influx_url = "http://localhost:8086/query?db=" + dbname + \
    ##                 "&epoch=ms&q=SELECT+%22degrees%22+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1439856000000ms+and+time+%3C%3D+1439992520000ms+and%28%22location%22+%3D+%27santa_monica%27%29"
    # For test3. The encoded query decodes to:
    #   SELECT "degrees" FROM "h2o_temperature"
    #   WHERE time >= 1546355705400ms and time <= 1548969305400ms
    influx_url = "http://localhost:8086/query?db=" + dbname + \
                 "&epoch=ms&q=SELECT+%22degrees%22+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546355705400ms+and+time+%3C%3D+1548969305400ms"

    r = requests.get(influx_url)
    json_dict = json.loads(r.content)

    data = json_dict["results"][0]["series"][0]["values"]
    ##    print(data[0])
    ##    print(data[1])
    time_interval = data[1][0] - data[0][0]  # consistent time interval
    print("time interval: ", time_interval)

    lst2 = [item[1] for item in data]
    n_segments = len(lst2)

    print("original data size", len(lst2))
    alphabet_size_avg = 20

    #generate sample data
    sample_size = 20
    ##    sample_url = "http://localhost:8086/query?db="+dbname+\
    ##                 "&epoch=ms&q=SELECT+sample%28%22degrees%22%2C" + str(sample_size) +\
    ##                 "%29+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546329600000ms+and+time+%3C%3D+1546329900000ms"
    # test3 sample (sin pattern): SELECT sample("degrees", sample_size) over the same range
    sample_url = "http://localhost:8086/query?db=" + dbname + \
                 "&epoch=ms&q=SELECT+sample%28%22degrees%22%2C" + str(sample_size) + \
                 "%29+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1546355705400ms+and+time+%3C%3D+1548969305400ms"

    ##    sample_url = "http://localhost:8086/query?db=" + dbname + \
    ##                 "&epoch=ms&q=SELECT+sample%28%22degrees%22%2C" + str(sample_size) +\
    ##                 "%29+FROM+%22h2o_temperature%22+WHERE+time+%3E%3D+1439856000000ms+and+time+%3C%3D+1442612520000ms+and%28%22location%22+%3D+%27santa_monica%27%29"

    r2 = requests.get(sample_url)
    json_dict2 = json.loads(r2.content)
    sampled_data = json_dict2["results"][0]["series"][0]["values"]  # [[time, value], ...]

    print("sample length")
    print(len(sampled_data))

    sample = [item[1] for item in sampled_data]  #[value,...]

    # Fill the gaps in the sampled data with a piecewise-linear model.
    start_x = data[0][0]
    end_x = data[-1][0]
    current_x = start_x
    current_loc = 0

    slope = (sampled_data[current_loc][1] - sampled_data[current_loc+1][1]) \
            / (sampled_data[current_loc][0] - sampled_data[current_loc+1][0])
    intercept = sampled_data[current_loc][1] - slope * sampled_data[current_loc][0]

    sample_fit = []
    end_sample_x = sampled_data[-1][0]

    while current_x <= end_sample_x:
        # Advance to the next sample segment once current_x passes it,
        # then refit the local line.
        if current_x >= sampled_data[current_loc+1][0] and current_loc+1 < len(sampled_data)-1:
            current_loc += 1
            slope = (sampled_data[current_loc][1] - sampled_data[current_loc+1][1]) \
                    / (sampled_data[current_loc][0] - sampled_data[current_loc+1][0])
            intercept = sampled_data[current_loc][1] - slope * sampled_data[current_loc][0]

        sample_fit.append([current_x, slope * current_x + intercept])
        current_x += time_interval  # step at the original sampling interval

    # Chop the original data to the time span covered by the linear fit.
    chopped_data = []
    for item in data:
        if item[0] >= sample_fit[0][0] and item[0] <= sample_fit[-1][0]:
            chopped_data.append(item)
    print("lengths (fit, chopped):", len(sample_fit), len(chopped_data))
    chopped_lst2 = [item[1] for item in chopped_data]
    chopped_len = len(chopped_lst2)

    # Build a SAX model and discretize the chopped original data
    # (n_segments = chopped_len, i.e. one symbol per point).
    sax = SymbolicAggregateApproximation(n_segments=chopped_len,
                                         alphabet_size_avg=alphabet_size_avg)
    scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
    sdb = scaler.fit_transform(chopped_lst2)
    s3 = sax.fit_transform(sdb)

    # Discretize the linear-fit sampled data with the same SAX model.
    sample_fit_extract = [item[1] for item in sample_fit]
    fit_sample_data = scaler.fit_transform(sample_fit_extract)
    s4 = sax.transform(fit_sample_data)

    # Compute the SAX distance between the two datasets as the similarity measure.
    dist = sax.distance_sax(s3[0], s4[0])
    print("distance:", dist)
    print("normalized distance:", dist / chopped_len)

    # Plot the three datasets: linear fit, raw samples, original values.
    plot(sample_fit, sampled_data, lst2)
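
The slope/intercept loop above is hand-rolled piecewise-linear interpolation; numpy.interp performs essentially the same fill in one call. A sketch, assuming sampled_data is the time-ordered [[time_ms, value], ...] list fetched above:

import numpy as np

def linear_fill(sampled_data, time_interval):
    # numpy.interp expects increasing x-coordinates without duplicates.
    xp = [p[0] for p in sampled_data]  # sample timestamps
    fp = [p[1] for p in sampled_data]  # sample values
    xs = np.arange(xp[0], xp[-1] + 1, time_interval)
    return [[x, y] for x, y in zip(xs, np.interp(xs, xp, fp))]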