Example #1
0
def test_sdist_new():
    """sdist_new must report (near-)zero distance when a subsequence is
    matched against the series it was taken from, at its own offset."""
    # Two identical short sequences -> distance 0 at offset 0.
    seq = list(range(5))
    seq_copy = seq.copy()
    precomputed = util.calculate_stats(seq, seq_copy)
    np.testing.assert_almost_equal(
        util.sdist_new(seq, seq_copy, 0, precomputed), 0)

    # A slice of a random series, aligned at the offset it was cut
    # from -> distance 0 as well.
    np.random.seed(1337)
    series = np.random.random(100)
    precomputed = util.calculate_stats(series, series)
    np.testing.assert_almost_equal(
        util.sdist_new(series[50:], series, 50, precomputed), 0)
Example #2
0
def fast_shapelet_discovery(timeseries,
                            labels,
                            m=None,
                            min_len=1,
                            max_len=None):
    """Exhaustively search all subsequences of every timeseries for the
    shapelet that best splits the dataset by label.

    Parameters
    ----------
    timeseries : sequence of 1-D arrays
        The training series.
    labels : sequence
        Class labels, parallel to ``timeseries``.
    m : int, optional
        Upper bound on usable subsequence length; defaults to the length
        of the shortest series.
    min_len, max_len : int, optional
        Range of candidate shapelet lengths (``max_len`` defaults to ``m``).

    Returns
    -------
    tuple
        ``(best_shapelet, best_distance, best_L, max_gain)`` where
        ``best_L`` is the sorted list of (distance, label) pairs for the
        winning shapelet and ``best_distance`` its split threshold.
    """
    if m is None:
        m = np.min([len(x)
                    for x in timeseries])  # Maximum length of a timeserie
    if max_len is None:
        max_len = m
    # NOTE(review): max_gain starts at 0 and a candidate is accepted when
    # best_ig < max_gain, so find_best_split_point must return scores on a
    # scale where smaller is better -- confirm against its implementation.
    max_gain, max_gap = 0, 0
    best_shapelet, best_distance, best_L = None, None, None
    cntr = 0
    for ts, label in zip(timeseries, labels):
        print(cntr, '/', len(timeseries))
        cntr += 1

        # Pre-compute cumulative statistics of ts against every series
        # once; sdist_new reuses them for every candidate window of ts.
        stats = {}
        for i, ts2 in enumerate(timeseries):
            stats[i] = util.calculate_stats(ts, ts2)

        for l in range(min_len, max_len + 1):  # Possible shapelet lengths
            H = []  # Cache of (ordered distance list, shapelet) pairs
            for i in range(len(ts) - l):  # Possible start positions
                # Pruning: if an upper bound on this candidate's gain
                # (derived from a cached shapelet) cannot beat the best
                # so far, skip the expensive distance computations.
                broken = False
                for (L, S) in H:
                    R = util.sdist(ts[i:i + l], S)
                    if util.upperIG(L, R, timeseries, labels) < max_gain:
                        broken = True
                        break  # Continue with next i

                if not broken:
                    L = []
                    for k, (ts2, label2) in enumerate(zip(timeseries, labels)):
                        L.append((util.sdist_new(ts[i:i + l], ts2, i,
                                                 stats[k]), label2))
                    L = sorted(L, key=lambda x: x[0])
                    best_ig, tau = find_best_split_point(L)
                    if best_ig < max_gain:
                        best_shapelet = ts[i:i + l]
                        max_gain = best_ig
                        best_L = L
                        best_distance = tau
                        print('---->', max_gain, best_distance)
                    H.append((L, ts[i:i + l]))

    return best_shapelet, best_distance, best_L, max_gain
Example #3
0
 def evaluate_z_norm_space(self, time_serie, proba=True):
     """Route `time_serie` down the shapelet tree using z-normalised
     distances; return the class-probability mapping (proba=True) or
     the single most probable class label (proba=False)."""
     # Leaf node: no split threshold stored, answer directly from the
     # stored class probabilities.
     if self.distance is None:
         if not proba:
             return max(self.class_probabilities.items(),
                        key=operator.itemgetter(1))[0]
         return self.class_probabilities
     # Internal node: measure the node's shapelet against the series
     # and descend left (close) or right (far).
     stats = calculate_stats(self.shapelet, time_serie)
     dist = sdist_new(self.shapelet, time_serie, 0, stats)
     branch = self.left if dist <= self.distance else self.right
     return branch.evaluate_z_norm_space(time_serie, proba=proba)
Example #4
0
    print(tree.shapelet)
    print(tree.distance)

    distances = []

    for ts, label in zip(timeseries, labels):
        d, idx = subsequence_dist(ts, tree.shapelet)
        distances.append((d, label))

    print([x for x in sorted(distances, key=lambda x: x[0])])

    distances = []

    for ts, label in zip(timeseries, labels):
        stats = calculate_stats(tree.shapelet, ts)
        d = sdist_new(tree.shapelet, ts, 0, stats)
        distances.append((d, label))

    print([x for x in sorted(distances, key=lambda x: x[0])])

    distances = []

    for ts, label in zip(timeseries, labels):
        stats = calculate_stats(tree.right.shapelet, ts)
        d = sdist_new(tree.right.shapelet, ts, 0, stats)
        distances.append((d, label))

    print([x for x in sorted(distances, key=lambda x: x[0])])

    # tree.populate_class_probs(timeseries[:-1], labels[:-1])
nr_timeseries = [5, 10, 25, 50, 100]

for nr_timeserie in nr_timeseries:
    for ts_length in ts_lengths:
        print('Timing the distance calculation for', str(nr_timeserie),
              'timeseries of length', str(ts_length))
        sdist_new_times = []
        sdist_new_overhead = []
        sdist_old_times = []
        timeseries, labels = generate_binary_classification_data(
            typical_characteristic, ts_length, nr_timeserie)
        for ts, label in tqdm(zip(timeseries, labels)):
            stats = {}
            start_time = time.time()
            for i, (ts2, label2) in enumerate(zip(timeseries, labels)):
                stats[tuple(ts2)] = util.calculate_stats(ts, ts2)
            sdist_new_overhead.append(time.time() - start_time)

            for l in range(1, ts_length + 1):
                for start in range(len(ts) - l):  # Possible start positions
                    new_dists = []
                    old_dists = []
                    for k, (ts2, label2) in enumerate(zip(timeseries, labels)):
                        start_time = time.time()
                        dist_new = util.sdist_new(ts[start:start + l], ts2,
                                                  start, stats[tuple(ts2)])
                        sdist_new_times.append(time.time() - start_time)
                        new_dists.append((k, dist_new))

                        start_time = time.time()
                        dist_old, idx_old = util.subsequence_dist(
Example #6
0
def test_calculate_stats():
    """Check util.calculate_stats against two reference implementations.

    calculate_stats(a, b) is supposed to return 5 arrays:
      - the cumulative sum of a (with a leading 0)
      - the cumulative sum of a, squared
      - the cumulative sum of b
      - the cumulative sum of b, squared
      - the matrix of sums of products of aligned elements of a and b

    The references are computed (1) with a deliberately slow O(n^2)
    manual method and (2) with dynamic programming, and all three are
    compared up to 7 decimals. Timings are printed for illustration.
    """
    # Generate two random vectors
    np.random.seed(1337)
    a = np.random.random(500)
    b = np.random.random(500)

    def naive_cumsums(v):
        """O(n^2) cumulative sum and cumulative sum of squares of v,
        each prefixed with a 0 entry (intentionally slow reference)."""
        cum, cum_sqr = [0], [0]
        for i in range(1, len(v) + 1):
            total = 0
            total_sqr = 0
            for x in v[:i]:
                total += x
                total_sqr += x ** 2
            cum.append(total)
            cum_sqr.append(total_sqr)
        return np.array(cum), np.array(cum_sqr)

    # Reference 1: manual, quadratic-time computation.
    start = time.time()
    s_x, s_x_sqr = naive_cumsums(a)
    s_y, s_y_sqr = naive_cumsums(b)

    # m_uv[u+1, v+1] = sum of products of the overlapping prefixes of a
    # and b when one series is shifted by t = |u - v|.
    m_uv = np.zeros((len(a) + 1, len(b) + 1))
    for u in range(len(a)):
        for v in range(len(b)):
            t = abs(u - v)
            if u > v:
                m_uv[u + 1, v + 1] = np.sum([a[i+t]*b[i] for i in range(v+1)])
            else:
                m_uv[u + 1, v + 1] = np.sum([a[i]*b[i+t] for i in range(u+1)])
    print('\nManual method takes', time.time() - start, 'seconds')

    # The implementation under test.
    start = time.time()
    s_x_2, s_x_sqr_2, s_y_2, s_y_sqr_2, m_uv_2 = util.calculate_stats(a, b)
    print('\nMethodology from original CPP code took', time.time() - start, 'seconds')

    # Reference 2: same matrix via dynamic programming (each cell extends
    # the diagonal predecessor by one product term).
    start = time.time()
    m_uv_3 = np.zeros((len(a) + 1, len(b) + 1))
    for u in range(len(a)):
        for v in range(len(b)):
            t = abs(u - v)
            if u > v:
                m_uv_3[u+1, v+1] = m_uv_3[u, v] + a[v+t]*b[v]
            else:
                m_uv_3[u+1, v+1] = m_uv_3[u, v] + a[u]*b[u+t]
    print('\nMethodology with dynamic programming took', time.time() - start, 'seconds')

    # Are they 'almost' equal (up to 7 decimals)
    np.testing.assert_almost_equal(s_x, s_x_2)
    np.testing.assert_almost_equal(s_x_sqr, s_x_sqr_2)
    np.testing.assert_almost_equal(s_y, s_y_2)
    np.testing.assert_almost_equal(s_y_sqr, s_y_sqr_2)
    np.testing.assert_almost_equal(m_uv, m_uv_2)
    np.testing.assert_almost_equal(m_uv_2, m_uv_3)