Ejemplo n.º 1
0
def fit(X, y, max_depth=None, min_samples_split=2, min_len=10, max_len=10):
    """Recursively grow a shapelet decision tree on labelled time series.

    A node is split only while depth budget remains (``max_depth`` is
    ``None`` or positive), it holds strictly more than ``min_samples_split``
    series, and its labels are not all identical. Otherwise a leaf is
    returned.

    Args:
        X: Sequence of time series.
        y: Labels aligned with ``X``.
        max_depth: Remaining depth budget; ``None`` means unlimited.
        min_samples_split: A node is split only when ``len(X)`` exceeds
            this value.
        min_len: Forwarded to ``fast_shapelet_discovery``.
        max_len: Forwarded to ``fast_shapelet_discovery``.

    Returns:
        The ``ShapeletTree`` node rooted at this call. Leaves have
        ``shapelet``, ``distance``, ``left`` and ``right`` set to ``None``;
        every node stores ``Counter(y)`` as ``class_probabilities``.
    """
    print('New node')

    def make_leaf():
        return ShapeletTree(right=None, left=None, shapelet=None, distance=None,
                            class_probabilities=Counter(y))

    can_split = (
        (max_depth is None or max_depth > 0)
        and len(X) > min_samples_split
        and len(np.unique(y)) > 1
    )
    if not can_split:
        return make_leaf()

    # TODO: pass the distance along with this shapelet so we don't need to recalculate this!
    shapelet = fast_shapelet_discovery(X, y, min_len=min_len, max_len=max_len)
    distance = check_candidate(X, y, shapelet)[1]

    # Series whose distance to the shapelet is at most the threshold go left.
    X_left, y_left, X_right, y_right = [], [], [], []
    for ts, label in zip(X, y):
        if subsequence_dist(ts, shapelet)[0] <= distance:
            X_left.append(ts)
            y_left.append(label)
        else:
            X_right.append(ts)
            y_right.append(label)

    # A threshold that sends every series to one side separates nothing.
    # Recursing would hand the very same data to the child, which never
    # terminates when max_depth is None, so stop with a leaf instead.
    if not X_left or not X_right:
        return make_leaf()

    node = ShapeletTree(right=None, left=None, shapelet=shapelet, distance=distance,
                        class_probabilities=Counter(y))

    new_depth = None if max_depth is None else max_depth - 1
    node.left = fit(X_left, y_left, max_depth=new_depth, min_samples_split=min_samples_split,
                    min_len=min_len, max_len=max_len)
    node.right = fit(X_right, y_right, max_depth=new_depth, min_samples_split=min_samples_split,
                     min_len=min_len, max_len=max_len)
    return node
Ejemplo n.º 2
0
    def increment_class_probs(self, ts, label):
        """Add one observation of ``label`` along the path ``ts`` takes.

        The count for ``label`` is bumped on this node, then the call is
        forwarded to the child that ``ts`` is routed to. Nodes whose
        ``distance`` is ``None`` do not forward.
        """
        counts = self.class_probabilities
        counts[label] = counts.get(label, 0) + 1

        # No threshold on this node: nothing below to update.
        if self.distance is None:
            return

        # Distances up to and including the threshold go left.
        dist, _ = subsequence_dist(ts, self.shapelet)
        child = self.left if dist <= self.distance else self.right
        child.increment_class_probs(ts, label)
Ejemplo n.º 3
0
 def evaluate(self, time_serie, proba=True):
     """Route ``time_serie`` down the tree and report the reached node.

     With ``proba`` true the reached node's ``class_probabilities`` mapping
     is returned as-is; otherwise the label with the highest count in that
     mapping is returned.
     """
     if self.distance is not None:
         # Internal node: distances up to and including the threshold go left.
         dist, _ = subsequence_dist(time_serie, self.shapelet)
         branch = self.left if dist <= self.distance else self.right
         return branch.evaluate(time_serie, proba=proba)

     if proba:
         return self.class_probabilities

     best_label, _ = max(self.class_probabilities.items(),
                         key=operator.itemgetter(1))
     return best_label
Ejemplo n.º 4
0
def check_candidate(timeseries,
                    labels,
                    shapelet,
                    min_prune_length=20,
                    best_ig=None):
    """Score ``shapelet`` as a split candidate over the labelled series.

    Each series' distance to the shapelet is paired with its label, the
    pairs are ordered by ascending distance, and the ordered list is handed
    to ``find_best_split_point``, whose result is returned unchanged.

    NOTE(review): ``min_prune_length`` and ``best_ig`` are accepted but not
    used; the early-abandon pruning they were meant for is not implemented
    here.
    """
    scored = [
        (util.subsequence_dist(series, shapelet)[0], series_label)
        for series, series_label in zip(timeseries, labels)
    ]
    # Order on the distance only, so labels are never compared on ties.
    scored.sort(key=lambda pair: pair[0])
    return find_best_split_point(scored)
Ejemplo n.º 5
0
    def recalculate_distances(self, timeseries, labels):
        """Re-fit the split threshold of this node and its subtree.

        The node's shapelet is kept; only ``self.distance`` is replaced by
        the threshold ``check_candidate`` reports for the given data. The
        data is then partitioned with the new threshold and each part is
        passed on to the matching child. Nodes whose ``distance`` is
        ``None`` are left untouched.

        Args:
            timeseries: Series reaching this node.
            labels: Labels aligned with ``timeseries``.
        """
        if self.distance is None:
            print('leaf:', labels)
            return

        _, threshold = check_candidate(timeseries, labels, self.shapelet)
        print(threshold, self.distance)
        self.distance = threshold

        ts_left, labels_left = [], []
        ts_right, labels_right = [], []
        for ts, label in zip(timeseries, labels):
            dist, _ = subsequence_dist(ts, self.shapelet)
            # `<=` keeps a series that sits exactly on the threshold on the
            # left, the same side evaluate / increment_class_probs route it
            # to; a strict `<` would re-fit the wrong subtree with it.
            if dist <= self.distance:
                ts_left.append(ts)
                labels_left.append(label)
            else:
                ts_right.append(ts)
                labels_right.append(label)

        print(labels, 'are split into', labels_left, 'and', labels_right)

        self.left.recalculate_distances(ts_left, labels_left)
        self.right.recalculate_distances(ts_right, labels_right)
Ejemplo n.º 6
0
# assert np.array_equal(m_uv, m_uv_old)

# These two prints sit outside the __main__ guard, so they also run on import.
print(labels)
print(timeseries)

if __name__ == "__main__":
    # Ad-hoc check: compute series-to-shapelet distances with two different
    # routines and print both orderings for manual comparison.
    print('Fitting tree')
    tree = extract_shapelet(timeseries, labels)

    # The fitted object exposes the selected shapelet and a distance value
    # (presumably the split threshold -- confirm against extract_shapelet).
    print(tree.shapelet)
    print(tree.distance)

    distances = []

    # Pass 1: distance of every series to the shapelet via subsequence_dist.
    for ts, label in zip(timeseries, labels):
        d, idx = subsequence_dist(ts, tree.shapelet)
        distances.append((d, label))

    # (distance, label) pairs in ascending order of distance.
    print([x for x in sorted(distances, key=lambda x: x[0])])

    distances = []

    # Pass 2: the same pairs computed via calculate_stats + sdist_new.
    for ts, label in zip(timeseries, labels):
        stats = calculate_stats(tree.shapelet, ts)
        d = sdist_new(tree.shapelet, ts, 0, stats)
        distances.append((d, label))

    print([x for x in sorted(distances, key=lambda x: x[0])])

    distances = []
                stats[tuple(ts2)] = util.calculate_stats(ts, ts2)
            sdist_new_overhead.append(time.time() - start_time)

            for l in range(1, ts_length + 1):
                for start in range(len(ts) - l):  # Possible start positions
                    new_dists = []
                    old_dists = []
                    for k, (ts2, label2) in enumerate(zip(timeseries, labels)):
                        start_time = time.time()
                        dist_new = util.sdist_new(ts[start:start + l], ts2,
                                                  start, stats[tuple(ts2)])
                        sdist_new_times.append(time.time() - start_time)
                        new_dists.append((k, dist_new))

                        start_time = time.time()
                        dist_old, idx_old = util.subsequence_dist(
                            ts2, ts[start:start + l])
                        sdist_old_times.append(time.time() - start_time)
                        old_dists.append((k, dist_old))

                    new_dists = sorted(new_dists, key=lambda x: x[1])
                    old_dists = sorted(old_dists, key=lambda x: x[1])
                    print(new_dists)
                    print(old_dists)
                    np.testing.assert_equal([x[0] for x in new_dists][0],
                                            [x[0] for x in old_dists][0])

        print('New distance calculation took:',
              np.sum(sdist_new_times) + np.sum(sdist_new_overhead))
        print('New distance (overhead):', np.sum(sdist_new_overhead))
        print('Old distance calculation took:', np.sum(sdist_old_times))