Example 1
        def cost(shapelet):
            # Nested helper: timeseries, labels and metric are taken from the
            # scope of the enclosing extract-style method
            L = []  # Orderline of (distance, label) pairs
            for k in range(len(timeseries)):
                D = timeseries[k, :]
                dist = other_util.sdist(shapelet, D)
                L.append((dist, labels[k]))
            return metric(L)
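
Because timeseries, labels and metric live in the enclosing scope, the candidate shapelet is the only argument. Below is a minimal self-contained sketch of the same pattern; the sliding-window Euclidean distance and the simple orderline statistic are stand-ins for other_util.sdist and the real metric, not the library's implementations.

import numpy as np

def sliding_sdist(shapelet, series):
    # Smallest Euclidean distance between the shapelet and any window of the
    # series with the same length (stand-in for other_util.sdist)
    l = len(shapelet)
    return min(np.linalg.norm(series[i:i + l] - shapelet)
               for i in range(len(series) - l + 1))

def make_cost(timeseries, labels, metric):
    # Bind the dataset so that, like the snippet above, the returned function
    # only needs the candidate shapelet
    def cost(shapelet):
        L = [(sliding_sdist(shapelet, D), y) for D, y in zip(timeseries, labels)]
        return metric(L)
    return cost

ts = np.random.randn(10, 30)
labels = np.array([0] * 5 + [1] * 5)
cost = make_cost(ts, labels, metric=lambda L: np.std([d for d, _ in L]))
print(cost(ts[0, 5:15]))  # quality score of one candidate shapelet
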
Example 2
    def extract(self,
                timeseries,
                labels,
                min_len=None,
                max_len=None,
                nr_shapelets=1,
                metric=other_util.calculate_ig):
        if min_len is None:
            min_len = 4
        if max_len is None:
            max_len = timeseries.shape[1]

        shapelets = []
        for j in trange(len(timeseries), desc='timeseries', position=0):
            # We will extract shapelet candidates from S
            S = timeseries[j, :]
            for l in range(min_len, max_len):
                for i in range(len(S) - l + 1):
                    candidate = S[i:i + l]
                    # Compute distances to all other timeseries
                    L = []  # Orderline of (distance, label) pairs, needed to calculate entropy for IG
                    for k in range(len(timeseries)):
                        D = timeseries[k, :]
                        dist = other_util.sdist(candidate, D)
                        L.append((dist, labels[k]))
                    score = metric(L)
                    shapelets.append((list(candidate), list(score), [j, i, l]))

        # Sort the candidates from best to worst score (x[1:] compares scores
        # first, positions as a tie-break) and keep the top nr_shapelets
        shapelets = sorted(shapelets, key=lambda x: x[1:], reverse=True)
        best_shapelets = [x[0] for x in shapelets[:nr_shapelets]]
        return best_shapelets
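
The quality of each candidate is metric(L), which defaults to other_util.calculate_ig: the orderline of (distance, label) pairs is split at the threshold that yields the highest information gain. A self-contained sketch of that computation follows (the library version may additionally return tie-breaking values such as the separation gap):

import numpy as np
from collections import Counter

def entropy(labels):
    counts = np.array(list(Counter(labels).values()), dtype=float)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def information_gain(orderline):
    # Best information gain over all split points of a sorted orderline of
    # (distance, label) pairs
    orderline = sorted(orderline)
    labels = [y for _, y in orderline]
    total_entropy = entropy(labels)
    best = 0.0
    for split in range(1, len(orderline)):
        left, right = labels[:split], labels[split:]
        gain = total_entropy - (len(left) * entropy(left) +
                                len(right) * entropy(right)) / len(labels)
        best = max(best, gain)
    return best

# Toy orderline: class 0 lies close to the candidate, class 1 far away
L = [(0.1, 0), (0.2, 0), (0.3, 0), (1.1, 1), (1.3, 1), (1.4, 1)]
print(information_gain(L))  # 1.0 -- the candidate separates the classes perfectly
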
Example 3
    def extract(self,
                timeseries,
                labels,
                min_len=None,
                max_len=None,
                nr_shapelets=1,
                metric=other_util.calculate_ig):
        if min_len is None:
            min_len = 4
        if max_len is None:
            max_len = timeseries.shape[1]

        shapelets = []
        for j in trange(len(timeseries), desc='timeseries', position=0):
            S = timeseries[j, :]
            stats = {}
            # Pre-compute the metric arrays for the pair (S, timeseries[k]);
            # they let us compute the distance from any subsequence of S to
            # timeseries[k] in constant time
            for k in range(len(timeseries)):
                metrics = other_util.calculate_metric_arrays(
                    S, timeseries[k, :])
                stats[(j, k)] = metrics

            for l in range(min_len, max_len):
                # Keep a history to calculate an upper bound, this could
                # result in pruning, thus avoiding the construction of
                # the orderline L (which is an expensive operation)
                H = LRUCache(size=self.cache_size)
                for i in range(len(S) - l + 1):
                    if self.pruning:
                        # Check if we can prune: if an upper bound on the gain,
                        # computed from a cached orderline, cannot beat max_gain
                        # (assumed to hold the best gain found so far), the
                        # candidate is skipped
                        prune = False
                        for w in range(len(H.values)):
                            L_prime, S_prime = H.values[w]
                            R = other_util.sdist(S[i:i + l], S_prime)
                            if other_util.upper_ig(L_prime.copy(),
                                                   R) < max_gain:
                                prune = True
                                break
                        if prune: continue

                    # Extract a shapelet from S, start at index i with length l
                    L = []  # An orderline with distances to shapelet & labels
                    for k in range(len(timeseries)):
                        S_x, S_x2, S_y, S_y2, M = stats[(j, k)]
                        L.append(
                            (other_util.sdist_metrics(i, l, S_x, S_x2, S_y,
                                                      S_y2, M), labels[k]))
                    score = metric(L)
                    shapelets.append(
                        ([list(S[i:i + l])] + list(score) + [j, i, l]))

                    if self.pruning:
                        H.put((L, S[i:i + l]))

        shapelets = sorted(shapelets, key=lambda x: x[1:], reverse=True)
        best_shapelets = [x[0] for x in shapelets[:nr_shapelets]]
        return best_shapelets
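
The speed-up over the brute-force version comes from calculate_metric_arrays and sdist_metrics: cumulative sums (and cross-products) are computed once per pair of series, after which the statistics of any window, and hence the distance for any start index i and length l, can be read off in constant time. Here is a sketch of the cumulative-sum half of that idea; the real helpers also keep cross-product arrays so the normalized distance itself is O(1).

import numpy as np

def cumulative_arrays(y):
    # Prefix sums of y and y**2; S_y[j] is the sum of the first j values
    S_y = np.concatenate(([0.0], np.cumsum(y)))
    S_y2 = np.concatenate(([0.0], np.cumsum(y ** 2)))
    return S_y, S_y2

def window_mean_std(S_y, S_y2, j, l):
    # Mean and standard deviation of y[j:j+l] in O(1) from the prefix sums
    s = S_y[j + l] - S_y[j]
    s2 = S_y2[j + l] - S_y2[j]
    mean = s / l
    var = max(s2 / l - mean ** 2, 0.0)
    return mean, np.sqrt(var)

y = np.random.randn(100)
S_y, S_y2 = cumulative_arrays(y)
print(window_mean_std(S_y, S_y2, j=10, l=20))
print(y[10:30].mean(), y[10:30].std())  # identical, computed the direct way
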
Example 4
    def extract(self, timeseries, labels, min_len=None, max_len=None, 
                nr_shapelets=1, metric=other_util.calculate_ig):
        if min_len is None:
            min_len = self.sax_length
        if max_len is None:
            max_len = timeseries.shape[1]

        unique_classes = set(labels)
        classes_cntr = Counter(labels)

        shapelets = []
        for l in trange(min_len, max_len, desc='length', position=0):
            # To select the candidates, all subsequences of length l from
            # all time series are created using the sliding window technique,
            # and their corresponding SAX words are stored in sax_words
            sax_words = np.zeros((
                len(timeseries), 
                timeseries.shape[1] - l + 1,
                self.sax_length
            ))
            for ts_idx, ts in enumerate(timeseries):
                # Extract all possible subseries, by using a sliding window
                # with shift=1
                subseries = []
                for k in range(len(ts) - l + 1):
                    subseries.append(other_util.z_norm(ts[k:k+l]))
                # Transform all the subseries and add them to the sax_words
                transformed_timeseries = transform(subseries, self.sax_length, 
                                                   self.alphabet_size)
                sax_words[ts_idx] = transformed_timeseries
            
            score_table = self._create_score_table(sax_words, labels, 
                                                   iterations=self.iterations,
                                                   mask_size=self.mask_size)
            max_score_table = np.ones_like(score_table)
            for c in unique_classes:
                max_score_table[:, :, c] = classes_cntr[c] * self.iterations
            rev_score_table = max_score_table - score_table

            # TODO: Can we replace this simple power calculation by a more
            # powerful metric to heuristically measure the quality
            power = []
            for ts_idx in range(score_table.shape[0]):
                for sax_idx in range(score_table.shape[1]):
                    min_val, max_val = float('inf'), float('-inf')
                    total = 0
                    for class_idx in range(score_table.shape[2]):
                        score = score_table[ts_idx, sax_idx, class_idx]
                        rev_score = rev_score_table[ts_idx, sax_idx, class_idx]
                        diff = score - rev_score
                        if diff > max_val:
                            max_val = diff
                        if diff < min_val:
                            min_val = diff
                        total += abs(diff)

                    v = (total-abs(max_val)-abs(min_val)) + abs(max_val-min_val)
                    power.append((v, (ts_idx, sax_idx)))
            
            top_candidates = sorted(power, key=lambda x: -x[0])[:self.nr_candidates]
            for score, (ts_idx, sax_idx) in top_candidates:
                candidate = timeseries[ts_idx][sax_idx:sax_idx+l]
                L = []  # The orderline, to calculate entropy
                for k in range(len(timeseries)):
                    D = timeseries[k, :]
                    dist = other_util.sdist(candidate, D)
                    L.append((dist, labels[k]))
                score = metric(L)
                shapelets.append(([list(candidate)] + list(score) + [ts_idx, sax_idx, l]))

        shapelets = sorted(shapelets, key=lambda x: x[1:], reverse=True)
        best_shapelets = [x[0] for x in shapelets[:nr_shapelets]]
        return best_shapelets
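
This extractor converts every subsequence into a SAX word via transform and scores the words with repeated random masking before computing exact distances only for the most promising candidates. Below is a sketch of the SAX conversion for a single subsequence, assuming the usual PAA-plus-Gaussian-breakpoints construction; the library's transform may differ in details such as normalization or symbol encoding.

import numpy as np
from scipy.stats import norm

def sax_word(window, word_length, alphabet_size):
    # 1. z-normalize the window (small constant guards against zero std)
    window = (window - window.mean()) / (window.std() + 1e-8)
    # 2. PAA: average the window over word_length equally sized segments
    paa = np.array([seg.mean() for seg in np.array_split(window, word_length)])
    # 3. Discretize with breakpoints that cut the standard normal
    #    distribution into alphabet_size equiprobable regions
    breakpoints = norm.ppf(np.linspace(0, 1, alphabet_size + 1)[1:-1])
    return np.searchsorted(breakpoints, paa)  # symbols 0 .. alphabet_size-1

window = np.linspace(0, 1, 32)
print(sax_word(window, word_length=8, alphabet_size=4))  # [0 0 1 1 2 2 3 3] for this ramp
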