# Imports assumed by this excerpt; the module paths for other_util and the
# SAX transform come from the surrounding project and may differ:
from collections import Counter

import numpy as np
from tqdm import trange

import other_util
from sax import transform


def cost(shapelet):
    # Note: `timeseries`, `labels` and `metric` are free variables taken from
    # the enclosing scope; this is a closure used to score a single candidate.
    # Build the orderline: the subsequence distance from the shapelet to
    # every timeseries, paired with that series' label.
    L = []
    for k in range(len(timeseries)):
        D = timeseries[k, :]
        dist = other_util.sdist(shapelet, D)
        L.append((dist, labels[k]))
    return metric(L)
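
# `other_util.sdist` is not shown in this excerpt. It is assumed to be the
# usual subsequence distance: the minimum Euclidean distance between the
# z-normalized shapelet and every z-normalized window of D of the same
# length. A minimal sketch under that assumption:
def _sdist_sketch(shapelet, D):
    shapelet = other_util.z_norm(shapelet)
    l, best = len(shapelet), float('inf')
    # Slide the shapelet over D and keep the smallest distance
    for i in range(len(D) - l + 1):
        window = other_util.z_norm(D[i:i + l])
        best = min(best, np.linalg.norm(window - shapelet))
    return best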

def extract(self, timeseries, labels, min_len=None, max_len=None,
            nr_shapelets=1, metric=other_util.calculate_ig):
    if min_len is None:
        min_len = 4
    if max_len is None:
        max_len = timeseries.shape[1]

    shapelets = []
    for j in trange(len(timeseries), desc='timeseries', position=0):
        # We will extract shapelet candidates from S
        S = timeseries[j, :]
        for l in range(min_len, max_len):
            for i in range(len(S) - l + 1):
                candidate = S[i:i + l]
                # Compute distances to all other timeseries
                L = []  # The orderline, to calculate entropy; only for IG
                for k in range(len(timeseries)):
                    D = timeseries[k, :]
                    dist = other_util.sdist(candidate, D)
                    L.append((dist, labels[k]))
                score = metric(L)
                shapelets.append((list(candidate), list(score), [j, i, l]))

    shapelets = sorted(shapelets, key=lambda x: x[1:], reverse=True)
    best_shapelets = [x[0] for x in shapelets[:nr_shapelets]]
    return best_shapelets
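
# Hypothetical usage of the brute-force extract above. The enclosing
# extractor class is not part of this excerpt, so the name is assumed:
#
#   X = np.random.randn(20, 50)           # 20 series of length 50
#   y = np.random.randint(0, 2, size=20)  # binary labels
#   top3 = BruteForceExtractor().extract(X, y, min_len=5, max_len=10,
#                                        nr_shapelets=3)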

def extract(self, timeseries, labels, min_len=None, max_len=None,
            nr_shapelets=1, metric=other_util.calculate_ig):
    if min_len is None:
        min_len = 4
    if max_len is None:
        max_len = timeseries.shape[1]

    shapelets = []
    for j in trange(len(timeseries), desc='timeseries', position=0):
        S = timeseries[j, :]
        stats = {}
        # Pre-compute all metric arrays, which will allow us to
        # calculate the distance between two timeseries in constant time
        for k in range(len(timeseries)):
            metrics = other_util.calculate_metric_arrays(S, timeseries[k, :])
            stats[(j, k)] = metrics

        for l in range(min_len, max_len):
            # Keep a history to calculate an upper bound; this can result
            # in pruning, thus avoiding the construction of the orderline L
            # (which is an expensive operation)
            H = LRUCache(size=self.cache_size)
            # Best gain found so far at this length; used as the pruning
            # threshold below
            max_gain = float('-inf')
            for i in range(len(S) - l + 1):
                if self.pruning:
                    # Check if we can prune this candidate
                    prune = False
                    for w in range(len(H.values)):
                        L_prime, S_prime = H.values[w]
                        R = other_util.sdist(S[i:i + l], S_prime)
                        if other_util.upper_ig(L_prime.copy(), R) < max_gain:
                            prune = True
                            break
                    if prune:
                        continue

                # Extract a shapelet from S, starting at index i with length l
                L = []  # An orderline with distances to shapelet & labels
                for k in range(len(timeseries)):
                    S_x, S_x2, S_y, S_y2, M = stats[(j, k)]
                    L.append((other_util.sdist_metrics(i, l, S_x, S_x2,
                                                       S_y, S_y2, M),
                              labels[k]))

                score = metric(L)
                # Assumption: `metric` returns the information gain as its
                # first element
                max_gain = max(max_gain, score[0])
                shapelets.append([list(S[i:i + l])] + list(score) + [j, i, l])
                if self.pruning:
                    H.put((L, S[i:i + l]))

    shapelets = sorted(shapelets, key=lambda x: x[1:], reverse=True)
    best_shapelets = [x[0] for x in shapelets[:nr_shapelets]]
    return best_shapelets
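
# LRUCache is not defined in this excerpt. The pruning loop above only needs
# a bounded history exposing a `values` list and a `put` method, so a minimal
# sketch under that assumption is:
class LRUCache:
    def __init__(self, size=5):
        self.size = size
        self.values = []  # oldest entry first, newest last

    def put(self, value):
        # Drop the oldest entries once the cache reaches its size limit
        while len(self.values) >= self.size:
            self.values.pop(0)
        self.values.append(value)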

def extract(self, timeseries, labels, min_len=None, max_len=None,
            nr_shapelets=1, metric=other_util.calculate_ig):
    if min_len is None:
        min_len = self.sax_length
    if max_len is None:
        max_len = timeseries.shape[1]

    unique_classes = set(labels)
    classes_cntr = Counter(labels)

    shapelets = []
    for l in trange(min_len, max_len, desc='length', position=0):
        # To select the candidates, all subsequences of length l from
        # all time series are created using the sliding window technique,
        # and we create their corresponding SAX words and keep them in
        # sax_words
        sax_words = np.zeros((
            len(timeseries),
            timeseries.shape[1] - l + 1,
            self.sax_length
        ))
        for ts_idx, ts in enumerate(timeseries):
            # Extract all possible subseries, by using a sliding window
            # with shift=1
            subseries = []
            for k in range(len(ts) - l + 1):
                subseries.append(other_util.z_norm(ts[k:k+l]))
            # Transform all the subseries and add them to sax_words
            transformed_timeseries = transform(subseries, self.sax_length,
                                               self.alphabet_size)
            sax_words[ts_idx] = transformed_timeseries

        score_table = self._create_score_table(sax_words, labels,
                                               iterations=self.iterations,
                                               mask_size=self.mask_size)
        max_score_table = np.ones_like(score_table)
        for c in unique_classes:
            max_score_table[:, :, c] = classes_cntr[c] * self.iterations
        rev_score_table = max_score_table - score_table

        # TODO: Can we replace this simple power calculation by a more
        # powerful metric to heuristically measure the quality?
        power = []
        for ts_idx in range(score_table.shape[0]):
            for sax_idx in range(score_table.shape[1]):
                min_val, max_val = float('inf'), float('-inf')
                total = 0
                for class_idx in range(score_table.shape[2]):
                    score = score_table[ts_idx, sax_idx, class_idx]
                    rev_score = rev_score_table[ts_idx, sax_idx, class_idx]
                    diff = score - rev_score
                    if diff > max_val:
                        max_val = diff
                    if diff < min_val:
                        min_val = diff
                    total += abs(diff)
                v = (total - abs(max_val) - abs(min_val)) + abs(max_val - min_val)
                power.append((v, (ts_idx, sax_idx)))

        top_candidates = sorted(power, key=lambda x: -x[0])[:self.nr_candidates]
        for _, (ts_idx, sax_idx) in top_candidates:
            candidate = timeseries[ts_idx][sax_idx:sax_idx+l]
            L = []  # The orderline, to calculate entropy
            for k in range(len(timeseries)):
                D = timeseries[k, :]
                dist = other_util.sdist(candidate, D)
                L.append((dist, labels[k]))
            score = metric(L)
            shapelets.append([list(candidate)] + list(score) +
                             [ts_idx, sax_idx, l])

    shapelets = sorted(shapelets, key=lambda x: x[1:], reverse=True)
    best_shapelets = [x[0] for x in shapelets[:nr_shapelets]]
    return best_shapelets
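
# _create_score_table is likewise not shown. The surrounding code matches the
# random-masking step of Fast Shapelets (Rakthanmanon & Keogh, 2013): in each
# iteration, mask_size symbol positions of every SAX word are dropped,
# colliding masked words are bucketed together, and each word accumulates
# per-class counts of the series it collides with. A sketch under that
# reading (labels assumed to be integers 0..n_classes-1, matching the class
# indexing of max_score_table above):
from collections import defaultdict

def _create_score_table_sketch(sax_words, labels, iterations=10, mask_size=3):
    n_ts, n_words, word_len = sax_words.shape
    labels = np.asarray(labels)
    n_classes = len(set(labels))
    score_table = np.zeros((n_ts, n_words, n_classes))
    for _ in range(iterations):
        # Symbol positions that survive this iteration's random mask
        kept = np.sort(np.random.choice(word_len, word_len - mask_size,
                                        replace=False))
        # Bucket words by their masked form; remember which series hold them
        buckets = defaultdict(set)
        for ts_idx in range(n_ts):
            for word_idx in range(n_words):
                buckets[tuple(sax_words[ts_idx, word_idx, kept])].add(ts_idx)
        # Each word scores, per class, how many series of that class
        # produced a colliding masked word in this iteration
        for ts_idx in range(n_ts):
            for word_idx in range(n_words):
                for hit in buckets[tuple(sax_words[ts_idx, word_idx, kept])]:
                    score_table[ts_idx, word_idx, labels[hit]] += 1
    return score_table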