def test_shapelets():
    """Check ``ShapeletModel`` fitting, prediction, CV and shapelet export.

    Skipped when keras is not importable. A small seeded random dataset is
    fitted for one iteration and the first learned shapelet and the training
    predictions are compared with hard-coded reference values. A second model
    with two shapelet sizes verifies that ``shapelets_`` matches
    ``shapelets_as_time_series_`` once NaNs are removed.
    """
    pytest.importorskip('keras')
    from tslearn.shapelets import ShapeletModel

    n_series, length, n_dims = 15, 10, 2
    rng = np.random.RandomState(0)
    # Keep the draw order (series first, labels second): the hard-coded
    # expectations below depend on the seeded random stream.
    series = rng.randn(n_series, length, n_dims)
    labels = rng.randint(2, size=n_series)

    classifier = ShapeletModel(
        n_shapelets_per_size={2: 5},
        max_iter=1,
        verbose=0,
        optimizer="sgd",
        random_state=0,
    )
    classifier.fit(series, labels)

    reference_shapelet = np.array([[0.56373, 0.494684],
                                   [1.235707, 1.119235]])
    np.testing.assert_allclose(classifier.shapelets_[0],
                               reference_shapelet,
                               atol=1e-2)

    predicted = classifier.predict(series)
    reference_labels = np.array(
        [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0])
    np.testing.assert_allclose(predicted, reference_labels)

    # Only checks that the estimator can be cross-validated without error.
    cross_validate(classifier, series, labels, cv=2)

    multi_size_model = ShapeletModel(n_shapelets_per_size={3: 2, 4: 1},
                                     max_iter=1)
    multi_size_model.fit(series, labels)
    paired = zip(multi_size_model.shapelets_,
                 multi_size_model.shapelets_as_time_series_)
    for raw_shapelet, padded_shapelet in paired:
        unpadded = to_time_series(padded_shapelet, remove_nans=True)
        np.testing.assert_allclose(raw_shapelet, unpadded)
def test_shapelets():
    """Check ``ShapeletModel`` fitting, prediction and cross-validation.

    A small seeded random dataset is fitted for one iteration; the first
    learned shapelet and the training predictions are compared with
    hard-coded reference values, then the estimator is run through
    ``cross_validate``.
    """
    n_series, length, n_dims = 15, 10, 2
    rng = np.random.RandomState(0)
    # Keep the draw order (series first, labels second): the hard-coded
    # expectations below depend on the seeded random stream.
    series = rng.randn(n_series, length, n_dims)
    labels = rng.randint(2, size=n_series)

    classifier = ShapeletModel(
        n_shapelets_per_size={2: 5},
        max_iter=1,
        verbose=0,
        optimizer="sgd",
        random_state=0,
    )
    classifier.fit(series, labels)

    reference_shapelet = np.array([[0.56373, 0.494684],
                                   [1.235707, 1.119235]])
    np.testing.assert_allclose(classifier.shapelets_[0],
                               reference_shapelet,
                               atol=1e-2)

    predicted = classifier.predict(series)
    reference_labels = np.array(
        [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0])
    np.testing.assert_allclose(predicted, reference_labels)

    # Imported here, right before use, exactly as in the original test.
    from sklearn.model_selection import cross_validate
    cross_validate(classifier, series, labels, cv=2)
# Derive the shapelet-size -> shapelet-count mapping the way the original
# paper does.
shapelet_sizes = grabocka_params_to_shapelet_size_dict(
    n_ts=n_ts,
    ts_sz=ts_sz,
    n_classes=n_classes,
    l=0.1,
    r=2,
)

# Build the model with the parameters provided by the authors (except that
# fewer iterations are used here), then train it and score the test split.
shp_clf = ShapeletModel(
    n_shapelets_per_size=shapelet_sizes,
    optimizer=Adagrad(lr=.1),
    weight_regularizer=.01,
    max_iter=200,
    verbose=0,
)
shp_clf.fit(X_train, y_train)
predicted_labels = shp_clf.predict(X_test)
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# One subplot per shapelet size, each showing every learned shapelet of
# that size.
plt.figure()
for i, sz in enumerate(shapelet_sizes.keys()):
    plt.subplot(len(shapelet_sizes), 1, i + 1)
    plt.title("%d shapelets of size %d" % (shapelet_sizes[sz], sz))
    for shp in shp_clf.shapelets_:
        if ts_size(shp) != sz:
            continue
        plt.plot(shp.ravel())
    # Same x-range on every panel so shapelets of different sizes are
    # visually comparable.
    plt.xlim([0, max(shapelet_sizes.keys()) - 1])

plt.tight_layout()
plt.show()

# The loss history is accessible via the `model` attribute that is a keras
def classify_with_shapelets():
    """Leave-one-speaker-out shapelet classification on raw pitch frames.

    Command-line arguments (``sys.argv``):
        1: feature path (read but unused in this function)
        2: label type (read but unused in this function)
        3: base directory under which ``regression/`` results are written
        4: data directory containing ``pitch.txt`` and ``energy.txt``

    For each shapelet length in (10, 25, 50) the raw pitch of every speaker
    is cut into frames with ``compute_shapelet_frame``. Ten repetitions of a
    leave-one-speaker-out loop over ``late_balanced`` then train a
    ``ShapeletModel`` and record the balanced accuracy of the pooled
    predictions. Finally a results file is written.

    NOTE(review): ``late_balanced``, ``len_feats``, ``acc_list``,
    ``acc_baseline``, ``tot_importance``, ``read_pitch_energy``,
    ``compute_shapelet_frame``, ``tqdm`` and ``balanced_accuracy_score`` are
    not defined here and are presumably module-level names -- confirm.
    NOTE(review): the source this block was recovered from had lost its
    indentation; the nesting of the loops and of the result-writing tail is
    the most plausible reading and should be checked against the original.
    """
    from keras.optimizers import Adagrad
    from tslearn.datasets import CachedDatasets
    from tslearn.preprocessing import TimeSeriesScalerMinMax
    from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict

    feat_path = sys.argv[1]
    label_type = sys.argv[2]
    results_base_path = sys.argv[3]
    data_path = sys.argv[4]

    pitch_path = os.path.join(data_path, 'pitch.txt')
    energy_path = os.path.join(data_path, 'energy.txt')

    # Raw pitch and energy
    raw_pitch, raw_energy = read_pitch_energy(pitch_path, energy_path)

    # Tunable Parameters = shapelet length, threshold, shapelet redundancy value
    # sweep shapelet length
    pitch_shapelet = {}
    energy_shapelet = {}
    for shapelet_len in [10, 25, 50]:
        for spkr in raw_pitch:
            # Compute shapelets from raw frames (i.e. no segmented info like
            # where phone/word is)
            pitch_shapelet[spkr] = compute_shapelet_frame(
                raw_pitch[spkr], shapelet_len, True)
            # energy_shapelet[spkr] = compute_shapelet_frame(raw_energy[spkr], shapelet_len)
            # pitch_shapelet[spkr] = np.array(raw_pitch[spkr])
            # print(len(raw_pitch[spkr]))
            # exit()

        acc = []
        for sim in range(10):
            y_true = []
            y_pred = []
            for spkr in tqdm(late_balanced.keys()):
                test_spkr = [spkr]
                # Build a real list of the remaining speakers. The original
                # called ``.remove`` on ``dict.keys()``, which is a view on
                # Python 3 and has no such method.
                train_spkrs = [s for s in late_balanced if s != spkr]

                X_train = np.array([
                    np.array(shapelet).reshape(shapelet_len, 1)
                    for x in train_spkrs
                    for shapelet in pitch_shapelet[x]
                ])
                y_train = np.array([
                    late_balanced[x]
                    for x in train_spkrs
                    for shapelet in pitch_shapelet[x]
                ])
                X_test = np.array([
                    np.array(shapelet).reshape(shapelet_len, 1)
                    for x in test_spkr
                    for shapelet in pitch_shapelet[x]
                ])
                y_test = np.array([
                    late_balanced[x]
                    for x in test_spkr
                    for shapelet in pitch_shapelet[x]
                ])
                # print('train data', X_train.shape)
                # #print('train data first', X_train[0])
                # print('train label', y_train.shape)
                # exit()

                shapelet_sizes = grabocka_params_to_shapelet_size_dict(
                    n_ts=X_train.shape[0],
                    ts_sz=X_train.shape[1],
                    n_classes=len(set(y_train)),
                    l=0.1,
                    r=2)
                shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                                        optimizer=Adagrad(lr=.1),
                                        weight_regularizer=.01,
                                        max_iter=50,
                                        verbose_level=0)
                shp_clf.fit(X_train, y_train)

                predicted_locations = shp_clf.locate(X_test)
                print('predicted_locations.shape', predicted_locations.shape)
                # test_ts_id = 0
                # plt.figure()
                # plt.title("Example locations of shapelet matches (%d shapelets extracted)" % sum(shapelet_sizes.values()))
                # plt.plot(X_test[test_ts_id].ravel())
                # for idx_shp, shp in enumerate(shp_clf.shapelets_):
                #     t0 = predicted_locations[test_ts_id, idx_shp]
                #     plt.plot(np.arange(t0, t0 + len(shp)), shp, linewidth=2)
                # plt.tight_layout()
                # plt.savefig(test_ts_id+'_test.png', format='png')
                # exit()

                prediction = shp_clf.predict(X_test)
                prediction_prob = shp_clf.predict_proba(X_test)
                y_pred += prediction.tolist()
                y_true += y_test.tolist()

            ###After LOO
            # test_ts_id = 0
            # plt.figure()
            # plt.title("Example locations of shapelet matches (%d shapelets extracted)" % sum(shapelet_sizes.values()))
            # plt.plot(X_test[test_ts_id].ravel())
            # for idx_shp, shp in enumerate(shp_clf.shapelets_):
            #     t0 = predicted_locations[test_ts_id, idx_shp]
            #     plt.plot(np.arange(t0, t0 + len(shp)), shp, linewidth=2)
            # plt.tight_layout()
            # plt.savefig('test.png', format='png')
            local_acc = balanced_accuracy_score(y_true, y_pred)
            acc.append(local_acc)
        # print('acc', acc)
        # print('final acc', np.mean(acc))
        # print('final acc std', np.std(acc))

    regression_dir = os.path.join(results_base_path, 'regression')
    if not os.path.exists(regression_dir):
        os.makedirs(regression_dir)
    results_file = os.path.join(regression_dir,
                                'shapelet_' + str(len_feats) + '.txt')
    with open(results_file, 'w') as w:
        # w.write("Confusion Matrix\n")
        # # w.write(confusion_matrix(y_true, y_pred).tolist())
        # w.write('{}\n\n'.format(confusion_matrix(y_true, y_pred)))
        # NOTE(review): the accuracies computed above live in ``acc``, yet the
        # report uses ``acc_list`` / ``acc_baseline`` -- confirm this is
        # intended and not left over from another script.
        w.write('regression: {} ({})\n'.format(np.mean(acc_list),
                                               np.std(acc_list)))
        w.write('baseline: {} ({})'.format(np.mean(acc_baseline),
                                           np.std(acc_baseline)))
        w.write("\nFeature Importance\n")
        # Collapse each feature's importances to their mean, then list the
        # features from most to least important.
        for i in tot_importance:
            tot_importance[i] = np.mean(tot_importance[i])
        for i in sorted(tot_importance.items(),
                        key=operator.itemgetter(1),
                        reverse=True):
            w.write("{} = {}\n".format(i[0], i[1]))
class Shapelets():
    '''
        Convenience wrapper around a ShapeletModel classifier.

        Handles label encoding, min-max scaling of the input series, lazy
        construction of the underlying classifier, decoding of predicted
        probabilities and two matplotlib visualizations.
    '''

    def __init__(self,
                 epochs,
                 length,
                 num_shapelet_lengths,
                 num_shapelets,
                 learning_rate,
                 weight_regularizer,
                 batch_size=256,
                 optimizer=Adam):
        '''
            initialize shapelet hyperparameters

            hyperparameters:
                epochs                : number of training epochs
                length                : base shapelet length, expressed as fraction of length of time series
                num_shapelet_lengths  : number of different shapelet lengths
                num_shapelets         : number of unique shapelets to learn at each shapelet length,
                                        expressed as fraction of length of time series
                learning_rate         : learning rate of Keras optimizer
                weight_regularizer    : weight regularization used when fitting model
                batch_size            : batch size forwarded to the classifier
                optimizer             : optimizer class; instantiated here with lr=learning_rate
        '''
        self.epochs = epochs
        self.length = length
        self.num_shapelet_lengths = num_shapelet_lengths
        self.num_shapelets = num_shapelets
        self.weight_regularizer = weight_regularizer
        self.batch_size = batch_size
        self.optimizer = optimizer(lr=learning_rate)
        # Both are created lazily, once the series length is known.
        self.shapelet_sizes = None
        self.shapelet_clf = None
        self.encoder = LabelEncoder()

    def _ensure_classifier(self, series_length):
        '''
            build the shapelet classifier on first use (no-op afterwards)

            parameters:
                series_length : length of the time series the model will see;
                                shapelet sizes and counts are fractions of it
        '''
        if self.shapelet_clf is not None:
            return
        base_size = int(self.length * series_length)
        n_per_size = int(self.num_shapelets * series_length)
        # Sizes are integer multiples of the base size: base, 2*base, ...
        self.shapelet_sizes = {
            base_size * (sz_idx + 1): n_per_size
            for sz_idx in range(self.num_shapelet_lengths)
        }
        self.shapelet_clf = ShapeletModel(
            n_shapelets_per_size=self.shapelet_sizes,
            optimizer=self.optimizer,
            weight_regularizer=self.weight_regularizer,
            max_iter=self.epochs,
            batch_size=self.batch_size)

    def _prepare_validation(self, val_data):
        '''
            scale validation series and encode validation labels

            The labels are transformed with the encoder already fitted on the
            training labels, so both sets share one class-to-index mapping.

            parameters:
                val_data : (X_val, y_val) pair, or None
            returns:
                (X_val_scaled, y_val_encoded), or None when val_data is None
        '''
        if val_data is None:
            return None
        X_val_scaled = self.__ScaleData(val_data[0])
        y_val = self.encoder.transform(val_data[1])
        return (X_val_scaled, y_val)

    def clear_session(self):
        '''
            clear the session of the underlying classifier

            raises:
                ValueError if the classifier has not been created yet
        '''
        # Explicit check rather than assert: asserts are stripped under -O.
        if self.shapelet_clf is None:
            raise ValueError(
                "Cannot clear session that has not been initialized")
        self.shapelet_clf.clear_session()

    def load_model(self, series_length, labels, checkpoint):
        '''
            Load model from checkpoint into Shapelet classifier

            parameters:
                series_length : length of the time series the model expects
                labels        : labels used to fit the label encoder (their
                                distinct count sizes the output layer)
                checkpoint    : weights file passed to load_weights
        '''
        self._ensure_classifier(series_length)

        # first generate new model into which to load the weights
        self.encode(labels)
        self.shapelet_clf.generate_model(series_length,
                                         len(self.get_classes()))

        # load weights
        self.shapelet_clf.model.load_weights(checkpoint)

    def fit_transfer_model(self,
                           X_train,
                           y_train,
                           checkpoint,
                           nclasses_prior=2,
                           source_dir=None,
                           val_data=None):
        '''
            fit shapelet classifier starting from a previously saved model

            parameters:
                X_train        : training time series
                y_train        : training labels
                checkpoint     : checkpoint forwarded to the classifier
                nclasses_prior : number of classes of the prior model
                source_dir     : forwarded unchanged to the classifier
                val_data       : optional (X_val, y_val) pair
        '''
        # encode training labels first so validation labels reuse the mapping
        y_train = self.encode(y_train)

        # scale training and validation data to between 0 and 1
        X_train_scaled = self.__ScaleData(X_train)
        # NOTE(review): assumes the underlying classifier accepts None as
        # validation data when val_data is omitted -- confirm.
        validation = self._prepare_validation(val_data)

        self._ensure_classifier(X_train.shape[1])

        # fit shapelet classifier
        self.shapelet_clf.fit_transfer_model(X_train_scaled, y_train,
                                             nclasses_prior, checkpoint,
                                             source_dir, validation)

    def fit(self, X_train, y_train, source_dir=None, val_data=None):
        '''
            fit shapelet classifier on training data

            parameters:
                X_train    : training time series
                y_train    : training labels
                source_dir : forwarded unchanged to the classifier
                val_data   : optional (X_val, y_val) pair
        '''
        self._ensure_classifier(X_train.shape[1])

        # encode training labels first so validation labels reuse the mapping
        y_train = self.encode(y_train)

        # scale training and validation data to between 0 and 1
        X_train_scaled = self.__ScaleData(X_train)
        # NOTE(review): assumes the underlying classifier accepts None as
        # validation data when val_data is omitted -- confirm.
        validation = self._prepare_validation(val_data)

        # fit classifier
        self.shapelet_clf.fit(X_train_scaled, y_train, source_dir, validation)

    def __ScaleData(self, input_data):
        '''
            scale input data to range [0,1]

            parameters:
                input_data : input data to rescale
        '''
        return TimeSeriesScalerMinMax().fit_transform(input_data)

    def predict(self, X_test):
        '''
            classifications for time series in test data set

            parameters:
                X_test : test time series on which to predict classes

            returns: classifications for test data set
        '''
        X_test_scaled = self.__ScaleData(X_test)
        return self.shapelet_clf.predict(X_test_scaled)

    def predict_proba(self, X_test):
        '''
            class probabilities for time series in test data set

            parameters:
                X_test : test time series on which to predict classes

            returns: class probabilities for test data set
        '''
        X_test_scaled = self.__ScaleData(X_test)
        return self.shapelet_clf.predict_proba(X_test_scaled)

    def encode(self, categories):
        '''
            fit label encoder on input categories. returns transformed categories

            Note that every call re-fits the encoder, replacing any mapping
            learned earlier.
        '''
        self.encoder.fit(categories)
        return self.encoder.transform(categories)

    def decode(self, y_probs, p_threshold):
        '''
            decode prediction probabilities y_probs into prediction / confidence give p_threshold

            parameters:
                y_probs     : 2-D array of probabilities, one row per sample
                p_threshold : rows whose maximum probability does not exceed
                              this value are assigned encoded class 0

            returns: (decoded labels, confidence per sample)
        '''
        prob_max = np.amax(y_probs, axis=1)
        prediction_indices = prob_max > p_threshold
        y_pred = np.zeros(y_probs.shape[0])

        # reintepret confidence in binary case
        if y_probs.shape[1] == 1:
            y_pred[prediction_indices] = 1
            # Distance from the threshold, normalised by the distance between
            # the threshold and the predicted extreme (0 or 1), mapped to
            # [0.5, 1]. Divides by zero when p_threshold equals the predicted
            # label.
            confidence = (prob_max - p_threshold) / (y_pred - p_threshold)
            confidence = 0.5 + confidence / 2
        else:
            y_pred[prediction_indices] = np.argmax(
                y_probs, axis=1)[prediction_indices]
            confidence = prob_max
        y_pred = y_pred.astype(int)
        y_preds = self.encoder.inverse_transform(y_pred)

        return y_preds, confidence

    def get_classes(self):
        '''
            get original classes from encoder

            raises:
                ValueError if the encoder attribute is None
        '''
        # Explicit check rather than assert: asserts are stripped under -O.
        if self.encoder is None:
            raise ValueError("Encoder has not been initialized")
        return self.encoder.classes_

    def VisualizeShapelets(self):
        '''
            visualize all of shapelets learned by shapelet classifier

            One subplot per shapelet size, each showing every shapelet of
            that size.
        '''
        plt.figure()
        for i, sz in enumerate(self.shapelet_sizes.keys()):
            plt.subplot(len(self.shapelet_sizes), 1, i + 1)
            plt.title("%d shapelets of size %d" %
                      (self.shapelet_sizes[sz], sz))
            for shapelet in self.shapelet_clf.shapelets_:
                if ts_size(shapelet) == sz:
                    plt.plot(shapelet.ravel())
            plt.xlim([0, max(self.shapelet_sizes.keys())])
        plt.tight_layout()
        plt.show()

    def VisualizeShapeletLocations(self,
                                   series_values,
                                   series_id,
                                   save_dir='visualizations',
                                   name='shp_1'):
        '''
            visualize shapelets superimposed on one of the test series

            parameters:
                series_values : raw values on which to visualize shapelets
                                (3-D array; one series is selected by series_id)
                series_id     : id of time series to visualize
                save_dir      : directory in which to save visualizations
                                (unused while saving is commented out)
                name          : name under which to save viz (bc unique every time)
                                (unused while saving is commented out)
        '''
        # NOTE(review): newer matplotlib releases renamed the seaborn styles;
        # confirm these style names are valid for the pinned version.
        plt.style.use("seaborn-whitegrid")

        # NK brand colors
        COLORS = [
            "#FA5655", "#F79690", "#B9BC2D", "#86B6B2", "#955B99", "#252B7A"
        ]
        # others? "#8D6B2C",
        # "#D0A826",
        # "#FEDB03",
        # "#000000",
        # "#454545",
        # "#FFFFFF",
        # "#F8F6F1"]

        _, n_cols, _ = series_values.shape
        test_series = series_values[series_id].reshape(-1, )
        shapelets = self.shapelet_clf.shapelets_

        closest_inds = self.shapelet_clf.locate(
            test_series.reshape(1, -1, 1))[0]
        # Flatten each shapelet before subtracting: a (sz, 1) shapelet would
        # otherwise broadcast against the (sz,) window into an (sz, sz) matrix.
        closest_dists = np.array([
            np.linalg.norm(test_series[ind:ind + shp.shape[0]] -
                           shp.reshape(-1))
            for ind, shp in zip(closest_inds, shapelets)
        ])

        # convert distance to weight where dist=0 -> wt=1 and dist=inf -> wt=0
        sl_weights = 1 / (1 + closest_dists)

        # plot the signal with matching shapelets color overlayed
        plt.clf()
        plt.plot(range(n_cols), test_series, color="k")
        # zip stops at the shortest input, so at most len(COLORS) shapelets
        # are overlaid.
        for ind, sl, wt, color in zip(closest_inds, shapelets, sl_weights,
                                      COLORS):
            # find closest match
            t = range(ind, ind + sl.shape[0])
            match = test_series[ind:ind + sl.shape[0]]
            # plot shapelet on top of signal with width and alpha set by
            # shapelet weight; alpha is capped because matplotlib only
            # accepts values in [0, 1]
            plt.plot(t,
                     match,
                     alpha=min(1.0, 7 * wt),
                     linewidth=35 * wt,
                     color=color)
        plt.ylabel('Email Density')
        plt.xlabel('Minute of the Hour')
        plt.show()
        #plt.savefig(save_dir + "/{}_signal_size_{}_id_{}.png".format(name, n_cols, series_id))

        # plot shapelets
        plt.clf()

        # to plot the shapelets, switch to dark background
        plt.style.use("seaborn-darkgrid")

        # arrange shapelets in grid - find greatest proper factor of
        # n_shapelets; starting at 1 keeps a single shapelet from producing a
        # zero-row grid
        shp_t = self.shapelet_clf.shapelets_as_time_series_
        n_shapelets = shapelets.shape[0]
        gf = 1
        for i in range(1, n_shapelets):
            if n_shapelets % i == 0:
                gf = i
        of = n_shapelets // gf

        for i in range(shp_t.shape[0]):
            ax_i = plt.subplot(gf, of, i + 1)
            # we could force them to share the same axes
            # ax_i = plt.subplot(n_rows, n_cols, i + 1, sharex=ax, sharey=ax)
            #ax_i.set_xticklabels([])
            ax_i.set_yticklabels([])
            # x-range follows this shapelet's own length so shapelets of
            # different sizes can be drawn
            values = shapelets[i].reshape(-1)
            plt.plot(range(len(values)),
                     values,
                     color=COLORS[i % len(COLORS)],
                     linewidth=3)
            plt.xlabel('Shapelet Length')
        plt.show()