def fit(self, x, y=None):
    clf = ShapeletModel(n_shapelets_per_size={self.input_span: self.state_num},
                        weight_regularizer=.01,
                        verbose_level=0)
    clf.fit(x, y)
    joblib.dump(clf, self.modelpath / 'states.m')
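# Counterpart sketch for restoring the persisted model: joblib.load is the
# standard inverse of joblib.dump. The method name `load` is assumed here,
# not taken from the original code.
def load(self):
    return joblib.load(self.modelpath / 'states.m')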
def cluster(timeseries_df, data_labels):
    Shapelet_list = []
    D = '/home/abhilash/Datasets/UCRArchive_2018/TwoLeadECG/TXT_Files/'
    for i in range(1, max(data_labels) + 1):
        print('Class', i)
        ts_df = timeseries_df[timeseries_df['0'] == i]
        ts_df = ts_df.reset_index(drop=True)
        labels = ts_df['0']
        ts_df = ts_df.drop(ts_df.columns[0], axis=1)
        S = 'class' + str(i) + 'labels.txt'
        pred_label = pd.read_csv(D + S, header=None)
        pred_label = np.ravel(np.array(pred_label))
        shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=ts_df.shape[0],
                                                               ts_sz=ts_df.shape[1],
                                                               n_classes=2,
                                                               l=0.36,
                                                               r=1)
        shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                                optimizer=Adagrad(lr=.1),
                                weight_regularizer=.01,
                                max_iter=50,
                                verbose=0)
        shp_clf.fit(ts_df, pred_label)
        shapelets = shp_clf.shapelets_
        temp_list = []
        # use `j` here so the class-loop variable `i` is not shadowed
        for j in range(shapelets.shape[0]):
            temp = shapelets[j].T
            temp_list.append(temp[0])
        Shapelet_list.append(temp_list)
    return Shapelet_list
def cluster(shape, timeseries_df, data_labels, k):
    Shapelet_list = []
    for i in range(1, max(data_labels) + 1):
        ts_df = timeseries_df[timeseries_df['0'] == i]
        ts_df = ts_df.reset_index(drop=True)
        # cluster_list.append(extractU_Shapelets(shape))
        labels = ts_df['0']
        ts_df = ts_df.drop(ts_df.columns[0], axis=1)
        S = uShapeletClustering.extractU_Shapelets.extract_Shapelets(ts_df.copy(), shape, k)
        S = np.array(list(S))
        pred_label = uShapeletClustering.Kmeans.Kmeans(ts_df.copy(), S, k, labels)
        shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=ts_df.shape[0],
                                                               ts_sz=ts_df.shape[1],
                                                               n_classes=2,
                                                               l=0.5,
                                                               r=1)
        shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                                optimizer=Adagrad(lr=.1),
                                weight_regularizer=.01,
                                max_iter=50,
                                verbose=0)
        shp_clf.fit(ts_df, pred_label)
        shapelets = shp_clf.shapelets_
        temp_list = []
        # use `j` here so the class-loop variable `i` is not shadowed
        for j in range(shapelets.shape[0]):
            temp = shapelets[j].T
            temp_list.append(temp[0])
        Shapelet_list.append(temp_list)
    return Shapelet_list
def load_model(self, series_length, labels, checkpoint):
    ''' Load model from checkpoint into Shapelet classifier '''
    if self.shapelet_clf is None:
        base_size = int(self.length * series_length)
        self.shapelet_sizes = {}
        for sz_idx in range(self.num_shapelet_lengths):
            shp_sz = base_size * (sz_idx + 1)
            self.shapelet_sizes[shp_sz] = int(self.num_shapelets * series_length)
        self.shapelet_clf = ShapeletModel(n_shapelets_per_size=self.shapelet_sizes,
                                          optimizer=self.optimizer,
                                          weight_regularizer=self.weight_regularizer,
                                          max_iter=self.epochs,
                                          batch_size=self.batch_size)
    # first generate new model into which to load the weights
    self.encode(labels)
    self.shapelet_clf.generate_model(series_length, len(self.get_classes()))
    # load weights
    self.shapelet_clf.model.load_weights(checkpoint)
def fit_transfer_model(self, X_train, y_train, checkpoint, nclasses_prior=2,
                       source_dir=None, val_data=None):
    # encode training and validation labels
    y_train = self.encode(y_train)
    y_val = self.encode(val_data[1])

    # scale training and validation data to between 0 and 1
    X_train_scaled = self.__ScaleData(X_train)
    X_val_scaled = self.__ScaleData(val_data[0])

    if self.shapelet_clf is None:
        base_size = int(self.length * X_train.shape[1])
        self.shapelet_sizes = {}
        for sz_idx in range(self.num_shapelet_lengths):
            shp_sz = base_size * (sz_idx + 1)
            self.shapelet_sizes[shp_sz] = int(self.num_shapelets * X_train.shape[1])
        self.shapelet_clf = ShapeletModel(n_shapelets_per_size=self.shapelet_sizes,
                                          optimizer=self.optimizer,
                                          weight_regularizer=self.weight_regularizer,
                                          max_iter=self.epochs,
                                          batch_size=self.batch_size)

    # fit shapelet classifier
    self.shapelet_clf.fit_transfer_model(X_train_scaled, y_train, nclasses_prior,
                                         checkpoint, source_dir, (X_val_scaled, y_val))
def test_shapelets():
    pytest.importorskip('keras')
    from tslearn.shapelets import ShapeletModel

    n, sz, d = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    y = rng.randint(2, size=n)
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer="sgd",
                        random_state=0)
    clf.fit(time_series, y)
    np.testing.assert_allclose(clf.shapelets_[0],
                               np.array([[0.56373, 0.494684],
                                         [1.235707, 1.119235]]),
                               atol=1e-2)
    np.testing.assert_allclose(clf.predict(time_series),
                               np.array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0]))

    cross_validate(clf, time_series, y, cv=2)

    model = ShapeletModel(n_shapelets_per_size={3: 2, 4: 1}, max_iter=1)
    model.fit(time_series, y)
    for shp, shp_bis in zip(model.shapelets_,
                            model.shapelets_as_time_series_):
        np.testing.assert_allclose(shp,
                                   to_time_series(shp_bis, remove_nans=True))
def learningShapeletClassifier(X_train, Y_train):
    shapelet_sizes = grabocka_params_to_shapelet_size_dict(
        n_ts=X_train.shape[0],
        ts_sz=X_train.shape[1],
        n_classes=len(set(Y_train)),
        l=0.1,
        r=2)
    shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                            optimizer=Adagrad(lr=.1),
                            weight_regularizer=.01,
                            max_iter=200,
                            verbose_level=0)
    shp_clf.fit(X_train, Y_train)
    return shp_clf
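# Illustrative call on synthetic data (shapes follow the tslearn convention of
# [n_ts, ts_sz, d]; the data is random and only meant to show usage, and the
# imports used by the function above are assumed to be in scope):
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(20, 40, 1)
Y_demo = rng.randint(2, size=20)
demo_clf = learningShapeletClassifier(X_demo, Y_demo)
print(demo_clf.predict(X_demo))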
def test_serialize_shapelets():
    def get_model_weights(model):
        return model.model_.get_weights()

    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    X = rng.randn(n, sz, d)

    for y in [rng.randint(low=0, high=3, size=n),
              rng.choice(["one", "two", "three"], size=n)]:
        shp = ShapeletModel(max_iter=1)
        _check_not_fitted(shp)
        shp.fit(X, y)
        _check_params_predict(shp, X, ['predict'],
                              check_params_fun=get_model_weights,
                              formats=["json", "pickle"])
def lts_discovery(X_train, y_train, X_test, y_test, nr_shap, l, r, reg, max_it,
                  shap_out_path, pred_out_path, timing_out_path):
    # Fit LTS model, print metrics on test-set, write away predictions and shapelets
    shapelet_dict = grabocka_params_to_shapelet_size_dict(
        X_train.shape[0], X_train.shape[1], int(nr_shap * X_train.shape[1]), l, r)
    clf = ShapeletModel(n_shapelets_per_size=shapelet_dict,
                        max_iter=max_it,
                        verbose_level=1,
                        batch_size=1,
                        optimizer='sgd',
                        weight_regularizer=reg)

    start = time.time()
    clf.fit(np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)), y_train)
    learning_time = time.time() - start

    print([len(x) for x in clf.shapelets_])
    print(clf.get_weights())
    print('Learning shapelets took {}s'.format(learning_time))

    with open(shap_out_path, 'w+') as ofp:
        for shap in clf.shapelets_:
            ofp.write(str(np.reshape(shap, (-1))) + '\n')

    with open(timing_out_path, 'w+') as ofp:
        ofp.write(str(learning_time))

    X_distances_train = clf.transform(X_train)
    X_distances_test = clf.transform(X_test)

    fit_lr(X_distances_train, y_train, X_distances_test, y_test, pred_out_path)
def test_shapelets():
    n, sz, d = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    y = rng.randint(2, size=n)
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer="sgd",
                        random_state=0)
    clf.fit(time_series, y)
    np.testing.assert_allclose(clf.shapelets_[0],
                               np.array([[0.56373, 0.494684],
                                         [1.235707, 1.119235]]),
                               atol=1e-2)
    np.testing.assert_allclose(clf.predict(time_series),
                               np.array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0]))

    from sklearn.model_selection import cross_validate
    cross_validate(clf, time_series, y, cv=2)
def fit_lts(X_train, y_train, X_test, y_test, shap_dict, reg, max_it,
            shap_out_path, pred_out_path, timing_out_path):
    # Fit LTS model, print metrics on test-set, write away predictions and shapelets
    clf = ShapeletModel(n_shapelets_per_size=shap_dict,
                        max_iter=max_it,
                        verbose_level=0,
                        batch_size=1,
                        optimizer='sgd',
                        weight_regularizer=reg)

    start = time.time()
    clf.fit(np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)), y_train)
    learning_time = time.time() - start

    with open(shap_out_path, 'w+') as ofp:
        for shap in clf.shapelets_:
            ofp.write(str(np.reshape(shap, (-1))) + '\n')

    with open(timing_out_path, 'w+') as ofp:
        ofp.write(str(learning_time))

    X_distances_train = clf.transform(X_train)
    X_distances_test = clf.transform(X_test)

    fit_lr(X_distances_train, y_train, X_distances_test, y_test, pred_out_path)
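# `fit_lr` is called by both functions above but is not defined in this
# section. A minimal sketch, assuming it fits a logistic regression on the
# shapelet-distance features and writes test-set predictions to
# `pred_out_path` (signature inferred from the call sites; the original
# implementation may differ).
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def fit_lr(X_distances_train, y_train, X_distances_test, y_test, pred_out_path):
    lr = LogisticRegression()
    lr.fit(X_distances_train, y_train)
    predictions = lr.predict(X_distances_test)
    print('LR accuracy on shapelet distances:',
          accuracy_score(y_test, predictions))
    with open(pred_out_path, 'w+') as ofp:
        for pred in predictions:
            ofp.write(str(pred) + '\n')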
def test_series_lengths():
    pytest.importorskip('tensorflow')
    from tslearn.shapelets import ShapeletModel

    # Test long shapelets
    y = [0, 1]
    time_series = to_time_series_dataset([[1, 2, 3, 4, 5], [3, 2, 1]])
    clf = ShapeletModel(n_shapelets_per_size={8: 1},
                        max_iter=1,
                        verbose=0,
                        random_state=0)
    np.testing.assert_raises(ValueError, clf.fit, time_series, y)

    # Test small max_size
    y = [0, 1]
    time_series = to_time_series_dataset([[1, 2, 3, 4, 5], [3, 2, 1]])
    clf = ShapeletModel(n_shapelets_per_size={3: 1},
                        max_iter=1,
                        verbose=0,
                        max_size=4,
                        random_state=0)
    np.testing.assert_raises(ValueError, clf.fit, time_series, y)
class ShapletStateRecognition(BaseMLModelTemplate):
    def build_model(self, **kwargs):
        self.his_len = kwargs['his_len']
        self.segment_dim = kwargs['segment_dim']
        self.model_obj = ShapeletModel(n_shapelets_per_size={self.segment_dim: self.param.n_state},
                                       weight_regularizer=.01,
                                       verbose_level=0)

    def fit(self, x, y=None):
        self.model_obj.fit(x, y)
        self.store(self.param.model_save_path)

    def predict(self, x):
        self.restore(self.param.model_save_path)
        shaplets = []
        for s in self.model_obj.shapelets_:
            shaplets.append(s)
        shaplets = np.reshape(shaplets, [self.param.n_state, self.segment_dim])
        print('shaplets:', shaplets.shape)
        state_pattern = shaplets
        tmpdata = np.reshape(x, [-1, self.his_len, self.segment_dim])
        state_proba = np.zeros([x.shape[0], self.his_len, self.param.n_state],
                               dtype=float)  # np.float is removed in modern NumPy
        for i in range(x.shape[0]):
            for j in range(self.his_len):
                for k in range(self.param.n_state):
                    # Euclidean distance between segment and shapelet k; the
                    # original squared the sum instead of summing the squares
                    state_proba[i, j, k] = np.sqrt(np.sum((tmpdata[i, j] - shaplets[k]) ** 2))
                # min-max normalize the distances over the states
                state_proba[i, j] = (state_proba[i, j] - min(state_proba[i, j])) / \
                                    (max(state_proba[i, j]) - min(state_proba[i, j]))
        return (np.reshape(state_proba, [-1, self.his_len, self.param.n_state]).astype(np.float32),
                np.array(state_pattern, dtype=np.float32))

    def store(self, path, **kwargs):
        save_model_name = "shaplet_{}_{}.state_model".format(self.param.data_name, self.param.n_state)
        joblib.dump(self.model_obj, os.path.join(path, save_model_name))

    def restore(self, path, **kwargs):
        save_model_name = "shaplet_{}_{}.state_model".format(self.param.data_name, self.param.n_state)
        self.model_obj = joblib.load(os.path.join(path, save_model_name))
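# A vectorized sketch of the distance loop in predict() above, using scipy's
# cdist in place of the three nested Python loops. Names and shapes are
# illustrative: `segments` is [n, his_len, segment_dim] and `states` is
# [n_state, segment_dim].
import numpy as np
from scipy.spatial.distance import cdist

def normalized_state_distances(segments, states):
    out = np.empty((segments.shape[0], segments.shape[1], states.shape[0]))
    for i in range(segments.shape[0]):
        d = cdist(segments[i], states)    # Euclidean distances, (his_len, n_state)
        mn = d.min(axis=1, keepdims=True)
        mx = d.max(axis=1, keepdims=True)
        out[i] = (d - mn) / (mx - mn)     # min-max normalize per segment
    return out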
# Get statistics of the dataset
n_ts, ts_sz = X_train.shape[:2]
n_classes = len(set(y_train))

# Set the number of shapelets per size as done in the original paper
shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=n_ts,
                                                       ts_sz=ts_sz,
                                                       n_classes=n_classes,
                                                       l=0.125,
                                                       r=1)

# Define the model and fit it using the training data
shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        weight_regularizer=.01,
                        max_iter=100,
                        verbose=0,
                        random_state=42)
shp_clf.fit(X_train, y_train)

# Get the number of extracted shapelets, the (minimal) distances from
# each of the timeseries to each of the shapelets, and the corresponding
# locations (index) where the minimal distance was found
n_shapelets = sum(shapelet_sizes.values())
distances = shp_clf.transform(X_train)
predicted_locations = shp_clf.locate(X_train)

plt.figure()
plt.title("Example locations of shapelet matches "
          "({} shapelets extracted)".format(n_shapelets))
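# The snippet stops after the title; a plausible continuation, mirroring the
# overlay pattern used in the locate() example later in this section
# (`test_ts_id` is an illustrative index into X_train):
test_ts_id = 0
plt.plot(X_train[test_ts_id].ravel())
for idx_shp, shp in enumerate(shp_clf.shapelets_):
    t0 = predicted_locations[test_ts_id, idx_shp]
    plt.plot(np.arange(t0, t0 + len(shp)), shp, linewidth=2)
plt.tight_layout()
plt.show()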
def classify_with_shapelets():
    from keras.optimizers import Adagrad
    from tslearn.datasets import CachedDatasets
    from tslearn.preprocessing import TimeSeriesScalerMinMax
    from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict

    feat_path = sys.argv[1]
    label_type = sys.argv[2]
    results_base_path = sys.argv[3]
    data_path = sys.argv[4]
    pitch_path = os.path.join(data_path, 'pitch.txt')
    energy_path = os.path.join(data_path, 'energy.txt')

    # Raw pitch and energy
    raw_pitch, raw_energy = read_pitch_energy(pitch_path, energy_path)

    # Tunable parameters = shapelet length, threshold, shapelet redundancy value
    # Sweep shapelet length
    pitch_shapelet = {}
    energy_shapelet = {}
    for shapelet_len in [10, 25, 50]:
        for spkr in raw_pitch:
            # Compute shapelets from raw frames (i.e. no segmented info like
            # where phone/word is)
            pitch_shapelet[spkr] = compute_shapelet_frame(raw_pitch[spkr],
                                                          shapelet_len, True)
            # energy_shapelet[spkr] = compute_shapelet_frame(raw_energy[spkr], shapelet_len)
            # pitch_shapelet[spkr] = np.array(raw_pitch[spkr])
            # print(len(raw_pitch[spkr]))

        acc = []
        for sim in range(10):
            y_true = []
            y_pred = []
            for spkr in tqdm(late_balanced.keys()):
                test_spkr = [spkr]
                # list() is required in Python 3: dict views have no .remove()
                train_spkrs = list(late_balanced.keys())
                train_spkrs.remove(test_spkr[0])
                X_train = np.array([np.array(shapelet).reshape(shapelet_len, 1)
                                    for x in train_spkrs
                                    for shapelet in pitch_shapelet[x]])
                y_train = np.array([late_balanced[x]
                                    for x in train_spkrs
                                    for shapelet in pitch_shapelet[x]])
                X_test = np.array([np.array(shapelet).reshape(shapelet_len, 1)
                                   for x in test_spkr
                                   for shapelet in pitch_shapelet[x]])
                y_test = np.array([late_balanced[x]
                                   for x in test_spkr
                                   for shapelet in pitch_shapelet[x]])
                # print('train data', X_train.shape)
                # print('train label', y_train.shape)

                shapelet_sizes = grabocka_params_to_shapelet_size_dict(
                    n_ts=X_train.shape[0],
                    ts_sz=X_train.shape[1],
                    n_classes=len(set(y_train)),
                    l=0.1,
                    r=2)
                shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                                        optimizer=Adagrad(lr=.1),
                                        weight_regularizer=.01,
                                        max_iter=50,
                                        verbose_level=0)
                shp_clf.fit(X_train, y_train)

                predicted_locations = shp_clf.locate(X_test)
                print('predicted_locations.shape', predicted_locations.shape)
                # test_ts_id = 0
                # plt.figure()
                # plt.title("Example locations of shapelet matches (%d shapelets extracted)"
                #           % sum(shapelet_sizes.values()))
                # plt.plot(X_test[test_ts_id].ravel())
                # for idx_shp, shp in enumerate(shp_clf.shapelets_):
                #     t0 = predicted_locations[test_ts_id, idx_shp]
                #     plt.plot(np.arange(t0, t0 + len(shp)), shp, linewidth=2)
                # plt.tight_layout()
                # plt.savefig(str(test_ts_id) + '_test.png', format='png')

                prediction = shp_clf.predict(X_test)
                prediction_prob = shp_clf.predict_proba(X_test)
                y_pred += prediction.tolist()
                y_true += y_test.tolist()

            # After LOO
            # test_ts_id = 0
            # plt.figure()
            # plt.title("Example locations of shapelet matches (%d shapelets extracted)"
            #           % sum(shapelet_sizes.values()))
            # plt.plot(X_test[test_ts_id].ravel())
            # for idx_shp, shp in enumerate(shp_clf.shapelets_):
            #     t0 = predicted_locations[test_ts_id, idx_shp]
            #     plt.plot(np.arange(t0, t0 + len(shp)), shp, linewidth=2)
            # plt.tight_layout()
            # plt.savefig('test.png', format='png')

            local_acc = balanced_accuracy_score(y_true, y_pred)
            acc.append(local_acc)
            # print('acc', acc)
            # print('final acc', np.mean(acc))
            # print('final acc std', np.std(acc))

    if not os.path.exists(os.path.join(results_base_path, 'regression')):
        os.makedirs(os.path.join(results_base_path, 'regression'))
    results_file = os.path.join(results_base_path, 'regression',
                                'shapelet_' + str(len_feats) + '.txt')
    # len_feats, acc_list, acc_baseline and tot_importance are defined
    # elsewhere in the original module
    with open(results_file, 'w') as w:
        # w.write("Confusion Matrix\n")
        # w.write('{}\n\n'.format(confusion_matrix(y_true, y_pred)))
        w.write('regression: {} ({})\n'.format(np.mean(acc_list), np.std(acc_list)))
        w.write('baseline: {} ({})'.format(np.mean(acc_baseline), np.std(acc_baseline)))
        w.write("\nFeature Importance\n")
        for i in tot_importance:
            tot_importance[i] = np.mean(tot_importance[i])
        for i in sorted(tot_importance.items(), key=operator.itemgetter(1), reverse=True):
            w.write("{} = {}\n".format(i[0], i[1]))
# * genetic algorithm
# * learning time series shapelets
ge = SingleGeneticExtractor(verbose=True, population_size=100,
                            iterations=1000, wait=100)
ge.fit(X, y)
gen_shapelet = ge.shapelets[0]

bfe = BruteForceExtractor()
bf_shapelet = bfe.extract(X, y)[0]

clf = ShapeletModel(n_shapelets_per_size={len(ts1): 1},
                    max_iter=5000,
                    verbose_level=0,
                    batch_size=1,
                    optimizer='sgd',
                    weight_regularizer=0)
clf.fit(np.reshape(X, (X.shape[0], X.shape[1], 1)), y)
lts_shapelet = clf.shapelets_[0]

# Plot the shapelets and orderline
fig, ax = plt.subplots(3, 3, sharey=True)
ax[0][0].axis('off')
ax[0][0].annotate('Brute Force', (0, 0.5), fontsize=24, va='center', ha='left')
ax[0][1].axis('off')
ax[0][1].plot(range(len(bf_shapelet)), bf_shapelet, c=cmap(0.))
# TODO: if dist_tsx too close to other dist_tsy, then change y-coordinate
# so that points do not overlap
i = 0
while i < 5:
    try:
        # Sample random hyper-parameters for LTS
        K = np.random.choice([0.05, 0.15, 0.3])
        L = np.random.choice([0.025, 0.075, 0.125, 0.175, 0.2])
        R = np.random.choice([1, 2, 3])
        _lambda = np.random.choice([0.01, 0.1, 1])
        n_iterations = np.random.choice([2000, 5000, 10000])

        shapelet_dict = grabocka_params_to_shapelet_size_dict(
            X_train.shape[0], X_train.shape[1], int(K * X_train.shape[1]), L, R)
        clf = ShapeletModel(n_shapelets_per_size=shapelet_dict,
                            max_iter=n_iterations,
                            verbose_level=0,
                            batch_size=1,
                            optimizer='sgd',
                            weight_regularizer=_lambda)
        clf.fit(np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)), y_train)

        X_distances_train = clf.transform(X_train)
        X_distances_test = clf.transform(X_test)

        lr = GridSearchCV(LogisticRegression(),
                          {'penalty': ['l1', 'l2'],
                           'C': [0.001, 0.01, 0.1, 1.0, 10.0]})
        lr.fit(X_distances_train, y_train)

        # The excerpt ends inside the try block; an increment and a handler
        # are assumed here so the retry loop is valid Python.
        i += 1
    except Exception:
        continue
def test_shapelet_lengths():
    pytest.importorskip('tensorflow')
    from tslearn.shapelets import ShapeletModel

    # Test variable-length
    y = [0, 1]
    time_series = to_time_series_dataset([[1, 2, 3, 4, 5], [3, 2, 1]])
    clf = ShapeletModel(n_shapelets_per_size={3: 1},
                        max_iter=1,
                        verbose=0,
                        random_state=0)
    clf.fit(time_series, y)
    weights_shapelet = [np.array([[1, 2, 3]])]
    clf.set_weights(weights_shapelet, layer_name="shapelets_0_0")
    tr = clf.transform(time_series)
    # distance is the mean of squared differences at the best alignment:
    # for [3, 2, 1] vs the shapelet [1, 2, 3], ((3-1)^2 + 0 + (1-3)^2) / 3 = 8/3
    np.testing.assert_allclose(tr, np.array([[0.], [8. / 3]]))

    # Test max_size to predict longer series than those passed at fit time
    y = [0, 1]
    time_series = to_time_series_dataset([[1, 2, 3, 4, 5], [3, 2, 1]])
    clf = ShapeletModel(n_shapelets_per_size={3: 1},
                        max_iter=1,
                        verbose=0,
                        max_size=6,
                        random_state=0)
    clf.fit(time_series[:, :-1], y)  # Fit with series of size 4
    weights_shapelet = [np.array([[1, 2, 3]])]
    clf.set_weights(weights_shapelet, layer_name="shapelets_0_0")
    tr = clf.transform(time_series)
    np.testing.assert_allclose(tr, np.array([[0.], [8. / 3]]))
def build_model(self, **kwargs):
    self.his_len = kwargs['his_len']
    self.segment_dim = kwargs['segment_dim']
    self.model_obj = ShapeletModel(n_shapelets_per_size={self.segment_dim: self.param.n_state},
                                   weight_regularizer=.01,
                                   verbose_level=0)
def test_shapelets():
    pytest.importorskip('tensorflow')
    from tslearn.shapelets import ShapeletModel
    import tensorflow as tf

    n, sz, d = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    y = rng.randint(2, size=n)
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer="sgd",
                        random_state=0)
    cross_validate(clf, time_series, y, cv=2)

    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer=tf.optimizers.Adam(.1),
                        random_state=0)
    cross_validate(clf, time_series, y, cv=2)

    model = ShapeletModel(n_shapelets_per_size={3: 2, 4: 1}, max_iter=1)
    model.fit(time_series, y)
    for shp, shp_bis in zip(model.shapelets_,
                            model.shapelets_as_time_series_):
        np.testing.assert_allclose(shp,
                                   to_time_series(shp_bis, remove_nans=True))

    # Test set_weights / get_weights
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        random_state=0)
    clf.fit(time_series, y)
    preds_before = clf.predict_proba(time_series)
    weights = clf.get_weights()
    # Change number of iterations, then refit, then set weights
    clf.max_iter *= 2
    clf.fit(time_series, y)
    clf.set_weights(weights)
    np.testing.assert_allclose(preds_before,
                               clf.predict_proba(time_series))
n_ts, ts_sz = X_train.shape[:2]
n_classes = len(set(y_train))

# Set the number of shapelets per size as done in the original paper
shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=n_ts,
                                                       ts_sz=ts_sz,
                                                       n_classes=n_classes,
                                                       l=0.1,
                                                       r=1)

# Define the model using parameters provided by the authors (except that we
# use fewer iterations here)
shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        optimizer=tf.optimizers.Adam(.01),
                        batch_size=16,
                        weight_regularizer=.01,
                        max_iter=200,
                        random_state=42,
                        verbose=0)
shp_clf.fit(X_train, y_train)

# Make predictions and calculate accuracy score
pred_labels = shp_clf.predict(X_test)
print("Correct classification rate:", accuracy_score(y_test, pred_labels))

# Plot the different discovered shapelets
plt.figure()
for i, sz in enumerate(shapelet_sizes.keys()):
    plt.subplot(len(shapelet_sizes), 1, i + 1)
    plt.title("%d shapelets of size %d" % (shapelet_sizes[sz], sz))
    for shp in shp_clf.shapelets_:
        # loop body completed to match the analogous plotting example below
        if ts_size(shp) == sz:
            plt.plot(shp.ravel())
n_ts, ts_sz = X_train.shape[:2]
n_classes = len(set(y_train))

# Set the number of shapelets per size as done in the original paper
shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=n_ts,
                                                       ts_sz=ts_sz,
                                                       n_classes=n_classes,
                                                       l=0.1,
                                                       r=2)

# Define the model using parameters provided by the authors (except that we use
# fewer iterations here)
shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        optimizer=Adagrad(lr=.1),
                        weight_regularizer=.01,
                        max_iter=200,
                        verbose=0)
shp_clf.fit(X_train, y_train)
predicted_labels = shp_clf.predict(X_test)
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

plt.figure()
for i, sz in enumerate(shapelet_sizes.keys()):
    plt.subplot(len(shapelet_sizes), 1, i + 1)
    plt.title("%d shapelets of size %d" % (shapelet_sizes[sz], sz))
    for shp in shp_clf.shapelets_:
        if ts_size(shp) == sz:
            plt.plot(shp.ravel())
    plt.xlim([0, max(shapelet_sizes.keys()) - 1])
import numpy
import matplotlib.pyplot as plt
from keras.optimizers import Adagrad  # import added; Adagrad is used below
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.shapelets import ShapeletModel

numpy.random.seed(0)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = TimeSeriesScalerMinMax().fit_transform(X_train)
X_test = TimeSeriesScalerMinMax().fit_transform(X_test)

shapelet_sizes = {50: 3}
shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        optimizer=Adagrad(lr=.1),
                        weight_regularizer=.01,
                        max_iter=300,
                        verbose_level=0)
shp_clf.fit(X_train, y_train)
predicted_locations = shp_clf.locate(X_test)

test_ts_id = 0
plt.figure()
plt.title("Example locations of shapelet matches "
          "(%d shapelets extracted)" % sum(shapelet_sizes.values()))
plt.plot(X_test[test_ts_id].ravel())
for idx_shp, shp in enumerate(shp_clf.shapelets_):
    t0 = predicted_locations[test_ts_id, idx_shp]
    plt.plot(numpy.arange(t0, t0 + len(shp)), shp, linewidth=2)
plt.tight_layout()
def fit(self, X, y):
    """Fit the model using X as training data and y as target values

    Parameters
    ----------
    X : {array-like}
        Training data. Shape [n_samples, n_features].
    y : {array-like, sparse matrix}
        Target values of shape = [n_samples] or [n_samples, n_outputs]
    """
    self.X = X
    self.y = y

    n_shapelets_per_size = self.shapelet_model_params.get("n_shapelets_per_size", "heuristic")
    if n_shapelets_per_size == "heuristic":
        n_ts, ts_sz = X.shape[:2]
        n_classes = len(set(y))
        n_shapelets_per_size = grabocka_params_to_shapelet_size_dict(
            n_ts=n_ts,
            ts_sz=ts_sz,
            n_classes=n_classes,
            l=self.shapelet_model_params.get("l", 0.1),
            r=self.shapelet_model_params.get("r", 2))

    shp_clf = ShapeletModel(
        n_shapelets_per_size=n_shapelets_per_size,
        optimizer=self.shapelet_model_params.get("optimizer", "sgd"),
        weight_regularizer=self.shapelet_model_params.get("weight_regularizer", .01),
        max_iter=self.shapelet_model_params.get("max_iter", 100),
        random_state=self.random_state,
        verbose=self.shapelet_model_params.get("verbose", 0))
    shp_clf.fit(X, y)
    X_transformed = shp_clf.transform(X)
    self.X_transformed = X_transformed

    if self.tau is not None:
        self.X_thresholded = 1 * (self.X_transformed < self.tau)
        clf = DecisionTreeClassifier()
        param_grid = self.decision_tree_grid_search_params
        grid = GridSearchCV(clf,
                            param_grid=param_grid,
                            scoring='accuracy',
                            n_jobs=-1,
                            verbose=0)
        grid.fit(self.X_thresholded, y)
    else:
        grids = []
        grids_scores = []
        for quantile in self.tau_quantiles:
            _X_thresholded = 1 * (self.X_transformed < (np.quantile(self.X_transformed, quantile)))
            clf = DecisionTreeClassifier()
            param_grid = self.decision_tree_grid_search_params
            grid = GridSearchCV(clf,
                                param_grid=param_grid,
                                scoring='accuracy',
                                n_jobs=-1,
                                verbose=0)
            grid.fit(_X_thresholded, y)
            grids.append(grid)
            grids_scores.append(grid.best_score_)
        grid = grids[np.argmax(np.array(grids_scores))]
        best_quantile = self.tau_quantiles[np.argmax(np.array(grids_scores))]
        self.tau = np.quantile(self.X_transformed, best_quantile)
        self.X_thresholded = 1 * (self.X_transformed < self.tau)

    clf = DecisionTreeClassifier(**grid.best_params_)
    clf.fit(self.X_thresholded, y)
    if self.prune_duplicate_tree_leaves:
        prune_duplicate_leaves(clf)  # FIXME: does it influence the .tree properties?

    self.decision_tree = clf
    self.decision_tree_explorable = NewTree(clf)
    self.decision_tree_explorable.build_tree()
    self._shapelet_model = shp_clf
    self._build_tree_graph()
    return self
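# A condensed, standalone sketch of the transform-threshold-tree pipeline
# implemented in fit() above, on synthetic data (all names and values here
# are illustrative, not taken from the original code):
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from tslearn.shapelets import ShapeletModel

rng = np.random.RandomState(0)
X = rng.randn(20, 30, 1)
y = rng.randint(2, size=20)

shp = ShapeletModel(n_shapelets_per_size={5: 3}, max_iter=10, verbose=0)
shp.fit(X, y)
D = shp.transform(X)              # shapelet-distance features
tau = np.quantile(D, 0.5)         # a distance threshold
X_thresholded = 1 * (D < tau)     # binary "shapelet matches" features
tree = DecisionTreeClassifier().fit(X_thresholded, y)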
class Shapelets():
    def __init__(self, epochs, length, num_shapelet_lengths, num_shapelets,
                 learning_rate, weight_regularizer, batch_size=256, optimizer=Adam):
        '''
        initialize shapelet hyperparameters

        hyperparameters:
            epochs               : number of training epochs
            length               : base shapelet length, expressed as fraction of length of time series
            num_shapelet_lengths : number of different shapelet lengths
            num_shapelets        : number of unique shapelets to learn at each shapelet length,
                                   expressed as fraction of length of time series
            learning_rate        : learning rate of Keras optimizer
            weight_regularizer   : weight regularization used when fitting model
        '''
        self.epochs = epochs
        self.length = length
        self.num_shapelet_lengths = num_shapelet_lengths
        self.num_shapelets = num_shapelets
        self.weight_regularizer = weight_regularizer
        self.batch_size = batch_size
        self.optimizer = optimizer(lr=learning_rate)
        self.shapelet_sizes = None
        self.shapelet_clf = None
        self.encoder = LabelEncoder()

    def clear_session(self):
        if self.shapelet_clf is None:
            raise ValueError("Cannot clear session that has not been initialized")
        self.shapelet_clf.clear_session()

    def load_model(self, series_length, labels, checkpoint):
        ''' Load model from checkpoint into Shapelet classifier '''
        if self.shapelet_clf is None:
            base_size = int(self.length * series_length)
            self.shapelet_sizes = {}
            for sz_idx in range(self.num_shapelet_lengths):
                shp_sz = base_size * (sz_idx + 1)
                self.shapelet_sizes[shp_sz] = int(self.num_shapelets * series_length)
            self.shapelet_clf = ShapeletModel(n_shapelets_per_size=self.shapelet_sizes,
                                              optimizer=self.optimizer,
                                              weight_regularizer=self.weight_regularizer,
                                              max_iter=self.epochs,
                                              batch_size=self.batch_size)
        # first generate new model into which to load the weights
        self.encode(labels)
        self.shapelet_clf.generate_model(series_length, len(self.get_classes()))
        # load weights
        self.shapelet_clf.model.load_weights(checkpoint)

    def fit_transfer_model(self, X_train, y_train, checkpoint, nclasses_prior=2,
                           source_dir=None, val_data=None):
        # encode training and validation labels
        y_train = self.encode(y_train)
        y_val = self.encode(val_data[1])
        # scale training and validation data to between 0 and 1
        X_train_scaled = self.__ScaleData(X_train)
        X_val_scaled = self.__ScaleData(val_data[0])
        if self.shapelet_clf is None:
            base_size = int(self.length * X_train.shape[1])
            self.shapelet_sizes = {}
            for sz_idx in range(self.num_shapelet_lengths):
                shp_sz = base_size * (sz_idx + 1)
                self.shapelet_sizes[shp_sz] = int(self.num_shapelets * X_train.shape[1])
            self.shapelet_clf = ShapeletModel(n_shapelets_per_size=self.shapelet_sizes,
                                              optimizer=self.optimizer,
                                              weight_regularizer=self.weight_regularizer,
                                              max_iter=self.epochs,
                                              batch_size=self.batch_size)
        # fit shapelet classifier
        self.shapelet_clf.fit_transfer_model(X_train_scaled, y_train, nclasses_prior,
                                             checkpoint, source_dir, (X_val_scaled, y_val))

    def fit(self, X_train, y_train, source_dir=None, val_data=None):
        '''
        fit shapelet classifier on training data

        parameters:
            X_train : training time series
            y_train : training labels
        '''
        if self.shapelet_clf is None:
            base_size = int(self.length * X_train.shape[1])
            self.shapelet_sizes = {}
            for sz_idx in range(self.num_shapelet_lengths):
                shp_sz = base_size * (sz_idx + 1)
                self.shapelet_sizes[shp_sz] = int(self.num_shapelets * X_train.shape[1])
            self.shapelet_clf = ShapeletModel(n_shapelets_per_size=self.shapelet_sizes,
                                              optimizer=self.optimizer,
                                              weight_regularizer=self.weight_regularizer,
                                              max_iter=self.epochs,
                                              batch_size=self.batch_size)

        # encode training and validation labels
        y_train = self.encode(y_train)
        y_val = self.encode(val_data[1])

        # scale training and validation data to between 0 and 1
        X_train_scaled = self.__ScaleData(X_train)
        X_val_scaled = self.__ScaleData(val_data[0])

        # fit classifier
        self.shapelet_clf.fit(X_train_scaled, y_train, source_dir, (X_val_scaled, y_val))

    def __ScaleData(self, input_data):
        ''' scale input data to range [0,1]

        parameters:
            input_data : input data to rescale
        '''
        return TimeSeriesScalerMinMax().fit_transform(input_data)

    def predict(self, X_test):
        ''' classifications for time series in test data set

        parameters:
            X_test : test time series on which to predict classes

        returns: classifications for test data set
        '''
        X_test_scaled = self.__ScaleData(X_test)
        return self.shapelet_clf.predict(X_test_scaled)

    def predict_proba(self, X_test):
        ''' class probabilities for time series in test data set

        parameters:
            X_test : test time series on which to predict classes

        returns: class probabilities for test data set
        '''
        X_test_scaled = self.__ScaleData(X_test)
        return self.shapelet_clf.predict_proba(X_test_scaled)

    def encode(self, categories):
        ''' fit label encoder on input categories. returns transformed categories '''
        self.encoder.fit(categories)
        return self.encoder.transform(categories)

    def decode(self, y_probs, p_threshold):
        ''' decode prediction probabilities y_probs into prediction / confidence given p_threshold '''
        prob_max = np.amax(y_probs, axis=1)
        prediction_indices = prob_max > p_threshold
        y_pred = np.zeros(y_probs.shape[0])
        # reinterpret confidence in binary case
        if y_probs.shape[1] == 1:
            y_pred[prediction_indices] = 1
            confidence = (prob_max - p_threshold) / (y_pred - p_threshold)
            confidence = 0.5 + confidence / 2
        else:
            y_pred[prediction_indices] = np.argmax(y_probs, axis=1)[prediction_indices]
            confidence = prob_max
        y_pred = y_pred.astype(int)
        y_preds = self.encoder.inverse_transform(y_pred)
        return y_preds, confidence

    def get_classes(self):
        ''' get original classes from encoder '''
        if self.encoder is None:
            raise ValueError("Encoder has not been initialized")
        return self.encoder.classes_

    def VisualizeShapelets(self):
        ''' visualize all shapelets learned by shapelet classifier '''
        plt.figure()
        for i, sz in enumerate(self.shapelet_sizes.keys()):
            plt.subplot(len(self.shapelet_sizes), 1, i + 1)
            plt.title("%d shapelets of size %d" % (self.shapelet_sizes[sz], sz))
            for shapelet in self.shapelet_clf.shapelets_:
                if ts_size(shapelet) == sz:
                    plt.plot(shapelet.ravel())
            plt.xlim([0, max(self.shapelet_sizes.keys())])
        plt.tight_layout()
        plt.show()

    def VisualizeShapeletLocations(self, series_values, series_id,
                                   save_dir='visualizations', name='shp_1'):
        ''' visualize shapelets superimposed on one of the test series

        parameters:
            series_values : raw values on which to visualize shapelets
            series_id     : id of time series to visualize
            save_dir      : directory in which to save visualizations
            name          : name under which to save viz (bc unique every time)
        '''
        plt.style.use("seaborn-whitegrid")
        # NK brand colors
        COLORS = ["#FA5655", "#F79690", "#B9BC2D", "#86B6B2", "#955B99", "#252B7A"]
        # others?
        # "#8D6B2C", "#D0A826", "#FEDB03", "#000000", "#454545", "#FFFFFF", "#F8F6F1"

        n_rows, n_cols, _ = series_values.shape
        test_series = series_values[series_id].reshape(-1, )
        closest_inds = self.shapelet_clf.locate(test_series.reshape(1, -1, 1))[0]
        closest_dists = []
        for ind, shp in zip(closest_inds, self.shapelet_clf.shapelets_):
            closest_dists.append(np.linalg.norm(test_series[ind:ind + shp.shape[0]] - shp))
        closest_dists = np.array(closest_dists)
        # convert distance to weight where dist=0 -> wt=1 and dist=inf -> wt=0
        sl_weights = 1 / (1 + closest_dists)

        # plot the signal with matching shapelets color overlayed
        plt.clf()
        plt.plot(range(n_cols), test_series, color="k")
        for ind, sl, wt, color in zip(closest_inds, self.shapelet_clf.shapelets_,
                                      sl_weights, COLORS):
            # find closest match
            t = range(ind, ind + sl.shape[0])
            match = test_series[ind:ind + sl.shape[0]]
            # plot shapelet on top of signal with width and alpha set by
            # shapelet weight (alpha clamped: matplotlib requires alpha <= 1)
            plt.plot(t, match, alpha=min(1.0, 7 * wt), linewidth=35 * wt, color=color)
        plt.ylabel('Email Density')
        plt.xlabel('Minute of the Hour')
        plt.show()
        # plt.savefig(save_dir + "/{}_signal_size_{}_id_{}.png".format(name, n_cols, series_id))

        # plot shapelets
        plt.clf()
        # to plot the shapelets, switch to dark background
        plt.style.use("seaborn-darkgrid")
        # ax = plt.axes()  # used below for sharex, sharey (if needed?)
        # arrange shapelets in grid - find greatest factor of n_shapelets
        gf = 0
        shp_t = self.shapelet_clf.shapelets_as_time_series_
        shp = self.shapelet_clf.shapelets_
        for i in range(1, shp.shape[0]):
            if shp.shape[0] % i == 0:
                gf = i
        of = int(shp.shape[0] / gf)
        n_cols = 2
        for i in range(shp_t.shape[0]):
            ax_i = plt.subplot(gf, of, i + 1)
            # we could force them to share the same axes
            # ax_i = plt.subplot(n_rows, n_cols, i + 1, sharex=ax, sharey=ax)
            # ax_i.set_xticklabels([])
            ax_i.set_yticklabels([])
            plt.plot(range(shp_t.shape[1]), shp[i].reshape(-1),
                     color=COLORS[i % len(COLORS)], linewidth=3)
        plt.xlabel('Shapelet Length')
        plt.show()