def cluster(timeseries_df, data_labels):
    """Learn shapelets for each class, using cluster labels pre-computed on disk."""
    Shapelet_list = []
    D = '/home/abhilash/Datasets/UCRArchive_2018/TwoLeadECG/TXT_Files/'
    for i in range(1, max(data_labels) + 1):
        print('Class', i)
        # Keep only the time series of class i (column '0' holds the class label)
        ts_df = timeseries_df[timeseries_df['0'] == i]
        ts_df = ts_df.reset_index(drop=True)
        labels = ts_df['0']
        ts_df = ts_df.drop(ts_df.columns[0], axis=1)
        # Pre-computed cluster labels for this class
        S = 'class' + str(i) + 'labels.txt'
        pred_label = pd.read_csv(D + S, header=None)
        pred_label = np.ravel(np.array(pred_label))
        shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=ts_df.shape[0],
                                                               ts_sz=ts_df.shape[1],
                                                               n_classes=2,
                                                               l=0.36,
                                                               r=1)
        shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                                optimizer=Adagrad(lr=.1),
                                weight_regularizer=.01,
                                max_iter=50,
                                verbose=0)
        shp_clf.fit(ts_df, pred_label)
        shapelets = shp_clf.shapelets_
        temp_list = []
        # Use a separate index here; the original re-used `i`, shadowing the class loop variable
        for j in range(shapelets.shape[0]):
            temp = shapelets[j].T
            temp_list.append(temp[0])
        Shapelet_list.append(temp_list)
    return Shapelet_list
def cluster(shape, timeseries_df, data_labels, k):
    """Learn shapelets for each class, using labels from u-shapelet k-means clustering."""
    Shapelet_list = []
    for i in range(1, max(data_labels) + 1):
        ts_df = timeseries_df[timeseries_df['0'] == i]
        ts_df = ts_df.reset_index(drop=True)
        # cluster_list.append(extractU_Shapelets(shape))
        labels = ts_df['0']
        ts_df = ts_df.drop(ts_df.columns[0], axis=1)
        # Extract u-shapelets and cluster the members of this class with k-means
        S = uShapeletClustering.extractU_Shapelets.extract_Shapelets(ts_df.copy(), shape, k)
        S = np.array(list(S))
        pred_label = uShapeletClustering.Kmeans.Kmeans(ts_df.copy(), S, k, labels)
        shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=ts_df.shape[0],
                                                               ts_sz=ts_df.shape[1],
                                                               n_classes=2,
                                                               l=0.5,
                                                               r=1)
        shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                                optimizer=Adagrad(lr=.1),
                                weight_regularizer=.01,
                                max_iter=50,
                                verbose=0)
        shp_clf.fit(ts_df, pred_label)
        shapelets = shp_clf.shapelets_
        temp_list = []
        # Use a separate index here; the original re-used `i`, shadowing the class loop variable
        for j in range(shapelets.shape[0]):
            temp = shapelets[j].T
            temp_list.append(temp[0])
        Shapelet_list.append(temp_list)
    return Shapelet_list
def learningShapeletClassifier(X_train, Y_train):
    shapelet_sizes = grabocka_params_to_shapelet_size_dict(
        n_ts=X_train.shape[0],
        ts_sz=X_train.shape[1],
        n_classes=len(set(Y_train)),
        l=0.1,
        r=2)
    shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                            optimizer=Adagrad(lr=.1),
                            weight_regularizer=.01,
                            max_iter=200,
                            verbose_level=0)
    shp_clf.fit(X_train, Y_train)
    return shp_clf
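# --- Usage sketch (not in the original source): shows one way to call
# learningShapeletClassifier, using the tslearn "Trace" toy dataset as an example input.
# It assumes ShapeletModel, Adagrad and grabocka_params_to_shapelet_size_dict are
# already imported at module level, as the function above requires.
if __name__ == '__main__':
    from sklearn.metrics import accuracy_score
    from tslearn.datasets import CachedDatasets
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    X_tr, y_tr, X_te, y_te = CachedDatasets().load_dataset("Trace")
    X_tr = TimeSeriesScalerMinMax().fit_transform(X_tr)
    X_te = TimeSeriesScalerMinMax().fit_transform(X_te)

    clf = learningShapeletClassifier(X_tr, y_tr)
    print("Accuracy:", accuracy_score(y_te, clf.predict(X_te)))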
def executeLearningShapelet(datasetName):
    # INPUT: dataset name
    # Runs a shapelet-transformation (Learning Shapelets) algorithm over the dataset `datasetName`
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(datasetName)

    # Re-size / reshape the training set
    dfTrain = computeLoadedDataset(X_train, y_train)
    y_train = dfTrain['target'].values
    y_train = y_train.astype(int)

    # Get the number of classes
    le = LabelEncoder()
    distinct_classes = le.fit_transform(dfTrain['target'])
    distinct_classes = np.unique(distinct_classes, return_counts=False)
    num_classes = len(distinct_classes)
    print(distinct_classes)
    print(num_classes)
    del dfTrain['target']
    del dfTrain['TsIndex']

    # Re-size / reshape the test set
    dfTest = computeLoadedDataset(X_test, y_test)
    y_test = dfTest['target'].values
    y_test = y_test.astype(int)
    del dfTest['target']
    del dfTest['TsIndex']

    # Start of training-set preprocessing
    start_timePreprocessingTrain = time.time()
    shapelet_sizes = grabocka_params_to_shapelet_size_dict(
        n_ts=len(dfTrain),
        ts_sz=len(dfTrain.iloc[0]),
        n_classes=num_classes,
        l=0.1,  # fixed parameters
        r=1)
    grabocka = LearningShapelets(n_shapelets_per_size=shapelet_sizes)
    grabocka.fit(dfTrain, y_train)
    X_train_distances = grabocka.transform(dfTrain)
    # End of training-set preprocessing
    PreprocessingTrainTime = time.time() - start_timePreprocessingTrain

    # Start of training
    start_timeTrain = time.time()
    dt = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=20)
    dt.fit(X_train_distances, y_train)
    # End of training
    TrainTime = time.time() - start_timeTrain

    # Start of test-set preprocessing
    start_timePreprocessingTest = time.time()
    X_test_distances = grabocka.transform(dfTest)
    # End of test-set preprocessing
    PreprocessingTestTime = time.time() - start_timePreprocessingTest

    # Start of testing
    start_timeTest = time.time()
    y_predict = dt.predict(X_test_distances)
    # End of testing
    TestTime = time.time() - start_timeTest

    print(accuracy_score(y_test, y_predict))
    row = ['LearningShapelets',
           datasetName,
           round(accuracy_score(y_test, y_predict), 2),
           round(PreprocessingTrainTime, 2),
           round(TrainTime, 2),
           round(PreprocessingTestTime, 2),
           round(TestTime, 2)]
    WriteCsvShapeletAlgo('Shapelet_Algo_Experiments_29-12.csv', row)
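# Example call (illustrative only; 'ECG200' is just one UCR archive dataset name,
# any name accepted by UCR_UEA_datasets().load_dataset() would work):
# executeLearningShapelet('ECG200')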
import numpy
import matplotlib.pyplot as plt

# Adagrad optimizer used by ShapeletModel (tensorflow.keras.optimizers in newer installs)
from keras.optimizers import Adagrad
from sklearn.metrics import accuracy_score
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.shapelets import ShapeletModel, \
    grabocka_params_to_shapelet_size_dict
from tslearn.utils import ts_size

numpy.random.seed(0)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = TimeSeriesScalerMinMax().fit_transform(X_train)
X_test = TimeSeriesScalerMinMax().fit_transform(X_test)

n_ts, ts_sz = X_train.shape[:2]
n_classes = len(set(y_train))

# Set the number of shapelets per size as done in the original paper
shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=n_ts,
                                                       ts_sz=ts_sz,
                                                       n_classes=n_classes,
                                                       l=0.1,
                                                       r=2)

# Define the model using parameters provided by the authors
# (except that we use fewer iterations here)
shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        optimizer=Adagrad(lr=.1),
                        weight_regularizer=.01,
                        max_iter=200,
                        verbose=0)
shp_clf.fit(X_train, y_train)
predicted_labels = shp_clf.predict(X_test)
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

plt.figure()
import numpy
import matplotlib.pyplot as plt

# Adagrad optimizer used by ShapeletModel (tensorflow.keras.optimizers in newer installs)
from keras.optimizers import Adagrad
from sklearn.metrics import accuracy_score
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict
from tslearn.utils import ts_size

numpy.random.seed(0)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = TimeSeriesScalerMinMax().fit_transform(X_train)
X_test = TimeSeriesScalerMinMax().fit_transform(X_test)

# Set the number of shapelets per size as done in the original paper
shapelet_sizes = grabocka_params_to_shapelet_size_dict(
    n_ts=X_train.shape[0],  # n_ts is required by this helper; it was missing in the original call
    ts_sz=X_train.shape[1],
    n_classes=len(set(y_train)),
    l=0.1,
    r=2)

# Define the model using parameters provided by the authors (except that we use fewer iterations here)
shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        optimizer=Adagrad(lr=.1),
                        weight_regularizer=.01,
                        max_iter=300,
                        verbose_level=0)
shp_clf.fit(X_train, y_train)
predicted_labels = shp_clf.predict(X_test)
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

plt.figure()
# The original snippet ends at the loop header; the body below is a sketched completion
# that follows the standard tslearn shapelet plotting example.
for i, sz in enumerate(shapelet_sizes.keys()):
    plt.subplot(len(shapelet_sizes), 1, i + 1)
    plt.title("%d shapelets of size %d" % (shapelet_sizes[sz], sz))
    for shp in shp_clf.shapelets_:
        if ts_size(shp) == sz:
            plt.plot(shp.ravel())
plt.tight_layout()
plt.show()
def classify_with_shapelets():
    from keras.optimizers import Adagrad
    from tslearn.datasets import CachedDatasets
    from tslearn.preprocessing import TimeSeriesScalerMinMax
    from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict

    feat_path = sys.argv[1]
    label_type = sys.argv[2]
    results_base_path = sys.argv[3]
    data_path = sys.argv[4]

    pitch_path = os.path.join(data_path, 'pitch.txt')
    energy_path = os.path.join(data_path, 'energy.txt')

    # Raw pitch and energy
    raw_pitch, raw_energy = read_pitch_energy(pitch_path, energy_path)

    # Tunable parameters: shapelet length, threshold, shapelet redundancy value
    # Sweep the shapelet length
    pitch_shapelet = {}
    energy_shapelet = {}
    for shapelet_len in [10, 25, 50]:
        for spkr in raw_pitch:
            # Compute shapelets from raw frames (i.e. no segmented info like where phone/word is)
            pitch_shapelet[spkr] = compute_shapelet_frame(raw_pitch[spkr], shapelet_len, True)
            # energy_shapelet[spkr] = compute_shapelet_frame(raw_energy[spkr], shapelet_len)
            # pitch_shapelet[spkr] = np.array(raw_pitch[spkr])
            # print(len(raw_pitch[spkr]))
            # exit()

        acc = []
        for sim in range(10):
            y_true = []
            y_pred = []
            # Leave-one-speaker-out evaluation
            for spkr in tqdm(late_balanced.keys()):
                test_spkr = [spkr]
                train_spkrs = list(late_balanced.keys())  # materialise as a list so .remove works in Python 3
                train_spkrs.remove(test_spkr[0])
                X_train = np.array([np.array(shapelet).reshape(shapelet_len, 1)
                                    for x in train_spkrs
                                    for shapelet in pitch_shapelet[x]])
                y_train = np.array([late_balanced[x]
                                    for x in train_spkrs
                                    for shapelet in pitch_shapelet[x]])
                X_test = np.array([np.array(shapelet).reshape(shapelet_len, 1)
                                   for x in test_spkr
                                   for shapelet in pitch_shapelet[x]])
                y_test = np.array([late_balanced[x]
                                   for x in test_spkr
                                   for shapelet in pitch_shapelet[x]])
                # print('train data', X_train.shape)
                # print('train data first', X_train[0])
                # print('train label', y_train.shape)
                # exit()
                shapelet_sizes = grabocka_params_to_shapelet_size_dict(
                    n_ts=X_train.shape[0],
                    ts_sz=X_train.shape[1],
                    n_classes=len(set(y_train)),
                    l=0.1,
                    r=2)
                shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                                        optimizer=Adagrad(lr=.1),
                                        weight_regularizer=.01,
                                        max_iter=50,
                                        verbose_level=0)
                shp_clf.fit(X_train, y_train)
                predicted_locations = shp_clf.locate(X_test)
                print('predicted_locations.shape', predicted_locations.shape)
                # test_ts_id = 0
                # plt.figure()
                # plt.title("Example locations of shapelet matches (%d shapelets extracted)"
                #           % sum(shapelet_sizes.values()))
                # plt.plot(X_test[test_ts_id].ravel())
                # for idx_shp, shp in enumerate(shp_clf.shapelets_):
                #     t0 = predicted_locations[test_ts_id, idx_shp]
                #     plt.plot(np.arange(t0, t0 + len(shp)), shp, linewidth=2)
                # plt.tight_layout()
                # plt.savefig(test_ts_id + '_test.png', format='png')
                # exit()
                prediction = shp_clf.predict(X_test)
                prediction_prob = shp_clf.predict_proba(X_test)
                y_pred += prediction.tolist()
                y_true += y_test.tolist()

            # After leave-one-out
            # test_ts_id = 0
            # plt.figure()
            # plt.title("Example locations of shapelet matches (%d shapelets extracted)"
            #           % sum(shapelet_sizes.values()))
            # plt.plot(X_test[test_ts_id].ravel())
            # for idx_shp, shp in enumerate(shp_clf.shapelets_):
            #     t0 = predicted_locations[test_ts_id, idx_shp]
            #     plt.plot(np.arange(t0, t0 + len(shp)), shp, linewidth=2)
            # plt.tight_layout()
            # plt.savefig('test.png', format='png')
            local_acc = balanced_accuracy_score(y_true, y_pred)
            acc.append(local_acc)
            # print('acc', acc)

        # print('final acc', np.mean(acc))
        # print('final acc std', np.std(acc))

    # NOTE: acc_list, acc_baseline, tot_importance and len_feats are assumed to be
    # defined elsewhere in the original module; they are not set in this function.
    if not os.path.exists(os.path.join(results_base_path, 'regression')):
        os.makedirs(os.path.join(results_base_path, 'regression'))
    results_file = os.path.join(results_base_path, 'regression',
                                'shapelet_' + str(len_feats) + '.txt')
    with open(results_file, 'w') as w:
        # w.write("Confusion Matrix\n")
        # w.write('{}\n\n'.format(confusion_matrix(y_true, y_pred)))
        w.write('regression: {} ({})\n'.format(np.mean(acc_list), np.std(acc_list)))
        w.write('baseline: {} ({})'.format(np.mean(acc_baseline), np.std(acc_baseline)))
        w.write("\nFeature Importance\n")
        for i in tot_importance:
            tot_importance[i] = np.mean(tot_importance[i])
        for i in sorted(tot_importance.items(), key=operator.itemgetter(1), reverse=True):
            w.write("{} = {}\n".format(i[0], i[1]))
def fit(self, X, y):
    """Fit the model using X as training data and y as target values

    Parameters
    ----------
    X : {array-like}
        Training data. Shape [n_samples, n_features].
    y : {array-like, sparse matrix}
        Target values of shape = [n_samples] or [n_samples, n_outputs]
    """
    self.X = X
    self.y = y

    n_shapelets_per_size = self.shapelet_model_params.get("n_shapelets_per_size", "heuristic")
    if n_shapelets_per_size == "heuristic":
        n_ts, ts_sz = X.shape[:2]
        n_classes = len(set(y))
        n_shapelets_per_size = grabocka_params_to_shapelet_size_dict(
            n_ts=n_ts,
            ts_sz=ts_sz,
            n_classes=n_classes,
            l=self.shapelet_model_params.get("l", 0.1),
            r=self.shapelet_model_params.get("r", 2))

    shp_clf = ShapeletModel(
        n_shapelets_per_size=n_shapelets_per_size,
        optimizer=self.shapelet_model_params.get("optimizer", "sgd"),
        weight_regularizer=self.shapelet_model_params.get("weight_regularizer", .01),
        max_iter=self.shapelet_model_params.get("max_iter", 100),
        random_state=self.random_state,
        verbose=self.shapelet_model_params.get("verbose", 0))
    shp_clf.fit(X, y)

    X_transformed = shp_clf.transform(X)
    self.X_transformed = X_transformed

    if self.tau is not None:
        self.X_thresholded = 1 * (self.X_transformed < self.tau)
        clf = DecisionTreeClassifier()
        param_grid = self.decision_tree_grid_search_params
        grid = GridSearchCV(clf,
                            param_grid=param_grid,
                            scoring='accuracy',
                            n_jobs=-1,
                            verbose=0)
        grid.fit(self.X_thresholded, y)
    else:
        grids = []
        grids_scores = []
        for quantile in self.tau_quantiles:
            _X_thresholded = 1 * (self.X_transformed < np.quantile(self.X_transformed, quantile))
            clf = DecisionTreeClassifier()
            param_grid = self.decision_tree_grid_search_params
            grid = GridSearchCV(clf,
                                param_grid=param_grid,
                                scoring='accuracy',
                                n_jobs=-1,
                                verbose=0)
            grid.fit(_X_thresholded, y)
            grids.append(grid)
            grids_scores.append(grid.best_score_)
        grid = grids[np.argmax(np.array(grids_scores))]
        best_quantile = self.tau_quantiles[np.argmax(np.array(grids_scores))]
        self.tau = np.quantile(self.X_transformed, best_quantile)
        self.X_thresholded = 1 * (self.X_transformed < self.tau)

    clf = DecisionTreeClassifier(**grid.best_params_)
    clf.fit(self.X_thresholded, y)
    if self.prune_duplicate_tree_leaves:
        prune_duplicate_leaves(clf)  # FIXME: does it influence the .tree properties?

    self.decision_tree = clf
    self.decision_tree_explorable = NewTree(clf)
    self.decision_tree_explorable.build_tree()
    self._shapelet_model = shp_clf
    self._build_tree_graph()
    return self
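# --- Usage sketch (not part of the original source). The enclosing class is not shown in
# this snippet, so the name `ShapeletTreeClassifier` and the constructor arguments below
# are hypothetical placeholders chosen only to illustrate the fit() flow: shapelet
# distances (X_transformed) are thresholded at tau into a binary matrix (X_thresholded),
# and a grid-searched DecisionTreeClassifier is fit on that matrix.
#
# clf = ShapeletTreeClassifier(shapelet_model_params={"l": 0.1, "r": 2},
#                              tau=None,                      # let fit() pick tau from tau_quantiles
#                              tau_quantiles=[0.1, 0.25, 0.5],
#                              decision_tree_grid_search_params={"max_depth": [3, 5, None]},
#                              prune_duplicate_tree_leaves=True,
#                              random_state=0)
# clf.fit(X_train, y_train)
# clf.decision_tree        # sklearn DecisionTreeClassifier trained on clf.X_thresholded
# clf._shapelet_model      # the underlying fitted ShapeletModel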