def __init__(self, train, name):
    """Load one UCR/UEA archive dataset into a (data, targets) torch dataset.

    Parameters
    ----------
    train : bool
        If True load the train split, otherwise the test split.
    name : str
        Name of the UCR/UEA dataset to download/load.
    """
    loader = UCR_UEA_datasets()
    projector = None
    if train:
        data, targets, _, _ = loader.load_dataset(name)
    else:
        _, _, data, targets = loader.load_dataset(name)
    if name in {"PEMS-SF", "Libras"}:
        # These datasets ship numeric labels starting at 1 -> shift to 0-based.
        targets = targets.astype(float).astype(int) - 1
    else:
        # Map arbitrary (possibly string) labels onto 0..n_classes-1.
        projector = {k: i for i, k in enumerate(np.unique(targets))}
        targets = np.array(list(map(lambda n: projector[n], targets)))
    # tslearn returns (N, T, C); transpose to channels-first (N, C, T).
    data = torch.tensor(data.transpose(0, 2, 1).astype(np.float32))
    targets = torch.tensor(targets.astype(int))
    super(TSULUEADataset, self).__init__(data, targets)
    self.n_features = data.shape[1]
    # BUG FIX: n_targets was assigned twice in a row; the first assignment
    # (len(np.unique(targets))) was dead code and has been removed.
    self.n_targets = len(np.unique(self.target))
    if projector is not None:
        # Keep the inverse mapping: class index -> original label.
        self.projector = {k: v for v, k in projector.items()}
    else:
        self.projector = projector
def test_openDataset():
    """Smoke-test: load GunPoint, report shapes/labels, plot two series."""
    from tslearn.datasets import UCR_UEA_datasets
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
        "GunPoint")
    print("Train shape", X_train.shape)
    print("Test shape", X_test.shape)
    print(set(y_train))
    loader = getDataLoader(data=X_train, label=y_train)
    print("nb data", len(loader.dataset))
    # Plot two sample series, scaled by the global training maximum.
    xs = range(X_train.shape[1])
    peak = X_train.max()
    for idx in (0, 25):
        plt.plot(xs, X_train[idx, :] / peak)
    plt.show()
def get_italypower_dataset(filename):
    """Load ItalyPowerDemand and return train/test arrays plus windowing
    parameters.  ``filename`` is unused (kept for API symmetry)."""
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
        'ItalyPowerDemand')
    # Squeeze the trailing channel axis: (N, T, 1) -> (N, T).
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1])
    n_timestamps = X_train.shape[1]
    window_sizes = [2, 4, 6]
    window_steps = [1, 1, 1]
    return (X_train, X_test, y_train, y_test, n_timestamps, window_sizes,
            window_steps)
def get_electricdevices_dataset(filename):
    """Load ElectricDevices and return train/test arrays plus windowing
    parameters.  ``filename`` is unused (kept for API symmetry)."""
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
        'ElectricDevices')
    # Squeeze the trailing channel axis: (N, T, 1) -> (N, T).
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1])
    n_timestamps = X_train.shape[1]
    window_sizes = [4, 8, 16]
    window_steps = [1, 2, 4]
    return (X_train, X_test, y_train, y_test, n_timestamps, window_sizes,
            window_steps)
def get_phalanges_dataset(filename):
    """Load PhalangesOutlinesCorrect and return train/test arrays plus
    windowing parameters.  ``filename`` is unused (kept for API symmetry)."""
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
        'PhalangesOutlinesCorrect')
    # Squeeze the trailing channel axis: (N, T, 1) -> (N, T).
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1])
    n_timestamps = X_train.shape[1]
    window_sizes = [4, 8, 16]
    window_steps = [1, 2, 4]
    return (X_train, X_test, y_train, y_test, n_timestamps, window_sizes,
            window_steps)
def get_gunpoint_dataset(filename):
    """Load GunPoint and return train/test arrays plus windowing parameters.
    ``filename`` is unused (kept for API symmetry)."""
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
        'GunPoint')
    # Squeeze the trailing channel axis: (N, T, 1) -> (N, T).
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1])
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1])
    n_timestamps = X_train.shape[1]
    window_sizes = [4, 8, 16]
    window_steps = [1, 2, 4]
    return (X_train, X_test, y_train, y_test, n_timestamps, window_sizes,
            window_steps)
def fetch_dataset(
    name: str,
) -> Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
    """Download a UCR/UEA dataset and return ((x_train, y_train),
    (x_test, y_test)) with float32 inputs and 0-based int64 labels."""
    x_train, y_train, x_test, y_test = UCR_UEA_datasets().load_dataset(name)

    def projector(labels):
        # Normalise raw archive labels to the contiguous range 0..n_classes-1.
        if name in {"PEMS-SF", "Libras"}:
            return labels.astype(float).astype(int) - 1
        if name in {"UWaveGestureLibraryAll"}:
            return labels - 1
        lookup = {label: idx for idx, label in enumerate(np.unique(labels))}
        return np.array([lookup[label] for label in labels])

    x_train = x_train.astype(np.float32)
    x_test = x_test.astype(np.float32)
    y_train = projector(y_train).astype(np.int64)
    y_test = projector(y_test).astype(np.int64)
    return (x_train, y_train), (x_test, y_test)
def __init__(self, train=True, name: str = "RacketSports"):
    """Load the requested split of a UCR/UEA dataset into numpy arrays."""
    loader = UCR_UEA_datasets()
    projector = None
    splits = loader.load_dataset(name)
    if train:
        data, targets = splits[0], splits[1]
    else:
        data, targets = splits[2], splits[3]
    if name in {"PEMS-SF", "Libras"}:
        # Numeric labels starting at 1 -> shift to 0-based.
        targets = targets.astype(float).astype(int) - 1
    elif name in {"UWaveGestureLibraryAll"}:
        targets = targets - 1
    else:
        # Arbitrary labels -> contiguous 0-based class indices.
        projector = {k: i for i, k in enumerate(np.unique(targets))}
        targets = np.array([projector[t] for t in targets])
    # Channels-first layout: (N, T, C) -> (N, C, T).
    self.data = data.transpose(0, 2, 1).astype(np.float32)
    self.targets = targets.astype(int)
    self.n_features = self.data.shape[1]
    self.n_targets = len(np.unique(self.targets))
    # Keep the inverse mapping (class index -> original label), if any.
    if projector is not None:
        self.projector = {idx: label for label, idx in projector.items()}
    else:
        self.projector = projector
def executeDecisionTreeStandard(datasetName):
    """Train/evaluate a plain sklearn decision tree on the raw series of a
    UCR/UEA dataset and append the scores to the comparison CSV.

    Parameters
    ----------
    datasetName : str
        Name of the UCR/UEA dataset to load.
    """
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
        datasetName)

    # pre processing phase: flatten each split into plain feature columns
    dfTrain = computeLoadedDataset(X_train, y_train)
    del dfTrain["TsIndex"]
    del dfTrain["target"]
    print(dfTrain)
    dfTest = computeLoadedDataset(X_test, y_test)
    y_test = y_test.astype('int')
    del dfTest["target"]
    del dfTest["TsIndex"]
    print(dfTest)

    # test phase
    clf = DecisionTreeClassifier(criterion='entropy',
                                 max_depth=3,
                                 min_samples_leaf=20)
    start_timeTrain = time.time()
    clf.fit(dfTrain, y_train)
    TrainTime = time.time() - start_timeTrain  # Training phase time
    start_timeTest = time.time()
    y_predTest = clf.predict(dfTest)
    TestTime = time.time() - start_timeTest

    print(classification_report(y_test, y_predTest))
    print('Accuracy %s' % accuracy_score(y_test, y_predTest))
    print('F1-score %s' % f1_score(y_test, y_predTest, average=None))
    confusion_matrix(y_test, y_predTest)
    row = [
        'Decision tree classifier', datasetName,
        round(accuracy_score(y_test, y_predTest), 2),
        round(TrainTime, 2),
        round(TestTime, 2)
    ]
    # BUG FIX: the output filename had a duplicated extension
    # ('Algorithms_Experiments_29-12.csv.csv'); the sibling experiment
    # writers all use a single '.csv'.
    WriteCsvComparison('Algorithms_Experiments_29-12.csv', row)
def plotTs(datasetName):
    """Scatter the first two attributes of every training series (coloured
    by class), then print and plot each series individually.

    INPUT: Dataset
    OUTPUT: Plot of all the Ts in the training dataset
    """
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
        datasetName)
    dfTrain = computeLoadedDataset(X_train, y_train)
    encoder = LabelEncoder()
    class_codes = encoder.fit_transform(dfTrain['target'])
    # 2-D view of the dataset: att0 vs att1, one colour per class.
    plt.scatter(dfTrain['att0'], dfTrain['att1'], c=class_codes)
    plt.show()
    for idx in range(len(dfTrain)):
        series = dfTrain.iloc[idx]
        Ts = np.array(series.values)  # kept for parity with the original code
        print('TS ID:' + str(idx))
        print('TS CLASS:' + str(series['target']))
        plotData(series)
def executeMAPIC(useValidationSet, usePercentageTrainingSet, datasetName, nameFile):  #,initialWS,candidate):
    """Run one end-to-end MAPIC/TSCMP experiment on a UCR/UEA dataset.

    Parameters
    ----------
    useValidationSet : bool
        If true, carve a validation split out of the training set and
        evaluate on it instead of the archive test split.
    usePercentageTrainingSet : bool
        If true, train on a random fraction (PercentageTrainingSet) of the
        training split.
    datasetName : str
        Name of the UCR/UEA dataset to load.
    nameFile : unused in this body; results are written to
        'parametri_mapic.csv'.
    """
    #INPUT: Parameters for TSCMP algorithm
    #Execution of a TSCMP test over the dataset: datasetName
    # Stage toggles for the pipeline below.
    first = True  # Generation & Computation of the training dataset
    second = True  # Fitting of the Decision Tree
    third = True  # Generation & Computation of the testing dataset
    quarter = True  # Predict and show scores
    fifth = True  # Plot some/all classified instances
    sixth = True  # Plot of the chosen shapelet
    PercentageTrainingSet = 1  # variable percentage of the training set
    PercentageValidationSet = 0.3  # percentage of the training set chosen as validation set
    writeOnCsv = True
    le = LabelEncoder()
    # candidatesGroup=1,maxDepth=3,minSamplesLeaf=20,removeUsedCandidate=1,window_size=20,k=2,useClustering=True,n_clusters=20,warningDetected=False,verbose=0
    tree = Tree(candidatesGroup=1,
                maxDepth=3,
                minSamplesLeaf=20,
                removeUsedCandidate=1,
                window_size=20,
                k=2,
                useClustering=True,
                n_clusters=20,
                warningDetected=False,
                verbose=0)
    if (first == True):
        verbose = False
        X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
            datasetName)
        if (verbose):
            print('Initial Train set shape : ' + str(X_train.shape) + '\n')
            print('Initial Test set shape : ' + str(X_test.shape) + '\n')
        if (useValidationSet):  #extract the validation set from the training set
            dimValidationSet = int(len(X_train) * PercentageValidationSet)
            selectedRecordsForValidation = sample_without_replacement(
                len(X_train), dimValidationSet)
            # get useful dataset
            dfTrain = computeLoadedDataset(X_train, y_train)
            patternLenght = len(dfTrain.iloc[0]) - 1
            # extract the validation set and remove those records from the training set
            dfVal = dfTrain.iloc[selectedRecordsForValidation]
            dfTrain = dfTrain.drop(index=selectedRecordsForValidation)
            if (verbose):
                print('Patter Lenght: ' + str(patternLenght) + '\n')
                print('Final Train set shape : ' + str(dfTrain.shape))
                print('Final Validation set shape : ' + str(dfVal.shape) + '\n')
            num_classes = le.fit_transform(dfVal['target'])
            if (verbose):
                print('Final class distribution in Validation set : ')
                print(np.unique(num_classes, return_counts=True))
                print('\n')
            num_classes = le.fit_transform(dfTrain['target'])
            if (verbose):
                print('Final class distribution in Training set : ')
                print(np.unique(num_classes, return_counts=True))
                print('\n')
                print('dfTrain: \n' + str(dfTrain))
                print(dfTrain.isnull().sum().sum())
                print(dfTrain.isnull().values.any())
                print('dfVal: \n' + str(dfVal))
        if (usePercentageTrainingSet):  #extract only a percentage from the training set
            dimSubTrainSet = int(
                len(X_train) * PercentageTrainingSet)  # dim of new SubSet of X_train
            selectedRecords = sample_without_replacement(
                len(X_train), dimSubTrainSet)  # random records selected
            if (verbose):
                print('selectedRecords: ' + str(selectedRecords))
            # build the training dataframe together with its labels
            dfTrain = computeLoadedDataset(X_train, y_train)
            patternLenght = len(dfTrain.iloc[0]) - 1
            dfTrain = dfTrain.iloc[selectedRecords].copy()
            if (verbose):
                print('Final Train set shape : ' + str(dfTrain.shape) + '\n')
            num_classes = le.fit_transform(dfTrain['target'])
            if (verbose):
                print('Final class distribution in Training set : ')
                print(np.unique(num_classes, return_counts=True))
                print('\n')
        print('PATT LENGHT: ' + str(patternLenght))
        # generate the TSCMP dataset from the original training dataset
        start_timePreprocessingTrain = time.time()
        tree.dfTrain = dfTrain
        OriginalCandidatesListTrain, numberOfMotifTrain, numberOfDiscordTrain = getDataStructures(
            tree, dfTrain, tree.window_size, tree.k, verbose)
        #select only the type of candidates chosen by the user
        if (tree.candidatesGroup == 0):
            OriginalCandidatesListTrain = OriginalCandidatesListTrain[
                OriginalCandidatesListTrain['M/D'] == 0]
        if (tree.candidatesGroup == 1):
            OriginalCandidatesListTrain = OriginalCandidatesListTrain[
                OriginalCandidatesListTrain['M/D'] == 1]
        # NOTE(review): reset_index(drop=True) returns a new frame and the
        # result is discarded here -- confirm whether it was meant to be
        # assigned back.
        OriginalCandidatesListTrain.reset_index(drop=True)
        #add structures to tree
        tree.OriginalCandidatesUsedListTrain = buildCandidatesUsedList(
            OriginalCandidatesListTrain)
        tree.OriginalCandidatesListTrain = OriginalCandidatesListTrain
        if (verbose):
            print('OriginalCandidatesUsedListTrain: \n')
            print(tree.OriginalCandidatesUsedListTrain)
            print('OriginalCandidatesListTrain: \n')
            print(tree.OriginalCandidatesListTrain)
        #OriginalCandidatesListTrain remains the same during all the execution
        #prepare data structure for the execution by applying K-Medoids initially
        if (tree.useClustering):
            CandidatesListTrain = reduceNumberCandidates(
                tree, OriginalCandidatesListTrain, returnOnlyIndex=False)
            if (verbose):
                print('candidati rimasti/ più significativi-distintivi ')
        else:
            CandidatesListTrain = tree.OriginalCandidatesListTrain
        if (verbose):
            print(CandidatesListTrain)
        TsIndexList = dfTrain[
            'TsIndex'].values  #initially (first iteration) consider all the Ts
        # compute the euclidean dist btw each Ts and each chosen candidate
        dfForDTree = computeSubSeqDistance(tree, TsIndexList,
                                           CandidatesListTrain,
                                           tree.window_size)
        PreprocessingTrainTime = time.time() - start_timePreprocessingTrain
        if (verbose == True):
            print('dfTrain: \n' + str(dfTrain))
            print('dfForDTree: \n' + str(dfForDTree))
    if (second == True):
        verbose = False
        start_timeTrain = time.time()
        #fit the Decision Tree
        tree.fit(dfForDTree, verbose=False)
        TrainTime = time.time() - start_timeTrain  #take the training phase time
        print(tree.Root)
        if (verbose == True):
            print(tree.attributeList)
            tree.printAll(tree.Root)
        # Average the clustering diagnostics collected during fitting.
        if (len(tree.SseList) > 0):
            avgSSE = sum(tree.SseList) / len(tree.SseList)
        else:
            avgSSE = 0
        if (len(tree.IterationList) > 0):
            avgIteration = sum(tree.IterationList) / len(tree.IterationList)
        else:
            avgIteration = 0
    if (third == True):
        verbose = False
        #Generate the test dataset
        if (useValidationSet):
            dfTest = dfVal
        else:
            dfTest = computeLoadedDataset(X_test, y_test)
        if (tree.verbose):
            print('DF TEST')
            print(dfTest)
        start_timePreprocessingTest = time.time()
        tree.attributeList = sorted(
            tree.attributeList
        )  # sort the attributes so 'computeSubSeqDistanceForTest' is more efficient
        tree.attributeList = np.unique(tree.attributeList)
        CandidatesListMatched = tree.OriginalCandidatesListTrain[
            'IdCandidate'].isin(
                tree.attributeList
            )  # set to true the candidate index chosen by the Decision Tree
        tree.dTreeAttributes = tree.OriginalCandidatesListTrain[
            CandidatesListMatched]  # extract the candidates chosen by the Decision Tree
        if (tree.verbose):
            print('Attributi selezionati dal Decision Tree')
            print(tree.dTreeAttributes)
        dfForDTreeTest = computeSubSeqDistanceForTest(tree, dfTest,
                                                      tree.dTreeAttributes)
        PreprocessingTestTime = time.time() - start_timePreprocessingTest
        if (tree.verbose == True):
            print(dfForDTreeTest)
    if (quarter == True):
        # Make prediction and show the results
        verbose = False
        #generate the dataset for plotting the classified instances
        tree.TsTestForPrint = list()
        temp = list()
        for i in range(len(dfTest)):
            temp = dfTest.iloc[
                i].values  # the series currently being classified
            temp = temp[:len(temp) - 2]  # strip the trailing non-feature columns
            tree.TsTestForPrint.append(temp)
        temp = None
        start_timeTest = time.time()
        yTest, yPredicted = tree.predict(dfForDTreeTest, tree.Root, fifth)
        TestTime = time.time() - start_timeTest
        if (tree.verbose == True):
            for a, b in zip(yTest, yPredicted):
                print(a, b)
        cR = classification_report(yTest, yPredicted)
        aS = accuracy_score(yTest, yPredicted)
        f1 = f1_score(yTest, yPredicted, average=None)
        confusion_matrix(yTest, yPredicted)
        if (tree.candidatesGroup == 0):
            group = 'Motifs'
        elif (tree.candidatesGroup == 1):
            group = 'Discords'
        else:
            group = 'Both'
        # NOTE(review): 'percentage' stays unbound when both useValidationSet
        # and usePercentageTrainingSet are false -- the row below would raise.
        if (useValidationSet):
            percentage = PercentageValidationSet
        elif (usePercentageTrainingSet):
            percentage = PercentageTrainingSet
        row = [
            datasetName, group, tree.maxDepth, tree.minSamplesLeaf,
            tree.window_size, tree.removeUsedCandidate, tree.k,
            useValidationSet, percentage, tree.useClustering, tree.n_clusters,
            round(aS, 2),
            round(PreprocessingTrainTime, 2),
            round(TrainTime, 2),
            round(PreprocessingTestTime, 2),
            round(TestTime, 2)
        ]
        #row = ['MAPIC', datasetName, round(aS,2), round(PreprocessingTrainTime,2),round(TrainTime,2),round(PreprocessingTestTime,2),round(TestTime,2),round(avgSSE),round(avgIteration)]
        print('Classification Report \n%s ' % cR)
        print('Accuracy %s' % aS)
        print('F1-score %s' % f1)
        # (the commented 'row' above is the layout used for the algorithm-comparison CSV)
        if (writeOnCsv):
            WriteCsvMAPIC('parametri_mapic.csv', row)
    if (sixth == True):
        #extract and plot the shapelet chosen by the Decision Tree
        for i in range(len(tree.dTreeAttributes)):
            idTs = tree.dTreeAttributes.iloc[i]['IdTs']
            idCandidate = tree.dTreeAttributes.iloc[i]['IdCandidate']
            sp = tree.dTreeAttributes.iloc[i]['startingPosition']
            md = tree.dTreeAttributes.iloc[i]['M/D']
            ts = np.array(tree.dfTrain[tree.dfTrain['TsIndex'] == idTs].values)
            ts = ts[0]
            ts = ts[:len(ts) - 2]  # strip the trailing non-feature columns
            tupla = retrieve_all(tree, ts, tree.window_size, tree.k)
            mp, mot, motif_dist, dis = tupla
            if (verbose == True):
                print('IdTs: %d' % idTs)
                print('IDCandidate: %d' % idCandidate)
                print('starting position: %d ' % sp)
                print('M/D: %d ' % md)
            plot_all(ts, mp, mot, motif_dist, dis, sp, md, tree.window_size,
                     idCandidate)
""" Inspired from a gallery example in tslearn: <https://tslearn.readthedocs.io/en/latest/auto_examples/plot_shapelet_locations.html> """ import numpy import matplotlib.pyplot as plt from keras.optimizers import Adagrad from tslearn.preprocessing import TimeSeriesScalerMinMax from tslearn.datasets import UCR_UEA_datasets from tslearn.shapelets import ShapeletModel seed = 0 numpy.random.seed(seed) ds_name = "EarthQuakes" X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(ds_name) X_train = TimeSeriesScalerMinMax().fit_transform(X_train) X_test = TimeSeriesScalerMinMax().fit_transform(X_test) n_ts, ts_sz = X_train.shape[:2] n_classes = len(set(y_train)) n_shapelets = 5 sz_shapelets = int(0.1 * ts_sz) shapelet_sizes = {sz_shapelets: n_shapelets} test_ts_id = 0 yrange = [-.5, 1.5] # LS figure shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes, optimizer=Adagrad(lr=.1), weight_regularizer=.01, max_iter=50, verbose_level=0,
#method1 = 'genetic' #method2 = 'learned' #method1_name = 'GENDIS' #method2_name = 'LTS' # Comment this out if you want to process results of dependent vs independent DIR = 'results/dependent_vs_independent/' method1 = 'tree' method2 = 'transform' method1_name = 'dependent' method2_name = 'independent' accuracies1 = defaultdict(list) accuracies2 = defaultdict(list) data_loader = UCR_UEA_datasets() # Iterate over files in directory, process the predictions (_proba) and save # them in a dict. Afterwards, print a table with aggregated results & create # scatter plot (stat test if |values| > 1) datasets = set([x.split('_')[0] for x in os.listdir(DIR) if x != '.keep']) for dataset in datasets: glob_path = DIR + '{}_{}*lr_proba.csv' method1_files = glob.glob(glob_path.format(dataset, method1)) method2_files = glob.glob(glob_path.format(dataset, method2)) # First, we load the ground truth, needed to calculate accuracy _, _, _, ground_truth = data_loader.load_dataset(dataset)
# from enchanter.callbacks import TensorBoardLogger as Experiment import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import DataLoader from sklearn.svm import SVC from tslearn.datasets import UCR_UEA_datasets from enchanter.addons import layers as L from enchanter.callbacks import EarlyStoppingForTSUS from enchanter.tasks import TimeSeriesUnsupervisedRunner from enchanter.engine.modules import fix_seed from enchanter.utils.datasets import TimeSeriesLabeledDataset fix_seed(800) downloader = UCR_UEA_datasets() x_train, y_train, x_test, y_test = downloader.load_dataset("Libras") x_train = torch.tensor(x_train.transpose(0, 2, 1), dtype=torch.float32) x_test = torch.tensor(x_test.transpose(0, 2, 1), dtype=torch.float32) y_train = y_train.astype(float).astype(int) - 1 y_train = torch.tensor(y_train, dtype=torch.long) y_test = y_test.astype(float).astype(int) - 1 y_test = torch.tensor(y_test, dtype=torch.long) class Encoder(nn.Module): def __init__(self, in_features, mid_features, out_features, representation_size): super(Encoder, self).__init__() self.conv = nn.Sequential(
""" Implementation of the Deep Temporal Clustering model Dataset loading functions """ import numpy as np from tslearn.datasets import UCR_UEA_datasets from tslearn.preprocessing import TimeSeriesScalerMeanVariance from sklearn.preprocessing import LabelEncoder ucr = UCR_UEA_datasets() # try to use UCR/UEA univariate and multivariate datasets. # requires forked version of tslearn from https://github.com/yichangwang/tslearn try: all_ucr_datasets = ucr.list_datasets() + ucr._multivariate_dataset except AttributeError: all_ucr_datasets = ucr.list_datasets() def load_ucr(dataset='CBF'): X_train, y_train, X_test, y_test = ucr.load_dataset(dataset) X = np.concatenate((X_train, X_test)) y = np.concatenate((y_train, y_test)) if dataset == 'HandMovementDirection': # this one has special labels y = [yy[0] for yy in y] y = LabelEncoder().fit_transform( y) # sometimes labels are strings or start from 1 assert (y.min() == 0) # assert labels are integers and start from 0 # preprocess data (standardization) X_scaled = TimeSeriesScalerMeanVariance().fit_transform(X)
""" Implementation of the Deep Temporal Clustering model Dataset loading functions @author Florent Forest (FlorentF9) """ import numpy as np from tslearn.datasets import UCR_UEA_datasets from tslearn.preprocessing import TimeSeriesScalerMeanVariance from sklearn.preprocessing import LabelEncoder ucr = UCR_UEA_datasets() # UCR/UEA univariate and multivariate datasets. all_ucr_datasets = ucr.list_datasets() def load_ucr(dataset='CBF'): X_train, y_train, X_test, y_test = ucr.load_dataset(dataset) X = np.concatenate((X_train, X_test)) y = np.concatenate((y_train, y_test)) if dataset == 'HandMovementDirection': # this one has special labels y = [yy[0] for yy in y] y = LabelEncoder().fit_transform( y) # sometimes labels are strings or start from 1 assert (y.min() == 0) # assert labels are integers and start from 0 # preprocess data (standardization) X_scaled = TimeSeriesScalerMeanVariance().fit_transform(X) return X_scaled, y
ofp.write(str(np.reshape(shap, (-1))) + '\n') with open(timing_out_path, 'w+') as ofp: ofp.write(str(genetic_time)) X_distances_train = genetic_extractor.transform(X_train) X_distances_test = genetic_extractor.transform(X_test) #fit_lr(X_distances_train, y_train, X_distances_test, y_test, pred_out_path) #fit_rf(X_distances_train, y_train, X_distances_test, y_test, pred_out_path) #fit_svm(X_distances_train, y_train, X_distances_test, y_test, pred_out_path) fit_voting(X_distances_train, y_train, X_distances_test, y_test, pred_out_path) data_loader = UCR_UEA_datasets() datasets = ['Beef', 'OSULeaf', 'ScreenType', 'Adiac', 'Fish', 'Car', 'Ham'] done = [ 'Beef', 'OSULeaf', 'ScreenType', 'Adiac', 'Fish', 'Car', 'Ham', 'Worms', 'RefrigerationDevices', 'ChlorineConcentration', 'CricketZ', 'Wine', 'CricketY', 'ArrowHead', 'BirdChicken', 'SmallKitchenAppliances', 'Haptics', 'ShapesAll', 'ElectricDevices', 'FordA', 'Herring', 'SwedishLeaf', 'CricketX', 'SonyAIBORobotSurface2', 'InsectWingbeatSound', 'WormsTwoClass', 'Computers', 'TwoLeadECG', 'ToeSegmentation1', 'GunPoint', 'OliveOil', 'LargeKitchenAppliances', 'UWaveGestureLibraryY', 'MoteStrain', 'FaceAll', 'ProximalPhalanxOutlineCorrect', 'FordB', 'Coffee', 'ToeSegmentation2', 'Strawberry', 'Plane', 'DistalPhalanxOutlineCorrect', 'FacesUCR', 'MiddlePhalanxOutlineCorrect', 'BeetleFly', 'UWaveGestureLibraryX', 'Wafer', 'UWaveGestureLibraryZ', 'ECG5000', 'CBF',
from tslearn.datasets import UCR_UEA_datasets

# Report how many datasets the UCR/UEA archive catalogue exposes.
dataset_names = UCR_UEA_datasets().list_datasets()
print(len(dataset_names))
def transform_dataset(time_series, shapelets): n_ts, sz, d = time_series.shape n_shapelets = len(shapelets) X = numpy.zeros((n_ts, 2 * n_shapelets)) for i in range(n_ts): X[i] = extract_shapelet_match(time_series[i], shapelets) return X shapelet_size_ratio = 0.1 output_folder = "/Volumes/DATA/features/" do_not_process = True for dataset_name in UCR_UEA_datasets(use_cache=False).list_datasets(): if dataset_name == "NonInvasiveFetalECGThorax2": do_not_process = False if do_not_process: continue X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset( dataset_name) if X_train is None or X_test is None: print("Skipping dataset %s: invalid files" % dataset_name) continue n_ts, sz, d = X_train.shape shapelets = load_timeseries_txt("shapelets/%s_%s.txt" % (dataset_name, str(shapelet_size_ratio))) numpy.savetxt( "%s/%s_%s_TRAIN.txt" % (output_folder, dataset_name, str(shapelet_size_ratio)),
def executeKNN(datasetName):
    """Train/evaluate a K-nearest-neighbour classifier on the raw series of
    a UCR/UEA dataset and append the scores to the comparison CSV.

    Parameters
    ----------
    datasetName : str
        Name of the UCR/UEA dataset to load.
    """
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
        datasetName)

    # Choose the feature scaler (1 -> MinMax, otherwise Standard).
    scalerMM = MinMaxScaler()
    scalerS = StandardScaler()
    scalerUsed = 1
    if (scalerUsed == 1):
        scaler = scalerMM
    else:
        scaler = scalerS
    K = 3  # number of neighbours

    # pre processing phase Training set
    dfTrain = computeLoadedDataset(X_train, y_train)
    del dfTrain["TsIndex"]
    del dfTrain["target"]
    print(dfTrain)

    # pre processing phase Test set
    dfTest = computeLoadedDataset(X_test, y_test)
    y_test = y_test.astype('int')
    del dfTest["target"]
    del dfTest["TsIndex"]
    print(dfTest)

    # test phase
    knn = KNeighborsClassifier(n_neighbors=K)
    start_timePreprocessingTrain = time.time()
    dfTrain[dfTrain.columns] = scaler.fit_transform(dfTrain)
    PreProcessingTrainTime = time.time(
    ) - start_timePreprocessingTrain  # Training phase time
    start_timeTrain = time.time()
    knn.fit(dfTrain, y_train)
    TrainTime = time.time() - start_timeTrain  # Training phase time

    # prediction on the test set
    start_timeTest = time.time()
    # BUG FIX: use transform() here, not fit_transform() -- re-fitting the
    # scaler on the test split leaks test statistics into the evaluation.
    dfTest[dfTest.columns] = scaler.transform(dfTest)
    test_pred_knn = knn.predict(dfTest)
    # BUG FIX: the elapsed test time was measured against start_timeTrain.
    TestTime = time.time() - start_timeTest

    print(classification_report(y_test, test_pred_knn))
    print('Accuracy %s' % accuracy_score(y_test, test_pred_knn))
    print('F1-score %s' % f1_score(y_test, test_pred_knn, average=None))
    confusion_matrix(y_test, test_pred_knn)
    row = [
        'KNN', datasetName,
        round(accuracy_score(y_test, test_pred_knn), 2),
        round(PreProcessingTrainTime, 2),
        round(TrainTime, 2),
        round(TestTime, 2)
    ]
    WriteCsvComparison('KNN_Experiments_04-01.csv', row)
def executeClassicDtree(datasetName):
    """Baseline: fit a plain sklearn DecisionTree on the distance features
    produced by the TSCMP preprocessing, then evaluate and log the scores.

    Parameters
    ----------
    datasetName : str
        Name of the UCR/UEA dataset to load.
    """
    # INPUT: Dataset
    # Execution of the DecisionTreeClassifier algorithm over the dataset: datasetName
    # NB: IN ORDER TO MAKE A VALID COMPARISON WITH TSCMP, THESE VALUES OF THE PARAMETERS MUST BE THE SAME OF THE VALUE CHOSEN IN TSCMP
    tree = Tree(candidatesGroup=1,
                maxDepth=3,
                minSamplesLeaf=20,
                removeUsedCandidate=1,
                window_size=20,
                k=2,
                useClustering=True,
                n_clusters=20,
                warningDetected=False,
                verbose=0)
    verbose = False
    #SAME INITIALIZATION AND DATA STRUCTURE GENERATION OF TSCMP
    le = LabelEncoder()
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
        datasetName)
    dfTrain = computeLoadedDataset(X_train, y_train)
    # start of train preprocessing
    start_timePreprocessingTrain = time.time()
    tree.dfTrain = dfTrain
    OriginalCandidatesListTrain, numberOfMotifTrain, numberOfDiscordTrain = getDataStructures(
        tree, dfTrain, tree.window_size, tree.k, verbose=False)
    # keep only the selected candidate group (0 = motifs, 1 = discords)
    if (tree.candidatesGroup == 0):
        OriginalCandidatesListTrain = OriginalCandidatesListTrain[
            OriginalCandidatesListTrain['M/D'] == 0]
    if (tree.candidatesGroup == 1):
        OriginalCandidatesListTrain = OriginalCandidatesListTrain[
            OriginalCandidatesListTrain['M/D'] == 1]
    # NOTE(review): reset_index(drop=True) returns a new frame; the result is
    # discarded here -- confirm whether it should be assigned back.
    OriginalCandidatesListTrain.reset_index(drop=True)
    tree.OriginalCandidatesUsedListTrain = buildCandidatesUsedList(
        OriginalCandidatesListTrain)
    tree.OriginalCandidatesListTrain = OriginalCandidatesListTrain
    if (verbose):
        print('OriginalCandidatesUsedListTrain: \n')
        print(tree.OriginalCandidatesUsedListTrain)
        print('OriginalCandidatesListTrain: \n')
        print(tree.OriginalCandidatesListTrain)
    if (tree.useClustering):
        CandidatesListTrain = reduceNumberCandidates(
            tree, OriginalCandidatesListTrain, returnOnlyIndex=False)
        if (verbose):
            print('candidati rimasti/ più significativi-distintivi ')
    else:
        CandidatesListTrain = tree.OriginalCandidatesListTrain
    if (verbose):
        print(CandidatesListTrain)
    TsIndexList = dfTrain['TsIndex'].values
    dfForDTree = computeSubSeqDistance(tree, TsIndexList, CandidatesListTrain,
                                       tree.window_size)
    # end of train preprocessing
    PreprocessingTrainTime = time.time() - start_timePreprocessingTrain
    if (verbose == True):
        print('dfTrain: \n' + str(dfTrain))
        print('dfForDTree: \n' + str(dfForDTree))
    # extract the label column, then drop label and TsIndex (not needed here)
    y_train = dfForDTree['class']
    del dfForDTree["class"]
    del dfForDTree["TsIndex"]
    y_train = y_train.astype('int')
    #print(dfForDTree)
    # NB: IN ORDER TO MAKE A VALID COMPARISON WITH TSCMP, THESE VALUES OF THE PARAMETERS MUST BE THE SAME OF THE VALUE CHOSEN IN TSCMP
    # start of training
    start_timeTrain = time.time()
    clf = DecisionTreeClassifier(
        criterion='entropy', max_depth=3, min_samples_leaf=20
    )  # fixing random_state would always give the same value and remove randomness from the split
    clf.fit(dfForDTree, y_train)
    # end of training
    TrainTime = time.time() - start_timeTrain
    # same procedure as above to generate dfForDTreeTest
    dfTest = computeLoadedDataset(X_test, y_test)
    # start of test preprocessing
    start_timePreprocessingTest = time.time()
    columns = dfForDTree.columns.values
    tree.attributeList = columns
    CandidatesListMatched = tree.OriginalCandidatesListTrain['IdCandidate'].isin(
        tree.attributeList
    )  # flags which entries of OriginalCandidatesListTrain are used as tree attributes
    tree.dTreeAttributes = tree.OriginalCandidatesListTrain[
        CandidatesListMatched]
    dfForDTreeTest = computeSubSeqDistanceForTest(tree, dfTest,
                                                  tree.dTreeAttributes)
    # end of test preprocessing
    PreprocessingTestTime = time.time() - start_timePreprocessingTest
    y_test = dfForDTreeTest["class"].values
    y_test = y_test.astype('int')
    del dfForDTreeTest["class"]
    print(dfForDTreeTest)
    # start of test
    start_timeTest = time.time()
    y_predTest = clf.predict(dfForDTreeTest)
    # end of test
    TestTime = time.time() - start_timeTest
    print(classification_report(y_test, y_predTest))
    print('Accuracy %s' % accuracy_score(y_test, y_predTest))
    print('F1-score %s' % f1_score(y_test, y_predTest, average=None))
    confusion_matrix(y_test, y_predTest)
    row = [
        'Decision Tree with Shapelet', datasetName,
        round(accuracy_score(y_test, y_predTest), 2),
        round(PreprocessingTrainTime, 2),
        round(TrainTime, 2),
        round(PreprocessingTestTime, 2),
        round(TestTime, 2)
    ]
    WriteCsvShapeletAlgo('Shapelet_Algo_Experiments_29-12.csv', row)
def executeShapeletTransform(datasetName):
    """Run a ShapeletTransform + DecisionTree pipeline on a UCR/UEA dataset,
    timing each phase, and log the scores to the shapelet-algorithms CSV.

    Parameters
    ----------
    datasetName : str
        Name of the UCR/UEA dataset to load.
    """
    # INPUT: Dataset name
    # Execution of a ShapeletTransformation algorithm over the dataset: datasetName
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
        datasetName)
    #RE-SIZE BY FUN X TRAIN
    dfTrain = computeLoadedDataset(X_train, y_train)
    y_train = dfTrain['target'].values
    y_train = y_train.astype(int)
    del dfTrain['target']
    del dfTrain['TsIndex']
    # RE-SIZE BY FUN X TEST
    dfTest = computeLoadedDataset(X_test, y_test)
    y_test = dfTest['target'].values
    y_test = y_test.astype(int)
    del dfTest['target']
    del dfTest['TsIndex']
    # start of train preprocessing
    start_timePreprocessingTrain = time.time()
    #Shapelet transformation WITH RANDOM STATE
    #NB: IN ORDER TO MAKE A VALID COMPARISON WITH TSCMP, THE WINDOW SIZE VALUE MUST BE THE SAME OF THE VALUE CHOSEN IN TSCMP
    st = ShapeletTransform(window_sizes=[20], sort=True)
    X_new = st.fit_transform(dfTrain, y_train)
    # end of train preprocessing
    PreprocessingTrainTime = time.time() - start_timePreprocessingTrain
    from sklearn.tree import DecisionTreeClassifier
    clf = DecisionTreeClassifier(criterion='entropy',
                                 max_depth=3,
                                 min_samples_leaf=20)
    # start of training
    start_timeTrain = time.time()
    clf.fit(X_new, y_train)
    # end of training
    TrainTime = time.time() - start_timeTrain
    # start of test preprocessing
    start_timePreprocessingTest = time.time()
    X_test_new = st.transform(dfTest)
    # end of test preprocessing
    PreprocessingTestTime = time.time() - start_timePreprocessingTest
    # start of test
    start_timeTest = time.time()
    y_pred = clf.predict(X_test_new)
    # end of test
    TestTime = time.time() - start_timeTest
    print(accuracy_score(y_test, y_pred))
    row = [
        'ShapeletTransformation', datasetName,
        round(accuracy_score(y_test, y_pred), 2),
        round(PreprocessingTrainTime, 2),
        round(TrainTime, 2),
        round(PreprocessingTestTime, 2),
        round(TestTime, 2)
    ]
    WriteCsvShapeletAlgo('Shapelet_Algo_Experiments_29-12.csv', row)
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ @author: tung doan """ import numpy as np import matplotlib.pyplot as plt from tslearn.datasets import UCR_UEA_datasets from tmf import tmf """ load data """ data_loader = UCR_UEA_datasets() X_tr, y_tr, X_te, y_te = data_loader.load_dataset('Coffee') X = X_tr[:, ::2, 0] #reduce length a factor of 2 for fast demo y = y_tr # Ground truth indicator matrix grd = np.zeros((y.size, y.max() + 1)) grd[np.arange(y.size), y] = 1 """ run temporal matrix factorization """ k = y.max() + 1 l = X.shape[1] lambda_1 = lambda_2 = 1e-2 lambda_3 = 10 sigma = 0.05**2 eta = 1e-2 o_max = 15 i_max = 50 F_list, G_list = tmf(X, k, l, lambda_1, lambda_2, lambda_3, sigma, eta, o_max, i_max) """ plot """ plt.style.use(style='ggplot')
s = '|' s += '{:>20}|'.format(dataset) s += '{:>12}|'.format(sax_error) s += '{:>12}|'.format(time_sax) s += '{:>12}|'.format(eucl_error) s += '{:>12}|'.format(time_euclidean) print(s.strip()) print('-' * (len(columns) * 13 + 22)) # Set seed numpy.random.seed(0) # Defining dataset and the number of segments data_loader = UCR_UEA_datasets() datasets = [('SyntheticControl', 16), ('GunPoint', 64), ('FaceFour', 128), ('Lightning2', 256), ('Lightning7', 128), ('ECG200', 32), ('Plane', 64), ('Car', 256), ('Beef', 128), ('Coffee', 128), ('OliveOil', 256)] # We will compare the accuracies & execution times of 1-NN using: # (i) MINDIST on SAX representations, and # (ii) euclidean distance on raw values knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='sax') knn_eucl = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='euclidean') accuracies = {} times = {} for dataset, w in datasets: X_train, y_train, X_test, y_test = data_loader.load_dataset(dataset)
def executeLearningShapelet(datasetName):
    """Run the Learning Shapelets (Grabocka) benchmark pipeline on one dataset.

    Loads the named UCR/UEA dataset, reshapes both folds via
    computeLoadedDataset, learns shapelets with tslearn's LearningShapelets,
    fits a depth-3 entropy decision tree on the shapelet distances, prints
    the test accuracy and appends an accuracy/timing row to
    'Shapelet_Algo_Experiments_29-12.csv'.

    Parameters
    ----------
    datasetName : str
        Name of the UCR/UEA dataset to load.
    """
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(
        datasetName)

    # Re-shape the raw train fold into a dataframe and pull out the labels.
    dfTrain = computeLoadedDataset(X_train, y_train)
    y_train = dfTrain['target'].values.astype(int)

    # Count the distinct classes (the encoded ids are printed for logging,
    # exactly as before).
    le = LabelEncoder()
    distinct_classes = np.unique(le.fit_transform(dfTrain['target']),
                                 return_counts=False)
    num_classes = len(distinct_classes)
    print(distinct_classes)
    print(num_classes)

    del dfTrain['target']
    del dfTrain['TsIndex']

    # Same preparation for the test fold.
    dfTest = computeLoadedDataset(X_test, y_test)
    y_test = dfTest['target'].values.astype(int)
    del dfTest['target']
    del dfTest['TsIndex']

    # --- preprocessing (train): learn the shapelets -----------------------
    preprocessing_train_start = time.time()
    shapelet_sizes = grabocka_params_to_shapelet_size_dict(
        n_ts=len(dfTrain),
        ts_sz=len(dfTrain.iloc[0]),
        n_classes=num_classes,
        l=0.1,  # fixed parameters
        r=1)
    grabocka = LearningShapelets(n_shapelets_per_size=shapelet_sizes)
    grabocka.fit(dfTrain, y_train)
    X_train_distances = grabocka.transform(dfTrain)
    PreprocessingTrainTime = time.time() - preprocessing_train_start

    # --- training ---------------------------------------------------------
    train_start = time.time()
    dt = DecisionTreeClassifier(criterion='entropy',
                                max_depth=3,
                                min_samples_leaf=20)
    dt.fit(X_train_distances, y_train)
    TrainTime = time.time() - train_start

    # --- preprocessing (test): distances to the learned shapelets ---------
    preprocessing_test_start = time.time()
    X_test_distances = grabocka.transform(dfTest)
    PreprocessingTestTime = time.time() - preprocessing_test_start

    # --- prediction -------------------------------------------------------
    test_start = time.time()
    y_predict = dt.predict(X_test_distances)
    TestTime = time.time() - test_start

    print(accuracy_score(y_test, y_predict))

    row = [
        'LearningShapelets', datasetName,
        round(accuracy_score(y_test, y_predict), 2),
        round(PreprocessingTrainTime, 2),
        round(TrainTime, 2),
        round(PreprocessingTestTime, 2),
        round(TestTime, 2)
    ]
    WriteCsvShapeletAlgo('Shapelet_Algo_Experiments_29-12.csv', row)
def list_UCR_datasets():
    """Return the names of all datasets available in the tslearn UCR/UEA archive."""
    archive = UCR_UEA_datasets()
    return archive.list_datasets()
class UCRDataset(torch.utils.data.Dataset):
    """A torch wrapper around tslearn UCR_UEA_datasets datasets.

    Loads the named dataset, splits the official train fold into
    train/valid partitions via a random mask, and normalizes class ids so
    they start at zero. ``__getitem__`` optionally adds uniform noise for
    data augmentation.
    """

    def __init__(self,
                 name,
                 partition="train",
                 ratio=.75,
                 randomstate=None,
                 silent=True,
                 augment_data_noise=0):
        """
        Parameters
        ----------
        name : str
            Dataset name; must appear in ``UCR_UEA_datasets().list_datasets()``.
        partition : str
            One of "train", "valid", "trainvalid", "test".
        ratio : float
            Fraction of the official train fold assigned to "train".
        randomstate : int or None
            Seed for the train/valid split mask.
        silent : bool
            Suppress informational prints when True.
        augment_data_noise : float
            Scale of uniform noise added to each sample in ``__getitem__``.

        Raises
        ------
        ValueError
            If the dataset name is unknown or the partition is invalid.
        """
        r = np.random.RandomState(seed=randomstate)

        self.name = name
        self.dataset = UCR_UEA_datasets()
        self.augment_data_noise = augment_data_noise

        if name not in self.dataset.list_datasets():
            raise ValueError("Dataset not found: Please choose from " +
                             ", ".join(self.dataset.list_datasets()))

        X_trainvalid, y_trainvalid, X_test, y_test = self.dataset.load_dataset(
            name)

        self.nclasses = len(np.unique(np.append(y_test, y_trainvalid, axis=0)))
        self.ndims = 1  # UCR datasets have one featuredimension

        # Random train/valid split of the official train fold.
        train_mask = r.rand(len(X_trainvalid)) < ratio
        valid_mask = np.logical_not(train_mask)

        if partition == "train":
            self.X = X_trainvalid[train_mask]
            self.y = y_trainvalid[train_mask]
        elif partition == "valid":
            self.X = X_trainvalid[valid_mask]
            self.y = y_trainvalid[valid_mask]
        elif partition == "trainvalid":
            self.X = X_trainvalid
            self.y = y_trainvalid
        elif partition == "test":
            self.X = X_test
            self.y = y_test
        else:
            raise ValueError(
                "Invalid partition! please provide either 'train','valid', 'trainvalid', or 'test'"
            )

        # some binary datasets e.g. EGC200 or Lightning 2 have classes: -1, 1 -> clipping to 1:2
        if self.y.min() < 0:
            if not silent:
                print("Found class ids < 0 in dataset. clipping to zero!")
            self.y = np.clip(self.y, 0, None)

        # some datasets (e.g. Coffee) have classes with zero index while all other start with 1...
        if self.y.min() > 0:
            if not silent:
                print(
                    "Found class id starting from 1. reducing all class ids by one to start from zero"
                )
            self.y -= 1

        self.sequencelength = X_trainvalid.shape[1]

        if not silent:
            msg = "Loaded dataset {}-{} T={}, classes={}: {}/{} samples"
            print(
                msg.format(name, partition, self.sequencelength, self.nclasses,
                           len(self.X),
                           len(X_trainvalid) + len(X_test)))

    def __len__(self):
        # Number of samples in the selected partition.
        return self.X.shape[0]

    def __getitem__(self, idx):
        # BUG FIX: the original used in-place `X += noise`, which mutated
        # self.X through the numpy view and accumulated noise across epochs.
        # Build the augmented sample out-of-place instead.
        sample = self.X[idx]
        X = sample + np.random.rand(*sample.shape) * self.augment_data_noise
        X = torch.from_numpy(X).type(torch.FloatTensor)
        y = torch.from_numpy(np.array([self.y[idx]])).type(torch.LongTensor)
        # Repeat the label for every timestep so a per-step loss can be used.
        return X, y.expand(X.shape[0]), idx

    def __str__(self):
        # Local renamed from `str` to avoid shadowing the builtin.
        description = """
UCR Dataset = {dataset}
X.shape = {Xshape}
y.shape = {yshape}
nclasses = {nclasses}
ndims = {ndims}
""".format(dataset=self.name,
           Xshape=self.X.shape,
           yshape=self.y.shape,
           nclasses=self.nclasses,
           ndims=self.ndims)
        return description
def __init__(self,
             name,
             partition="train",
             ratio=.75,
             randomstate=None,
             silent=True,
             augment_data_noise=0):
    """Load the named UCR/UEA dataset and keep the requested partition.

    The official train fold is split into "train"/"valid" subsets with a
    random mask seeded by `randomstate`; "trainvalid" keeps the whole fold
    and "test" selects the official test fold. Class ids are shifted so
    they always start at zero. Raises ValueError on an unknown dataset
    name or partition.
    """
    rng = np.random.RandomState(seed=randomstate)

    self.name = name
    self.dataset = UCR_UEA_datasets()
    self.augment_data_noise = augment_data_noise

    known = self.dataset.list_datasets()
    if name not in known:
        raise ValueError("Dataset not found: Please choose from " +
                         ", ".join(known))

    X_trainvalid, y_trainvalid, X_test, y_test = self.dataset.load_dataset(
        name)

    self.nclasses = len(np.unique(np.append(y_test, y_trainvalid, axis=0)))
    self.ndims = 1  # UCR datasets have one featuredimension

    # Random train/valid split of the official train fold.
    in_train = rng.rand(len(X_trainvalid)) < ratio

    selection = {
        "train": (X_trainvalid[in_train], y_trainvalid[in_train]),
        "valid": (X_trainvalid[~in_train], y_trainvalid[~in_train]),
        "trainvalid": (X_trainvalid, y_trainvalid),
        "test": (X_test, y_test),
    }
    if partition not in selection:
        raise ValueError(
            "Invalid partition! please provide either 'train','valid', 'trainvalid', or 'test'"
        )
    self.X, self.y = selection[partition]

    # some binary datasets e.g. EGC200 or Lightning 2 have classes: -1, 1 -> clipping to 1:2
    if self.y.min() < 0:
        if not silent:
            print("Found class ids < 0 in dataset. clipping to zero!")
        self.y = np.clip(self.y, 0, None)

    # some datasets (e.g. Coffee) have classes with zero index while all other start with 1...
    if self.y.min() > 0:
        if not silent:
            print(
                "Found class id starting from 1. reducing all class ids by one to start from zero"
            )
        self.y -= 1

    self.sequencelength = X_trainvalid.shape[1]

    if not silent:
        msg = "Loaded dataset {}-{} T={}, classes={}: {}/{} samples"
        print(
            msg.format(name, partition, self.sequencelength, self.nclasses,
                       len(self.X),
                       len(X_trainvalid) + len(X_test)))
# NOTE(review): fragment of a kernel-SVC grid-search script. It references
# `_datasets` and `_kernels`, which are defined outside this chunk, and the
# `for name in datasets:` loop body continues beyond this view, so the code
# is left byte-identical.
trained_models = {} # define grid-search hyperparameters for SVC (common to all kernels) svc_parameters = {'C': np.logspace(0, 4, 5), 'gamma': list(np.logspace(-4, 4, 9)) + ['auto']} _sigmas = [1e-3, 5e-3, 1e-2, 2.5e-2, 5e-2, 7.5e-2, 1e-1, 2.5e-1, 5e-1, 7.5e-1, 1., 2., 5., 10.] _scales = [5e-2, 1e-1, 5e-1, 1e0] # start grid-search datasets = tqdm(_datasets, position=0, leave=True) for name in datasets: # record best scores in training phase best_scores_train = {k : 0. for k in _kernels} # lead-lag only if number of channels is <= 5 x_train, _, _, _ = UCR_UEA_datasets(use_cache=True).load_dataset(name) if x_train.shape[1] <= 200 and x_train.shape[2] <= 8: transforms = tqdm([(True,True), (False,True), (True,False), (False,False)], position=1, leave=False) else: # do not try lead-lag as dimension is already high transforms = tqdm([(True,False), (False,False)], position=1, leave=False) # grid-search for path-transforms (add-time, lead-lag) for (at,ll) in transforms: transforms.set_description(f"add-time: {at}, lead-lag: {ll}") # load train data x_train, y_train, _, _ = UCR_UEA_datasets(use_cache=True).load_dataset(name) x_train /= x_train.max() # encode outputs as labels y_train = LabelEncoder().fit_transform(y_train)
# NOTE(review): fragment of an early-classification plotting example. It
# opens mid-call (`linestyle=...` keyword arguments belong to a plot call
# that starts before this chunk), so the code is left byte-identical.
linestyle="dashed", color=color, linewidth=1.5) plt.axvline(x=t, color=color, linewidth=1.5) plt.text(x=t - 20, y=time_series.max() - .25, s="Prediction time") plt.title("Sample of class {} predicted as class {}".format( y_true, y_pred)) plt.xlim(0, time_series.shape[0] - 1) ############################################################################## # Data loading and visualization # ------------------------------ numpy.random.seed(0) X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("ECG200") # Scale time series X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train) X_test = TimeSeriesScalerMeanVariance().fit_transform(X_test) size = X_train.shape[1] n_classes = len(set(y_train)) plt.figure() for i, cl in enumerate(set(y_train)): plt.subplot(n_classes, 1, i + 1) for ts in X_train[y_train == cl]: plt.plot(ts.ravel(), color="orange" if cl > 0 else "blue", alpha=.3) plt.xlim(0, size - 1) plt.suptitle("Training time series")