def test_loading():
    """Smoke-test loading of the univariate and multivariate .ts archives.

    Walks the module-level dataset name lists, loads each train/test split
    via ``load_ts`` and prints the resulting array shapes. Assumes the
    archives live under ``E:/tsc_ts/`` (univariate) and ``E:/mtsc_ts/``
    (multivariate).
    """

    def _report_shapes(trainX, trainY, testX, testY):
        # Print the four split shapes with the standard labels.
        for label, shape in (
            ("Train X shape :", trainX.shape),
            ("Train Y shape :", trainY.shape),
            ("Test X shape :", testX.shape),
            ("Test Y shape :", testY.shape),
        ):
            print(label)
            print(shape)

    # Univariate problems.
    univ_root = "E:/tsc_ts/"
    for pos, name in enumerate(univariate_datasets):
        trainX, trainY = load_ts(univ_root + name + '/' + name + '_TRAIN.ts')
        testX, testY = load_ts(univ_root + name + '/' + name + '_TEST.ts')
        print("Loaded " + name + " in position " + str(pos))
        _report_shapes(trainX, trainY, testX, testY)

    # Multivariate problems. NOTE(review): iteration starts at index 16 —
    # presumably a resume point from an earlier partial run; confirm intended.
    multi_root = "E:/mtsc_ts/"
    for pos in range(16, len(multivariate_datasets)):
        name = multivariate_datasets[pos]
        print("Loading " + name + " in position " + str(pos) + ".......")
        trainX, trainY = load_ts(multi_root + name + '/' + name + '_TRAIN.ts')
        testX, testY = load_ts(multi_root + name + '/' + name + '_TEST.ts')
        print("Loaded " + name)
        _report_shapes(trainX, trainY, testX, testY)
def shapelet_extraction(timeseries_dir, data_dir, data_name, shp_type, seed, l_norm):
    """Fit a contracted shapelet transform and export transformed datasets.

    Loads the ``data_name`` train/test split, fits a shapelet transform
    (standard or ordinal, depending on ``shp_type``), writes the extracted
    shapelets to CSV, then exports the transformed train/test data keeping
    the top 90% and 100% of shapelets.

    :param timeseries_dir: root directory containing <name>/<name>_TRAIN.ts.
    :param data_dir: output root; per-dataset subdirectories are created.
    :param data_name: dataset name.
    :param shp_type: "Standard" selects ContractedShapeletTransform; any other
        value is forwarded as the ``quality`` measure of
        ContractedOrdinalShapeletTransform.
    :param seed: random_state for the transform.
    :param l_norm: norm parameter for the ordinal transform.
    """
    trainX, trainY = load_ts(timeseries_dir + data_name + '/' + data_name + '_TRAIN.ts')
    testX, testY = load_ts(timeseries_dir + data_name + '/' + data_name + '_TEST.ts')

    # Encode labels to consecutive integers, then shift to 1..num_classes.
    le = LabelEncoder()
    trainY = le.fit_transform(trainY) + 1
    testY = le.transform(testY) + 1

    if shp_type == "Standard":
        shp = ContractedShapeletTransform(time_limit_in_mins=0.1, random_state=seed)
    else:
        shp = ContractedOrdinalShapeletTransform(time_limit_in_mins=0.1,
                                                 quality=shp_type,
                                                 random_state=seed,
                                                 l_norm=l_norm)
    shp.fit(trainX, trainY)
    shapelets = shp.get_shapelets()
    writeShapeletsToCSV(shapelets, 60,
                        data_dir + '/' + data_name + '/' + data_name + '_shapelets.csv')

    # Export transforms keeping the best `pct`% of the extracted shapelets
    # (range(90, 110, 10) yields 90 and 100).
    for pct in range(90, 110, 10):
        directory = data_dir + '/' + data_name + '/' + 'transform_' + str(pct) + '/'
        # exist_ok avoids the check-then-create race of the original
        # `if not os.path.exists(...): os.makedirs(...)` pattern.
        os.makedirs(directory, exist_ok=True)
        shp.shapelets = shapelets[:int((len(shapelets) * pct) / 100)]
        train_transform = shp.transform(trainX)
        test_transform = shp.transform(testX)
        train_transform['label'] = trainY
        test_transform['label'] = testY
        train_transform.to_csv(directory + '/' + data_name + '_train.0',
                               header=None, index=None, sep=' ')
        test_transform.to_csv(directory + '/' + data_name + '_test.0',
                              header=None, index=None, sep=' ')
def test_loading():
    """Smoke-test loading a single univariate dataset (Gunpoint) from E:/tsc_ts/."""
    data_dir = "E:/tsc_ts/"
    dataset = "Gunpoint"
    trainX, trainY = load_ts(data_dir + dataset + '/' + dataset + '_TRAIN.ts')
    testX, testY = load_ts(data_dir + dataset + '/' + dataset + '_TEST.ts')
    # Bug fix: the original printed `... " in position " + str(i)`, but no
    # `i` exists in this scope, so a successful load raised NameError.
    print("Loaded " + dataset)
current_frame_size = (1 - remaining) # if the last frame was lost due to double imprecision if current_frame == self.num_intervals - 1: frames.append(frame_sum / frame_length) data.append(pd.Series(frames)) dims[0] = data return dims if __name__ == "__main__": testPath = "C:\\Users\\ajb\\Dropbox\\Data\\TSCProblems\\Chinatown\\Chinatown_TRAIN.ts" train_x, train_y = load_ts(testPath) print("Correctness testing for PAA using Chinatown") # print("First case used for testing") # print(train_x.iloc[0,0]) p = PAA() print("Test 1: num intervals =1, result should be series mean") p.set_num_intervals(1) x2 = p.transform(train_x) print("Correct mean case 1: = 561.875") print("Transform mean case 1: =") print(x2.iloc[0, 0]) print("Test 2: num intervals = series length, series should be unchanged") p.set_num_intervals(24) x2 = p.transform(train_x) print("Before transform: =")
def run_experiment(problem_path, results_path, cls_name, dataset, classifier=None, resampleID=0, overwrite=False, format=".ts", train_file=False): """ Method to run a basic experiment and write the results to files called testFold<resampleID>.csv and, if required, trainFold<resampleID>.csv. :param problem_path: Location of problem files, full path. :param results_path: Location of where to write results. Any required directories will be created :param cls_name: determines which classifier to use, as defined in set_classifier. This assumes predict_proba is implemented, to avoid predicting twice. May break some classifiers though :param dataset: Name of problem. Files must be <problem_path>/<dataset>/<dataset>+"_TRAIN"+format, same for "_TEST" :param resampleID: Seed for resampling. If set to 0, the default train/test split from file is used. Also used in output file name. :param overwrite: if set to False, this will only build results if there is not a result file already present. If True, it will overwrite anything already there :param format: Valid formats are ".ts", ".arff" and ".long". For more info on format, see https://github.com/alan-turing-institute/sktime/blob/master/examples/Loading%20Data%20Examples.ipynb :param train_file: whether to generate train files or not. 
If true, it performs a 10xCV on the train and saves :return: """ build_test = True if not overwrite: full_path = str(results_path) + "/" + str( cls_name) + "/Predictions/" + str(dataset) + "/testFold" + str( resampleID) + ".csv" if os.path.exists(full_path): print( full_path + " Already exists and overwrite set to false, not building Test" ) build_test = False if train_file: full_path = str(results_path) + "/" + str( cls_name) + "/Predictions/" + str( dataset) + "/trainFold" + str(resampleID) + ".csv" if os.path.exists(full_path): print( full_path + " Already exists and overwrite set to false, not building Train" ) train_file = False if train_file == False and build_test == False: return # TO DO: Automatically differentiate between problem types, currently only works with .ts trainX, trainY = load_ts(problem_path + dataset + '/' + dataset + '_TRAIN' + format) testX, testY = load_ts(problem_path + dataset + '/' + dataset + '_TEST' + format) trainX = _normalise_X(trainX) testX = _normalise_X(testX) if resampleID != 0: # allLabels = np.concatenate((trainY, testY), axis = None) # allData = pd.concat([trainX, testX]) # train_size = len(trainY) / (len(trainY) + len(testY)) # trainX, testX, trainY, testY = train_test_split(allData, allLabels, train_size=train_size, # random_state=resampleID, shuffle=True, # stratify=allLabels) trainX, trainY, testX, testY = stratified_resample( trainX, trainY, testX, testY, resampleID) le = preprocessing.LabelEncoder() le.fit(trainY) trainY = le.transform(trainY) testY = le.transform(testY) if classifier is None: classifier = set_classifier(cls_name, resampleID) print(cls_name + " on " + dataset + " resample number " + str(resampleID)) if build_test: # TO DO : use sklearn CV start = int(round(time.time() * 1000)) classifier.fit(trainX, trainY) build_time = int(round(time.time() * 1000)) - start start = int(round(time.time() * 1000)) probs = classifier.predict_proba(testX) preds = classifier.classes_[np.argmax(probs, axis=1)] test_time = 
int(round(time.time() * 1000)) - start ac = accuracy_score(testY, preds) print(cls_name + " on " + dataset + " resample number " + str(resampleID) + ' test acc: ' + str(ac) + ' time: ' + str(test_time)) # print(str(classifier.findEnsembleTrainAcc(trainX, trainY))) if "Composite" in cls_name: second = "Para info too long!" else: second = str(classifier.get_params()) second.replace('\n', ' ') second.replace('\r', ' ') print(second) temp = np.array_repr(classifier.classes_).replace('\n', '') third = str(ac) + "," + str(build_time) + "," + str( test_time) + ",-1,-1," + str(len(classifier.classes_)) write_results_to_uea_format(second_line=second, third_line=third, output_path=results_path, classifier_name=cls_name, resample_seed=resampleID, predicted_class_vals=preds, actual_probas=probs, dataset_name=dataset, actual_class_vals=testY, split='TEST') if train_file: start = int(round(time.time() * 1000)) if build_test and hasattr( classifier, "_get_train_probs" ): #Normally Can only do this if test has been built ... well not necessarily true, but will do for now train_probs = classifier._get_train_probs(trainX) else: train_probs = cross_val_predict(classifier, X=trainX, y=trainY, cv=10, method='predict_proba') train_time = int(round(time.time() * 1000)) - start train_preds = classifier.classes_[np.argmax(train_probs, axis=1)] train_acc = accuracy_score(trainY, train_preds) print(cls_name + " on " + dataset + " resample number " + str(resampleID) + ' train acc: ' + str(train_acc) + ' time: ' + str(train_time)) if "Composite" in cls_name: second = "Para info too long!" 
else: second = str(classifier.get_params()) second.replace('\n', ' ') second.replace('\r', ' ') temp = np.array_repr(classifier.classes_).replace('\n', '') third = str(train_acc) + "," + str(train_time) + ",-1,-1,-1," + str( len(classifier.classes_)) write_results_to_uea_format(second_line=second, third_line=third, output_path=results_path, classifier_name=cls_name, resample_seed=resampleID, predicted_class_vals=train_preds, actual_probas=train_probs, dataset_name=dataset, actual_class_vals=trainY, split='TRAIN')
run_experiment(problem_path=data_dir, results_path=results_dir, cls_name=classifier, dataset=dataset, resampleID=resample, train_file=tf) else: #Local run # data_dir = "/scratch/univariate_datasets/" # results_dir = "/scratch/results" data_dir = "/bench/datasets/Univariate2018/" results_dir = "C:/Users/ajb/Dropbox/Turing Project/Results/" # data_dir = "Z:/ArchiveData/Univariate_ts/" # results_dir = "E:/Temp/" # results_dir = "Z:/Results/sktime Bakeoff/" dataset = "ItalyPowerDemand" trainX, trainY = load_ts(data_dir + dataset + '/' + dataset + '_TRAIN.ts') testX, testY = load_ts(data_dir + dataset + '/' + dataset + '_TEST.ts') classifier = "TSF" resample = 1 # for i in range(0, len(univariate_datasets)): # dataset = univariate_datasets[i] # # print(i) # # print(" problem = "+dataset) tf = False run_experiment(overwrite=True, problem_path=data_dir, results_path=results_dir, cls_name=classifier, dataset=dataset, resampleID=resample, train_file=tf)
""" def predict(self, X, input_checks=True): if isinstance(X, pd.DataFrame): if X.shape[1] > 1: raise TypeError("ShapeDTW cannot handle multivariate problems yet") elif isinstance(X.iloc[0,0], pd.Series): X = np.asarray([a.values for a in X.iloc[:,0]]) else: raise TypeError("Input should either be a 2d numpy array, or a pandas dataframe with a single column of Series objects (ShapeDTW cannot yet handle multivariate problems") n_samps, self.series_length = X.shape pass if __name__ == "__main__": testPath="C:\\Users\\Vince\\Documents\\Dissertation Repositories\\datasets\\Univariate2018_ts\\Chinatown\\Chinatown_TRAIN.ts" trainData,trainDataClass = load_ts(testPath) num_atts = trainData.shape[1] num_insts = trainData.shape[0] if isinstance(trainData, pd.DataFrame): if trainData.shape[1] > 1: raise TypeError("ShapeDTW cannot handle multivariate problems yet") elif isinstance(trainData.iloc[0,0], pd.Series): trainData = np.asarray([a.values for a in trainData.iloc[:,0]]) else: raise TypeError("Input should either be a 2d numpy array, or a pandas dataframe with a single column of Series objects (ShapeDTW cannot yet handle multivariate problems") first = trainData[0,:] second = trainData[1,:]
results_path=results_dir, cls_name=classifier, dataset=dataset, resampleID=resample, train_file=tf, ) else: # Local run # data_dir = "/scratch/univariate_datasets/" # results_dir = "/scratch/results" # data_dir = "/bench/datasets/Univariate2018/" # results_dir = "C:/Users/ajb/Dropbox/Turing Project/Results/" data_dir = "Z:/ArchiveData/Univariate_ts/" results_dir = "E:/Temp/" # results_dir = "Z:/Results/sktime Bakeoff/" dataset = "GunPoint" trainX, trainY = load_ts(data_dir + dataset + "/" + dataset + "_TRAIN.ts") testX, testY = load_ts(data_dir + dataset + "/" + dataset + "_TEST.ts") classifier = "TDE" resample = 0 # for i in range(0, len(univariate_datasets)): # dataset = univariate_datasets[i] # # print(i) # # print(" problem = "+dataset) tf = False run_experiment( overwrite=True, problem_path=data_dir, results_path=results_dir, cls_name=classifier, dataset=dataset, resampleID=resample,