def test_write_dataframe_to_ts_success(tmp_path, dataset):
    """Round-trip a packaged dataset through the .ts writer and the .ts loader."""
    # the TEST split is read from the data directory inside the sktime package
    package_dir = os.path.dirname(sktime.__file__)
    source_file = os.path.join(
        package_dir,
        f"datasets/data/{dataset}/{dataset}_TEST.ts",
    )
    expected_X, expected_y = load_from_tsfile_to_dataframe(source_file)

    # free-text description handed to the writer through ``comment``
    description = """ The data was derived from twelve monthly electrical power demand time series from Italy and first used in the paper "Intelligent Icons: Integrating Lite-Weight Data Mining and Visualization into GUI Operating Systems". The classification task is to distinguish days from Oct to March (inclusive) from April to September. """

    # write the loaded frame back out as a ts file below tmp_path
    write_dataframe_to_tsfile(
        data=expected_X,
        path=tmp_path,
        problem_name=dataset,
        class_label=np.unique(expected_y),
        class_value_list=expected_y,
        comment=description,
        fold="_transform",
    )

    # read the freshly written file and compare the feature frames
    written_file = f"{tmp_path}/{dataset}/{dataset}_transform.ts"
    actual_X, _ = load_from_tsfile_to_dataframe(written_file)
    assert_frame_equal(actual_X, expected_X)
def load(self):
    """Load the train and test files into one frame indexed by split name.

    Each split is read from its own .ts file, the labels are appended as the
    last column (renamed to ``self._target_name``), and the two frames are
    stacked with "train" / "test" as the only remaining index level.
    """
    frames = []
    for ts_path in (self._train_path, self._test_path):
        features, target = load_from_tsfile_to_dataframe(
            ts_path, return_separate_X_and_y=True)
        # features plus target side by side; the target ends up last
        frame = pd.concat([features, pd.Series(target)], axis=1)
        frame.rename(columns={frame.columns[-1]: self._target_name},
                     inplace=True)
        frames.append(frame)

    # the split name is kept in the index, necessary for later optional CV;
    # the per-split row counter (level 1) is dropped
    stacked = pd.concat(frames, axis=0, keys=["train", "test"])
    return stacked.reset_index(level=1, drop=True)
def _load_dataset(name, split, return_X_y, extract_path=None):
    """Load time series classification datasets (helper function).

    Parameters
    ----------
    name : str
        Dataset name. It is used for the download URL and for the file names
        ``<name>_TRAIN.ts`` / ``<name>_TEST.ts`` inside the data directory.
    split : str or None
        "TRAIN" or "TEST" (case-insensitive) loads that split only; None
        loads both and stacks them, TRAIN first.
    return_X_y : bool
        If True return ``(X, y)``; otherwise return ``X`` with the labels
        added as a ``"class_val"`` column.
    extract_path : str, optional
        Non-standard location of the datasets. Defaults to the module-level
        ``MODULE`` / ``DIRNAME`` location.

    Returns
    -------
    X : pd.DataFrame
        The loaded cases (with a ``"class_val"`` column if ``return_X_y`` is
        False).
    y : array-like
        The class labels, only returned if ``return_X_y`` is True.

    Raises
    ------
    ValueError
        If the download for ``name`` is not a valid zip archive, or if
        ``split`` is not "TRAIN", "TEST" or None.
    """
    # Allow user to have non standard extract path
    if extract_path is not None:
        # NOTE(review): for a relative extract_path, joining its dirname with
        # the path itself repeats the leading directories - confirm intended.
        local_module = os.path.dirname(extract_path)
        local_dirname = extract_path
    else:
        local_module = MODULE
        local_dirname = DIRNAME

    data_dir = os.path.join(local_module, local_dirname)
    os.makedirs(data_dir, exist_ok=True)

    if name not in _list_downloaded_datasets(extract_path):
        url = "http://timeseriesclassification.com/Downloads/%s.zip" % name
        # This also tests the validity of the URL, can't rely on the html
        # status code as it always returns 200
        try:
            _download_and_extract(
                url,
                extract_path=extract_path,
            )
        except zipfile.BadZipFile as e:
            raise ValueError(
                f"Invalid dataset name. {name!r} could not be downloaded "
                f"(extract path: {extract_path!r}). Please make sure the "
                "dataset is available on http://timeseriesclassification.com/."
            ) from e

    if isinstance(split, str):
        split = split.upper()

    def _split_file(part):
        # absolute location of one split file of this dataset
        return os.path.join(data_dir, name, name + "_" + part + ".ts")

    if split in ("TRAIN", "TEST"):
        X, y = load_from_tsfile_to_dataframe(_split_file(split))
    # if split is None, load both train and test set
    elif split is None:
        parts = [
            load_from_tsfile_to_dataframe(_split_file(part))
            for part in ("TRAIN", "TEST")
        ]
        X = pd.concat([pd.DataFrame(part[0]) for part in parts])
        # builtin ``str`` replaces the ``np.str`` alias removed in NumPy 1.24
        y = pd.concat([pd.Series(part[1]) for part in parts]).to_numpy(dtype=str)
    else:
        raise ValueError(f"Invalid `split` value = {split}")

    # Return appropriately
    if return_X_y:
        return X, y
    # Assign by position: after stacking both splits the index of X holds
    # repeated row numbers, so aligning a fresh Series on it would give the
    # TEST rows the labels of the TRAIN rows.
    X["class_val"] = np.asarray(y)
    return X
def __init__(self, name, train=True):
    """
    Datasets from the UEA time series archive.

    Args:
        name: Name of the dataset.
        train: Return train split when True, test split when False.
    """
    already_downloaded = name in _list_downloaded_datasets(UEA_UCR_DATA_DIR)
    if not already_downloaded:
        archive_url = (
            "http://timeseriesclassification.com/Downloads/%s.zip" % name
        )
        # The html status code is always 200, so a failing extraction is
        # what tells us that the URL (and hence the name) is not valid.
        try:
            _download_and_extract(archive_url, UEA_UCR_DATA_DIR)
        except zipfile.BadZipFile as bad_zip:
            raise ValueError(
                "Invalid dataset name. Please make sure the dataset is "
                "available on http://timeseriesclassification.com/."
            ) from bad_zip

    split_file = _build_UEA_UCR_data_path(name, train)
    features, labels = load_from_tsfile_to_dataframe(split_file)
    self.data_x = features
    self.data_y = labels

    # Time series with time stamps are not supported yet. Time stamps seem
    # to be stored in the index of each individual series, so anything but
    # a plain RangeIndex on the first series is rejected here.
    first_series = self.data_x.iloc[0, 0]
    assert isinstance(first_series.index, pd.RangeIndex)

    self.class_mapping = self.__build_class_mapping(name)
    self._n_classes = len(self.class_mapping)
def __build_class_mapping(name):
    """
    Map every class label of the train split to an integer id.

    Ids are assigned in the order in which ``np.unique`` returns the labels.

    Args:
        name: Dataset name

    Return:
        dict with dict[class_label] = class_id
    """
    _, train_labels = load_from_tsfile_to_dataframe(
        _build_UEA_UCR_data_path(name, True)
    )
    mapping = {}
    for class_id, class_label in enumerate(np.unique(train_labels)):
        mapping[class_label] = class_id
    return mapping
def read_dataset(root_dir, dataset_name):
    """Load one dataset from the UCRArchive_2018 folder and z-normalise it.

    Returns a dict with the single key ``dataset_name`` whose value is the
    tuple ``(x_train, y_train, x_test, y_test)``; all four entries are copies.
    """

    def _znorm(values):
        # normalise along axis 1; a zero standard deviation is replaced by 1
        # so that constant rows are only centred instead of divided by zero
        scale = values.std(axis=1, keepdims=True)
        scale[scale == 0] = 1.0
        return (values - values.mean(axis=1, keepdims=True)) / scale

    # For UCR: the archive sits under the root dir with any '-temp' removed
    archive_dir = '/'.join((root_dir.replace('-temp', ''), 'UCRArchive_2018'))
    file_stem = '/'.join((archive_dir, dataset_name, dataset_name))

    # the .ts files are used here (the same folders may also hold .arff files)
    train_frame, train_labels = load_from_tsfile_to_dataframe(file_stem + '_TRAIN.ts')
    test_frame, test_labels = load_from_tsfile_to_dataframe(file_stem + '_TEST.ts')

    train_values = from_nested_to_2d_array(train_frame, return_numpy=True)
    test_values = from_nested_to_2d_array(test_frame, return_numpy=True)

    # znorm
    train_values = _znorm(train_values)
    test_values = _znorm(test_values)

    return {
        dataset_name: (
            train_values.copy(),
            train_labels.copy(),
            test_values.copy(),
            test_labels.copy(),
        )
    }
def read_ts(filepath, **kwargs):
    """Read a ts file into Functional Data.

    Build a DenseFunctionalData or IrregularFunctionalData object upon a ts
    file passed as parameter. The dense reader is used when every series in
    the ``dim_0`` column has the same length, the irregular reader otherwise.

    Notes
    -----
    It is assumed that the data are unidimensional, and so, it will not be
    checked: only the ``dim_0`` column is inspected.

    Parameters
    ----------
    filepath: str
        Any valid string path is acceptable.
    **kwargs:
        Keyword arguments passed to the load_from_tsfile_to_dataframe
        function.

    Returns
    -------
    obj: DenseFunctionalData or IrregularFunctionalData
        The loaded ts file.
    labels: np.ndarray
        Labels

    """
    data, labels = load_from_tsfile_to_dataframe(filepath, **kwargs)

    # Only dim_0 decides which reader is used, so measure that column alone;
    # this also avoids DataFrame.applymap, deprecated since pandas 2.1.
    series_lengths = data['dim_0'].map(len).unique()
    if len(series_lengths) == 1:
        obj = read_ts_dense(data)
    else:
        obj = read_ts_irregular(data)
    return obj, labels
def _parse_ts_contents(file_contents):
    """Write ``file_contents`` to a temporary file and parse it as a ts file.

    Returns whatever ``load_from_tsfile_to_dataframe`` returns for that file.
    The temporary file is removed whether parsing succeeds or raises.
    """
    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as tmp_file:
            tmp_file.write(file_contents)
            tmp_file.flush()
            # the contents are flushed, so the file can be parsed right away
            return load_from_tsfile_to_dataframe(path)
    finally:
        os.remove(path)


def _assert_ts_contents_invalid(file_contents):
    """Assert that parsing ``file_contents`` raises TsFileParseException."""
    np.testing.assert_raises(
        TsFileParseException, _parse_ts_contents, file_contents
    )


def _assert_nested_frame(df, expected):
    """Assert that the nested frame ``df`` holds exactly ``expected``.

    ``expected`` maps each column name to one entry per case. An entry is
    either a list of values (looked up under the keys 0, 1, ...) or a dict
    mapping an index key to its value; an empty list stands for an empty
    series. The length of every series is checked before its values.
    """
    n_cases = len(next(iter(expected.values())))
    np.testing.assert_equal(len(df), n_cases)
    np.testing.assert_equal(len(df.columns), len(expected))

    for dim, cells in expected.items():
        np.testing.assert_equal(len(df[dim]), len(cells))
        for case, cell in enumerate(cells):
            series = df[dim][case]
            np.testing.assert_equal(len(series), len(cell))
            items = cell.items() if isinstance(cell, dict) else enumerate(cell)
            for key, value in items:
                np.testing.assert_equal(series[key], value)


def test_load_from_tsfile_to_dataframe():
    """Test the load_from_tsfile_to_dataframe() function."""
    # Metadata headers shared by the cases below
    header_stamped = (
        "@problemName Test Problem\n@timeStamps true\n@univariate true\n"
        "@classLabel false\n@data\n"
    )
    header_stamped_classes = (
        "@problemName Test Problem\n@timeStamps true\n@univariate true\n"
        "@classLabel true 0 1 2\n@data\n"
    )
    header_plain = (
        "@problemName Test Problem\n@timeStamps false\n@univariate true\n"
        "@classLabel false\n@data\n"
    )
    header_plain_classes = (
        "@problemName Test Problem\n@timeStamps false\n@univariate true\n"
        "@classLabel true cat bear dog\n@data\n"
    )

    # Expected contents shared by more than one case
    two_full_cases = {
        "dim_0": [[1.0, 2.0], [11.0, 12.0]],
        "dim_1": [[3.0, 4.0], [13.0, 14.0]],
        "dim_2": [[5.0, 6.0], [15.0, 16.0]],
    }
    three_cases_with_gaps = {
        "dim_0": [[1.0, 2.0], [11.0, 12.0], [21.0, 22.0]],
        "dim_1": [[], [13.0, 14.0], [23.0, 24.0]],
        "dim_2": [[5.0, 6.0], [15.0, 16.0], []],
    }

    # Test that an empty file is classed as invalid
    _assert_ts_contents_invalid("")

    # Test that a file with an incomplete set of metadata is invalid
    _assert_ts_contents_invalid(
        "@problemName Test Problem\n@timeStamps true\n@univariate true\n"
    )

    # Test that a file with a complete set of metadata but no data is invalid
    _assert_ts_contents_invalid(
        "@problemName Test Problem\n@timeStamps true\n@univariate true\n"
        "@classLabel false\n@data"
    )

    # Test that a file with a complete set of metadata and no data but
    # invalid metadata values is invalid
    _assert_ts_contents_invalid(
        "@problemName\n@timeStamps\n@univariate true\n@classLabel false\n@data"
    )

    # Test that a file with a complete set of metadata and a single
    # case/dimension parses correctly
    df = _parse_ts_contents(header_stamped + "(0, 1), (1, 2)")
    np.testing.assert_equal(len(df), 1)
    np.testing.assert_equal(len(df.columns), 1)
    np.testing.assert_equal(len(df["dim_0"]), 1)
    series = df["dim_0"][0]
    np.testing.assert_equal(series[0], 1.0)
    np.testing.assert_equal(series[1], 2.0)

    # Test that a file with a complete set of metadata and 2 cases with 3
    # dimensions parses correctly
    df = _parse_ts_contents(
        header_stamped
        + "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6)\n"
        + "(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, 16)     \n"
    )
    _assert_nested_frame(df, two_full_cases)

    # Test that a file with a complete set of metadata and time-series of
    # different length parses correctly
    df = _parse_ts_contents(
        header_stamped
        + "(0, 1), (1, 2):(0, 3):(0, 5), (1, 6)\n"
        + "(0, 11), (1, 12):(0, 13), (1,14):(0, 15)\n"
    )
    _assert_nested_frame(
        df,
        {
            "dim_0": [[1.0, 2.0], [11.0, 12.0]],
            "dim_1": [[3.0], [13.0, 14.0]],
            "dim_2": [[5.0, 6.0], [15.0]],
        },
    )

    # Test that a file with a complete set of metadata and data but an
    # inconsistent number of dimensions across cases is classed as invalid
    _assert_ts_contents_invalid(
        header_stamped
        + "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6)\n"
        + "(0, 11), (1, 12):(0, 13), (1,14)    \n"
    )

    # Test that a file with a complete set of metadata and data but missing
    # values after a tuple is classed as invalid
    _assert_ts_contents_invalid(
        header_stamped + "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5),\n"
    )

    # Test that a file with a complete set of metadata and data and some
    # empty dimensions is classed as valid
    df = _parse_ts_contents(
        header_stamped
        + "(0, 1), (1, 2):     :(0, 5), (1, 6)\n"
        + "(0, 11), (1, 12):(0, 13), (1,14)    :       \n"
        + "(0, 21), (1, 22):(0, 23), (1,24)    :   (0,25), (1, 26)    \n"
    )
    _assert_nested_frame(
        df,
        {
            "dim_0": [[1.0, 2.0], [11.0, 12.0], [21.0, 22.0]],
            "dim_1": [[], [13.0, 14.0], [23.0, 24.0]],
            "dim_2": [[5.0, 6.0], [], [25.0, 26.0]],
        },
    )

    # Test that a file with a complete set of metadata and data that
    # contains datetimes as timestamps and has some empty dimensions is
    # classed as valid
    stamped_2019 = (
        "(01/01/2019 00:00:00, 1),  (01/02/2019 "
        "00:00:00, 2)  :                               "
        "                      : (01/05/2019 00:00:00, "
        "5), (01/06/2019 00:00:00, 6)\n"
    )
    stamped_2020 = (
        "(01/01/2020 00:00:00, 11), (01/02/2020 "
        "00:00:00, 12) : (01/03/2020 00:00:00, 13), "
        "(01/04/2020 00:00:00, 14) :  \n"
    )
    stamped_2021 = (
        "(01/01/2021 00:00:00, 21), (01/02/2021 "
        "00:00:00, 22) : (01/03/2021 00:00:00, 23), "
        "(01/04/2021 00:00:00, 24) :  \n"
    )
    df = _parse_ts_contents(
        header_stamped + stamped_2019 + stamped_2020 + stamped_2021
    )
    _assert_nested_frame(
        df,
        {
            "dim_0": [
                {"01/01/2019": 1.0, "01/02/2019": 2.0},
                {"01/01/2020": 11.0, "01/02/2020": 12.0},
                {"01/01/2021": 21.0, "01/02/2021": 22.0},
            ],
            "dim_1": [
                [],
                {"01/03/2020": 13.0, "01/04/2020": 14.0},
                {"01/03/2021": 23.0, "01/04/2021": 24.0},
            ],
            "dim_2": [
                {"01/05/2019": 5.0, "01/06/2019": 6.0},
                [],
                [],
            ],
        },
    )

    # Test that a file that mixes timestamp conventions is invalid
    _assert_ts_contents_invalid(
        header_stamped
        + stamped_2019
        + (
            "(00, 11), (1, 12) : (01/03/2020 00:00:00, 13), "
            "(01/04/2020 00:00:00, 14) :  \n"
        )
        + stamped_2021
    )

    # Test that a file with a complete set of metadata and data but missing
    # classes is classed as invalid
    _assert_ts_contents_invalid(
        header_stamped_classes
        + "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6)\n"
        + "(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, 16)     \n"
    )

    # Test that a file with a complete set of metadata and data but invalid
    # classes is classed as invalid
    _assert_ts_contents_invalid(
        header_stamped_classes
        + "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6) : 0 \n"
        + "(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, 16)   : 3  \n"
    )

    # Test that a file with a complete set of metadata and data with classes
    # is classed as valid
    df, y = _parse_ts_contents(
        header_stamped_classes
        + "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6): 0\n"
        + "(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, 16): 2   \n"
    )
    _assert_nested_frame(df, two_full_cases)
    # Test that the class values are as expected
    np.testing.assert_equal(len(y), 2)
    np.testing.assert_equal(y[0], "0")
    np.testing.assert_equal(y[1], "2")

    # Test that a file with a complete set of metadata and data, with no
    # timestamps, is classed as valid
    df = _parse_ts_contents(
        header_plain
        + "1,2:3,4:5,6\n"
        + "11,12:13,14:15,16\n"
        + "21,22:23,24:25,26\n"
    )
    _assert_nested_frame(
        df,
        {
            "dim_0": [[1.0, 2.0], [11.0, 12.0], [21.0, 22.0]],
            "dim_1": [[3.0, 4.0], [13.0, 14.0], [23.0, 24.0]],
            "dim_2": [[5.0, 6.0], [15.0, 16.0], [25.0, 26.0]],
        },
    )

    # Test that a file with a complete set of metadata and data, with no
    # timestamps and some empty dimensions, is classed as valid
    df = _parse_ts_contents(
        header_plain
        + "1,2::5,6\n"
        + "11,12:13,14:15,16\n"
        + "21,22:23,24:\n"
    )
    _assert_nested_frame(df, three_cases_with_gaps)

    # Test that a file with a complete set of metadata and data, with no
    # timestamps and some empty dimensions and classes, is classed as valid
    df, y = _parse_ts_contents(
        header_plain_classes
        + "1,2::5,6:cat  \n"
        + "11,12:13,14:15,16:  dog\n"
        + "21,22:23,24::   bear   \n"
    )
    _assert_nested_frame(df, three_cases_with_gaps)
    # Test that the class values are as expected
    np.testing.assert_equal(len(y), 3)
    np.testing.assert_equal(y[0], "cat")
    np.testing.assert_equal(y[1], "dog")
    np.testing.assert_equal(y[2], "bear")
# Datasets read from local .ts files: name -> (train file, test file, optional
# cap on the number of test cases that are kept).
# NOTE(review): "wafer" reads Wafer_TRAIN.ts for the test split as well. This
# is kept exactly as the code had it, but it looks like a copy-paste slip
# (Wafer_TEST.ts only ever appeared in a commented-out line) -- confirm.
_TS_DATASETS = {
    # channels: sensor
    "wafer": (
        'datasets/Wafer/Wafer_TRAIN.ts',
        'datasets/Wafer/Wafer_TRAIN.ts',
        None,
    ),
    # channels: x, y
    "libras": (
        'datasets/Libras/Libras_TRAIN.ts',
        'datasets/Libras/Libras_TEST.ts',
        None,
    ),
    # channels: accelerometer; only the first 1000 test cases are used
    "uwave": (
        'datasets/UWaveGestureLibraryAll/UWaveGestureLibraryAll_TRAIN.ts',
        'datasets/UWaveGestureLibraryAll/UWaveGestureLibraryAll_TEST.ts',
        1000,
    ),
    # channels: ECG-1 .. ECG-4
    "stand": (
        'datasets/StandWalkJump/StandWalkJump_TRAIN.ts',
        'datasets/StandWalkJump/StandWalkJump_TEST.ts',
        None,
    ),
    # channels: accelerometer-x, accelerometer-y, accelerometer-z
    # (the key keeps the spelling callers already pass in)
    "handwritting": (
        'datasets/Handwriting/Handwriting_TRAIN.ts',
        'datasets/Handwriting/Handwriting_TEST.ts',
        None,
    ),
}


def _nested_to_array(frame, n_dims, n_steps):
    """Copy a nested (one series per cell) frame into a (cases, dims, steps) array."""
    # Convert the frame once instead of once per cell.
    cells = frame.to_numpy()
    out = np.zeros([cells.shape[0], n_dims, n_steps])
    for i in range(cells.shape[0]):
        for j in range(n_dims):
            out[i][j] = np.array(cells[i][j])
    return out


def loadDataset(dataset):
    """Load a dataset, scale it per channel and return train + test pooled.

    Parameters
    ----------
    dataset : str
        ``"motions"`` (loaded through ``load_basic_motions`` and split with
        ``train_test_split(random_state=42)``) or one of the names stored in
        ``_TS_DATASETS`` (read from local ``.ts`` files).

    Returns
    -------
    X : numpy.ndarray
        Scaled training cases followed by scaled test cases. For the ``.ts``
        datasets the layout is (cases, time steps, channels); the "motions"
        layout depends on ``getValues`` -- presumably the same, TODO confirm.
    y : numpy.ndarray
        ``labelInt`` of every binarised label, train then test, with a
        trailing axis of length 1 added.

    Raises
    ------
    ValueError
        If ``dataset`` is not a known name (previously this surfaced as an
        ``UnboundLocalError`` further down).
    """
    if dataset == "motions":
        X, y = load_basic_motions(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=42)
        x_train = getValues(X_train).transpose([0, 2, 1])
        x_test = getValues(X_test).transpose([0, 2, 1])
        D = X_train.shape[1]
        y_train = y_train.to_numpy()
        y_test = y_test.to_numpy()
    else:
        if dataset not in _TS_DATASETS:
            known = sorted(["motions", *_TS_DATASETS])
            raise ValueError(
                f"Unknown dataset {dataset!r}; expected one of {known}")
        train_path, test_path, test_limit = _TS_DATASETS[dataset]
        X_train, y_train = load_from_tsfile_to_dataframe(train_path)
        X_test, y_test = load_from_tsfile_to_dataframe(test_path)
        if test_limit is not None:
            X_test = X_test[:test_limit]
            y_test = y_test[:test_limit]

        D = X_train.shape[1]
        # The series length is taken from the first training cell; every
        # other cell has to match it to fit into the dense array.
        T = np.array(X_train.to_numpy()[0][0]).shape[0]
        # (cases, dims, steps) -> (cases, steps, dims)
        x_train = _nested_to_array(X_train, D, T).transpose([0, 2, 1])
        x_test = _nested_to_array(X_test, D, T).transpose([0, 2, 1])

    # * scale data
    # Bring the channel axis to the front so each channel is scaled on its
    # own; the scaler returned for the training data is passed on for the
    # test data.
    x_train = x_train.transpose([2, 0, 1])
    x_test = x_test.transpose([2, 0, 1])
    for i in range(D):
        x_train[i], scaler = scaleSerie(x_train[i])
        x_test[i], _ = scaleSerie(x_test[i], scaler)
    x_train = x_train.transpose([1, 2, 0])
    x_test = x_test.transpose([1, 2, 0])

    # * get data labels
    labels = np.unique(y_train)
    lb = preprocessing.LabelBinarizer()
    lb.fit(labels)
    y_train = lb.transform(y_train)
    y_test = lb.transform(y_test)
    if len(labels) == 2:
        # With two classes the binarizer emits a single column; add its
        # complement so every label row has two entries.
        y_train = np.hstack((y_train, 1 - y_train))
        y_test = np.hstack((y_test, 1 - y_test))
    y_train_int = [labelInt(label) for label in y_train]
    y_test_int = [labelInt(label) for label in y_test]

    X = np.concatenate([x_train, x_test])
    y = np.concatenate([y_train_int, y_test_int])
    y = np.expand_dims(y, axis=1)
    return X, y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) x_train = getValues(X_train) x_train = x_train.transpose([0, 2, 1]) x_test = getValues(X_test) x_test = x_test.transpose([0, 2, 1]) N, D = X_train.shape T = X_train.to_numpy()[0][0].to_numpy().shape[0] y_train = y_train.to_numpy() y_test = y_test.to_numpy() N_te = X_test.shape[0] else: if args["dataset"] == "wafer": X_train, y_train = load_from_tsfile_to_dataframe( 'datasets/Wafer/Wafer_TRAIN.ts') X_test, y_test = load_from_tsfile_to_dataframe( 'datasets/Wafer/Wafer_TRAIN.ts') elif args["dataset"] == "libras": X_train, y_train = load_from_tsfile_to_dataframe( 'datasets/Libras/Libras_TRAIN.ts') X_test, y_test = load_from_tsfile_to_dataframe( 'datasets/Libras/Libras_TEST.ts') elif args["dataset"] == "uwave": X_train, y_train = load_from_tsfile_to_dataframe( 'datasets/UWaveGestureLibraryAll/UWaveGestureLibraryAll_TRAIN.ts' ) X_test, y_test = load_from_tsfile_to_dataframe( 'datasets/UWaveGestureLibraryAll/UWaveGestureLibraryAll_TEST.ts' ) elif args["dataset"] == "stand":
def _process_data(self):
    """Build the train/validation/test tensors for CharacterTrajectories.

    Reads the ``_TRAIN.ts`` and ``_TEST.ts`` files found under
    ``self.root / "UEA" / "Multivariate_ts" / "CharacterTrajectories"``,
    pools them, pads every channel via ``pad`` up to the longest first-channel
    length, renumbers the labels from 0 in order of first appearance,
    optionally blanks out a percentage of time steps (``self.dropped_rate``),
    normalises with ``normalise_data`` and splits with ``split_data``.

    Returns
    -------
    tuple
        ``(train_X, val_X, test_X, train_y, val_y, test_y)``.
    """
    prefix = str(self.root / "UEA" / "Multivariate_ts"
                 / "CharacterTrajectories" / "CharacterTrajectories")
    frame_train, labels_train = load_from_tsfile_to_dataframe(
        prefix + "_TRAIN.ts")
    frame_test, labels_test = load_from_tsfile_to_dataframe(
        prefix + "_TEST.ts")

    # Pool both files; the split is redone at the end of this method.
    X = np.concatenate((frame_train.to_numpy(), frame_test.to_numpy()),
                       axis=0)
    y = np.concatenate((labels_train, labels_test), axis=0)

    # The padding target is the longest series, measured on the first channel.
    lengths = torch.tensor([len(sample[0]) for sample in X])
    maxlen = lengths.max()

    # Every cell is one channel of one sample; pad each and stack them.
    padded = []
    for sample in X:
        channels = [pad(channel, maxlen) for channel in sample]
        padded.append(torch.stack(channels, dim=0))
    X = torch.stack(padded, dim=0)

    # Map the raw labels to 0, 1, 2, ... in order of first appearance.
    targets = co.OrderedDict()
    for yi in y:
        targets.setdefault(yi, len(targets))
    y = torch.tensor([targets[yi] for yi in y])

    drop_points = self.dropped_rate != 0
    if drop_points:
        # Fixed seed: the same time steps are removed on every run.
        generator = torch.Generator().manual_seed(56789)
        n_steps = X.shape[-1]
        n_removed = int(n_steps * float(self.dropped_rate) / 100.0)
        thinned = []
        for Xi in X:
            shuffled = torch.randperm(n_steps, generator=generator)
            removed_points = shuffled[:n_removed].sort().values
            Xi_removed = Xi.clone()
            # The same time steps are blanked in every channel of the sample.
            Xi_removed[:, removed_points] = float("nan")
            thinned.append(Xi_removed)
        X = torch.stack(thinned, dim=0)

    # Normalize data
    X = normalise_data(X, y)

    if drop_points:
        # 1.0 where a value is present, 0.0 where it is NaN. The first channel
        # is enough because points are removed from all channels together.
        mask_exists = (~torch.isnan(X[:, :1, :])).float()
        # Replace the NaNs by zeros, then append the mask as an extra channel.
        X = torch.where(~torch.isnan(X), X, torch.Tensor([0.0]))
        X = torch.cat([X, mask_exists], dim=1)

    train_X, val_X, test_X = split_data(X, y)
    train_y, val_y, test_y = split_data(y, y)
    return train_X, val_X, test_X, train_y, val_y, test_y
def _choose_subratio(n_samples, n_classes):
    """Pick the fraction of the pooled data that goes into the subsample."""
    # Larger datasets start from a smaller fraction.
    if n_samples < 100:
        subratio = 0.8
    elif n_samples < 300:
        subratio = 0.6
    elif n_samples < 800:
        subratio = 0.4
    elif n_samples < 1500:
        subratio = 0.2
    elif n_samples < 5000:
        subratio = 0.1
    else:
        subratio = 0.05

    # Grow in steps of 0.1 until there are about 10 samples per class, but
    # never beyond 0.8. Stopping at the cap keeps the loop finite even when
    # 0.8 of the data is still fewer than 10 samples per class.
    while n_samples * subratio / n_classes < 10 and subratio < 0.8:
        subratio = min(subratio + 0.1, 0.8)
    return subratio


def create_subsample(input_dir, UCR_list, output_dir):
    """Write a stratified ARFF subsample for every dataset in ``UCR_list``.

    For each dataset the ``_TRAIN.ts`` and ``_TEST.ts`` files below
    ``input_dir/<name>/`` are pooled (first dimension only), a stratified
    subsample is drawn with ``StratifiedShuffleSplit`` and the result is
    written to ``output_dir/<name>/<name>.arff``.

    Parameters
    ----------
    input_dir : str
        Directory that holds one sub-directory per dataset.
    UCR_list : object with a ``.values`` attribute
        Each row's first element is read as a dataset name (presumably a
        one-column pandas DataFrame -- confirm against the caller).
    output_dir : str
        Directory in which the ARFF files are created.

    Returns
    -------
    None
    """
    for db_name_ite in UCR_list.values:
        db_name = db_name_ite[0]
        train_x, train_y = load_from_tsfile_to_dataframe(
            "%s/%s/%s_TRAIN.ts" % (input_dir, db_name, db_name))
        test_x, test_y = load_from_tsfile_to_dataframe(
            "%s/%s/%s_TEST.ts" % (input_dir, db_name, db_name))

        n_train, n_test = len(train_y), len(test_y)
        # Row 0 (not row 1) so that a single-case training set works too.
        series_length = len(train_x.iloc[0, 0])
        data = np.zeros((n_train + n_test, series_length))
        # Purely positional access to the first dimension; `row[0]` on a
        # label-indexed row depends on a deprecated pandas fallback.
        for i in range(n_train):
            data[i, :] = train_x.iloc[i, 0]
        for k in range(n_test):
            data[n_train + k, :] = test_x.iloc[k, 0]
        classes = np.concatenate((train_y, test_y)).astype(int)

        subratio = _choose_subratio(data.shape[0], len(np.unique(classes)))

        # Half of the subsample comes from each side of the split.
        splitter = StratifiedShuffleSplit(test_size=subratio / 2,
                                          train_size=subratio / 2)
        train_index, test_index = next(splitter.split(data, classes))
        data_df = np.concatenate((data[train_index, :], data[test_index, :]))
        classes_df = np.concatenate(
            (classes[train_index], classes[test_index]))
        # The class label becomes the last column.
        df = pd.DataFrame(np.column_stack((data_df, classes_df)))

        attributes = [(str(c), 'NUMERIC') for c in df.columns.values[:-1]]
        t = df.columns[-1]
        attributes += [('target', df[t].unique().astype(str).tolist())]
        rows = [
            df.loc[i].values[:-1].tolist() + [df[t].loc[i]]
            for i in range(df.shape[0])
        ]
        arff_dic = {
            'attributes': attributes,
            'data': rows,
            'relation': db_name,
            'description': ''
        }

        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs("%s/%s" % (output_dir, db_name), exist_ok=True)
        with open("%s/%s/%s.arff" % (output_dir, db_name, db_name),
                  "w", encoding="utf8") as f:
            arff.dump(arff_dic, f)
        print("%s created" % db_name)
    print("Subsample finished!")
nbDatasets = 0 for dataset in os.listdir(DATA_PATH): nbDatasets+=1 print("Evaluating on %s cores %d classifiers on %d datasets with a %d fold cross-validation..." % ("all" if nb_jobs == -1 else str(nb_jobs), len(classifiers), nbDatasets, nb_split)) start_global_time = time.perf_counter() # Evaluates all classifiers using cross validation per dataset for dataset in os.listdir(DATA_PATH): # Loads dataset for cross validation print("\nLoading %s dataset..." % dataset) start_load_time = time.perf_counter() filepath = dataset+"/"+dataset+"_" # dataset/dataset_TEST.ts and dataset/dataset_TRAIN.ts # Load train data + class d, c = load_from_tsfile_to_dataframe(os.path.join(DATA_PATH, filepath+"TRAIN.ts")) # Load test data + class dd, cc = load_from_tsfile_to_dataframe(os.path.join(DATA_PATH, filepath+"TEST.ts")) # Store all data and all class (concatenate train and test) data, classes = d.append(dd), np.concatenate((c, cc)) elapsed_load_time = time.perf_counter() - start_load_time print("Loading took: %f seconds" % elapsed_load_time) # Now we will do all cross-validations on this dataset for classifier, classifier_name in classifiers: print("Classifier: "+classifier_name) start_time = time.perf_counter() # cross-validation scores = cross_val_score(classifier, data, classes, cv=cv, n_jobs=nb_jobs)