def sktime_to_numpy(train_file, test_file):
    """Load a train/test pair of ``.ts`` files into zero-padded numpy arrays.

    Both files are parsed with ``load_from_tsfile_to_dataframe``. Every series
    is right-padded with zeros to the longest series found in *any* column of
    either file, so train and test share the same time axis.

    Parameters
    ----------
    train_file : str
        Path of the training ``.ts`` file.
    test_file : str
        Path of the test ``.ts`` file.

    Returns
    -------
    X_train : np.ndarray, shape (n_train_cases, max_len, n_train_columns)
    y_train : np.ndarray of integer label codes
    X_test : np.ndarray, shape (n_test_cases, max_len, n_test_columns)
    y_test : np.ndarray of integer label codes
    """
    X_train_ts, y_train = load_from_tsfile_to_dataframe(train_file)
    X_test_ts, y_test = load_from_tsfile_to_dataframe(test_file)

    def _longest_series(frame):
        # Look at every column, not only 'dim_0': a longer series in another
        # dimension would otherwise not fit into the padded array.
        return max(
            (len(series) for col in frame.columns for series in frame[col]),
            default=0,
        )

    max_len = max(_longest_series(X_train_ts), _longest_series(X_test_ts))

    def _to_padded_array(frame):
        # The array starts as zeros, so copying each series into the leading
        # slots is the same as right-padding it with the constant 0.
        padded = np.zeros((len(frame), len(frame.columns), max_len))
        for col_idx, col in enumerate(frame.columns):
            for row_idx, series in enumerate(frame[col]):
                values = series.values
                padded[row_idx, col_idx, :len(values)] = values
        return padded

    X_train = _to_padded_array(X_train_ts)
    X_test = _to_padded_array(X_test_ts)

    # Encode both label vectors against one shared category list so that a
    # given class gets the same integer code in train and test even when one
    # split does not contain every class.
    all_labels = pd.concat([pd.Series(y_train), pd.Series(y_test)],
                           ignore_index=True)
    categories = pd.Categorical(all_labels).categories
    y_train_codes = pd.Categorical(pd.Series(y_train),
                                   categories=categories).codes
    y_test_codes = pd.Categorical(pd.Series(y_test),
                                  categories=categories).codes

    # Callers receive (cases, time, columns), hence the axis swap.
    return (np.transpose(X_train, (0, 2, 1)), y_train_codes,
            np.transpose(X_test, (0, 2, 1)), y_test_codes)
def load_data(name, benchmark=True, univariate=True, sep=',', test_size=0.2):
    """Load a benchmark ``.ts`` dataset or a real-world CSV dataset by name.

    Parameters
    ----------
    name : str
        Dataset name; it is used both as the folder name and as the file
        name prefix.
    benchmark : bool
        When truthy, read ``<name>_TRAIN.ts`` / ``<name>_TEST.ts`` from the
        module-level ``u_path`` or ``m_path`` directory. Otherwise read
        ``./datasets/realworld/<name>/data.csv``.
    univariate : bool
        Selects ``u_path`` (truthy) or ``m_path`` (falsy) for benchmark data.
    sep : str
        Column separator handed to ``pd.read_csv`` for real-world data.
    test_size : float
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple or pandas.DataFrame
        ``(train_x, train_y, test_x, test_y)`` for benchmark data, or the
        DataFrame read from the CSV file otherwise.
    """
    if not benchmark:
        csv_file = './datasets/realworld/' + name + '/data.csv'
        return pd.read_csv(csv_file, sep=sep)

    root = u_path if univariate else m_path
    # Paths are built by plain concatenation, so ``root`` is expected to
    # already end with a path separator.
    prefix = root + name + '/' + name
    train_x, train_y = load_from_tsfile_to_dataframe(prefix + '_TRAIN.ts')
    test_x, test_y = load_from_tsfile_to_dataframe(prefix + '_TEST.ts')
    return train_x, train_y, test_x, test_y
def pre_process(self, test_data=False):
    """Load one split, flatten it to one row per sample and binarise labels.

    Parameters
    ----------
    test_data : bool
        When truthy the file ``self.file_name_test`` is loaded, otherwise
        ``self.file_name_train``. Both are resolved against
        ``self.data_dir``.

    Returns
    -------
    x_expanded : pandas.DataFrame
        One row per sample; the columns are the values of all dimensions
        concatenated by ``ColumnConcatenator``.
    y_binary : array-like
        Labels produced by ``self.convert_y_to_binary_label``. After
        down-sampling this is a numpy array of ones (kept minority rows)
        followed by zeros (majority rows), matching the row order of
        ``x_expanded``.
    """
    file_name = self.file_name_test if test_data else self.file_name_train
    data_x, data_y = load_from_tsfile_to_dataframe(
        os.path.join(self.data_dir, file_name))

    # data_x is samples * k features; concatenate the k dimensions of each
    # sample into a single long series.
    concatenated = ColumnConcatenator().fit(data_x).transform(data_x)

    # Collect one single-column frame per sample and concatenate them once.
    # Growing a frame with pd.concat inside the loop copies all earlier
    # columns on every iteration.
    sample_frames = [concatenated.iloc[i, 0].to_frame()
                     for i in range(concatenated.shape[0])]
    # After the transpose every sample is a row.
    x_expanded = pd.concat(sample_frames, axis=1).transpose()

    # Convert y to a binary label.
    y_binary = self.convert_y_to_binary_label(data_y)

    # Only the training split is re-balanced: keep the first
    # 1 / self.minority_div share of the rows labelled 1 and all rows
    # labelled 0.
    if not test_data and self.down_sample_minority:
        minority_x = x_expanded[y_binary == 1]
        sample_size = round(minority_x.shape[0] / self.minority_div)
        minority_kept = minority_x.iloc[0:sample_size]
        majority_x = x_expanded[y_binary == 0]
        x_expanded = pd.concat([minority_kept, majority_x], axis=0)

        # Rebuild the labels in the same order as the rows above. The number
        # of ones follows the rows that were really kept, so X and y cannot
        # drift apart when the slice is shorter than ``sample_size``.
        minority_y = np.ones(minority_kept.shape[0])
        majority_y = np.zeros(y_binary[y_binary == 0].shape[0])
        y_binary = np.concatenate((minority_y, majority_y), axis=0)

    return x_expanded, y_binary
def evaluate_classifiers(dst):
    """Fit every configured classifier on one dataset and score it.

    The dataset is read from ``<UCR_DATASET_PATH>/<dst>/<dst>_TRAIN.ts`` and
    ``..._TEST.ts``. The module-level ``classifiers`` sequence is iterated as
    ``(label, estimator)`` pairs; only the estimator is used here.

    Returns
    -------
    tuple
        ``(scores, dst)`` where ``scores`` is a flat list holding, for each
        classifier in order, its accuracy followed by its macro F1 score.
    """
    timestamp = datetime.now().strftime("%F %T")
    print("[%s] Processing dataset %s" % (timestamp, dst))

    train_file = os.path.join(UCR_DATASET_PATH, dst, dst + "_TRAIN.ts")
    test_file = os.path.join(UCR_DATASET_PATH, dst, dst + "_TEST.ts")
    train_x, train_y = load_from_tsfile_to_dataframe(train_file)
    test_x, test_y = load_from_tsfile_to_dataframe(test_file)

    scores = []
    for _, clf in classifiers:
        clf.fit(train_x, train_y)
        pred = clf.predict(test_x)
        scores.append(accuracy_score(test_y, pred))
        scores.append(f1_score(test_y, pred, average='macro'))
    return scores, dst
def agent(path="./", dataset="", ratio=False, seg=0.75, folder="temp"):
    """Train and score a ridge classifier on one dataset, then save timings.

    The current process is renamed to ``dataset``. The train and test ``.ts``
    files are loaded, transformed with ``PAAStat``, standardised, and a
    ``RidgeClassifierCV`` is fitted. One CSV row with accuracy and timings is
    written to ``./<folder>/<dataset>.csv``.

    Parameters
    ----------
    path : str
        Directory containing one sub-folder per dataset.
    dataset : str
        Dataset name (sub-folder name and file-name prefix).
    ratio
        Forwarded to ``PAAStat`` as ``paa_``.
    seg
        Forwarded to ``PAAStat`` as ``seg_``.
    folder : str
        Output directory, created below the current directory if missing.

    Returns
    -------
    None
    """
    current_process().name = dataset
    start1 = time.time()

    train_x, train_y = load_from_tsfile_to_dataframe(
        f"{path}/{dataset}/{dataset}_TRAIN.ts")
    test_x, test_y = load_from_tsfile_to_dataframe(
        f"{path}/{dataset}/{dataset}_TEST.ts")
    print(f"{dataset}: Train Shape {train_x.shape}")
    print(f"{dataset}: Test Shape {test_x.shape}")

    scaler = StandardScaler()
    transform_time1 = time.time()
    mod_train = PAAStat(paa_=ratio, seg_=seg).transform(train_x.values)
    # The scaler is fitted on the training features only and reused for test.
    mod_train = scaler.fit(mod_train).transform(mod_train)
    mod_test = PAAStat(paa_=ratio, seg_=seg).transform(test_x.values)
    mod_test = scaler.transform(mod_test)
    transform_time2 = time.time()

    # NOTE(review): recent scikit-learn releases no longer accept the
    # `normalize` argument -- confirm the pinned version before upgrading.
    model = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
    train_time1 = time.time()
    model.fit(mod_train, train_y)
    preds = model.predict(mod_test)
    train_time2 = time.time()

    acc1 = accuracy_score(preds, test_y) * 100
    end1 = time.time()

    # All durations are reported in minutes.
    print(f"Dataset: {dataset}, AccuracyRidge: {acc1}, Time taken: {(end1 - start1)/60}, "
          f"Transfrom_time: {(transform_time2-transform_time1)/60}, train_time: {(train_time2-train_time1)/60}")
    results = pd.DataFrame({'Dataset': dataset,
                            'AccuracyRidge': [acc1],
                            'Time (min)': [(end1 - start1)/60],
                            'Transfrom_time (min)': [(transform_time2-transform_time1)/60],
                            'train_time (min)': [(train_time2-train_time1)/60]})

    temp_path = './' + folder
    # exist_ok avoids the check-then-create race when several worker
    # processes reach this point at the same time.
    os.makedirs(temp_path, exist_ok=True)
    results.to_csv(temp_path + f'/{dataset}.csv', index=False)
def sktime_to_numpy(file):
    """Load a ``.ts`` file into a zero-padded numpy array plus label codes.

    Every series is right-padded with zeros up to the length configured under
    ``global_vars['input_height']``.

    Parameters
    ----------
    file : str
        Path of the ``.ts`` file.

    Returns
    -------
    X : np.ndarray, shape (n_cases, n_columns, input_height)
    y : np.ndarray of integer label codes
    """
    X_ts, y = load_from_tsfile_to_dataframe(file)
    target_len = global_vars.get('input_height')

    n_cases = len(X_ts)
    n_columns = len(X_ts.columns)
    X = np.zeros((n_cases, n_columns, target_len))

    for case_idx in range(n_cases):
        case = X_ts.iloc[case_idx]
        for col_idx, col in enumerate(X_ts.columns):
            values = case[col].values
            # np.pad rejects a negative width, i.e. a series longer than
            # the configured height.
            X[case_idx, col_idx] = np.pad(
                values,
                pad_width=(0, target_len - len(values)),
                mode='constant')

    label_codes = pd.Categorical(pd.Series(y)).codes
    return X, label_codes
def evaluate_classifiers(dst):
    """Score every configured classifier on shapelet-transformed data.

    The dataset is read from ``<UCR_DATASET_PATH>/<dst>/<dst>_TRAIN.ts`` and
    ``..._TEST.ts``, transformed with a ``ContractedShapeletTransform``
    (5 minute limit) and handed to each estimator in the module-level
    ``classifiers`` sequence.

    Returns
    -------
    tuple or None
        ``None`` when ``dst`` is listed in ``excluded`` or the shapelet
        transform fails. Otherwise ``(scores, dst)`` where ``scores`` is a
        flat list holding, for each classifier in order, its accuracy
        followed by its macro F1 score; a classifier that fails contributes
        ``nan, nan``.
    """
    print("[%s] Processing dataset %s" %
          (datetime.now().strftime("%F %T"), dst))
    if dst in excluded:
        return None

    train_x, train_y = load_from_tsfile_to_dataframe(
        os.path.join(UCR_DATASET_PATH, dst, dst + "_TRAIN.ts"))
    test_x, test_y = load_from_tsfile_to_dataframe(
        os.path.join(UCR_DATASET_PATH, dst, dst + "_TEST.ts"))

    try:
        transform = ContractedShapeletTransform(time_limit_in_mins=5)
        train_data = transform.fit_transform(train_x, train_y)
        test_data = transform.transform(test_x)
    except Exception as e:
        # `except Exception` keeps the best-effort skip but lets
        # KeyboardInterrupt / SystemExit stop the run; the reason is printed
        # instead of being dropped.
        print("Exception while transforming dataset %s:" % dst, e)
        return None

    def evaluate_classifier(clf):
        try:
            clf.fit(train_data, train_y)
            pred = clf.predict(test_data)
            return (accuracy_score(test_y, pred),
                    f1_score(test_y, pred, average='macro'))
        except Exception as e:
            print("Exception while evaluating classifier:", e)
            return float('nan'), float('nan')

    scores = list(itertools.chain.from_iterable(
        evaluate_classifier(clf) for clf in classifiers))
    return scores, dst
def evaluate_classifiers(dst):
    """Score every configured classifier on data transformed by ``func``.

    The dataset is read from ``<UCR_DATASET_PATH>/<dst>/<dst>_TRAIN.ts`` and
    ``..._TEST.ts``; both feature frames are passed through the module-level
    ``func`` before the estimators in ``classifiers`` are fitted.

    Returns
    -------
    tuple or None
        ``None`` when ``dst`` is in ``excluded``, when the training labels
        contain a single class, or when ``func`` raises. Otherwise
        ``(results, dst)`` where ``results`` is a flat list holding, for each
        classifier in order, its accuracy followed by its macro F1 score; a
        classifier that raises contributes ``nan, nan``.
    """
    now = datetime.now().strftime("%F %T")
    print("[%s] Processing dataset %s" % (now, dst))
    if dst in excluded:
        return None

    dataset_prefix = os.path.join(UCR_DATASET_PATH, dst, dst)
    train_x, train_y = load_from_tsfile_to_dataframe(dataset_prefix + "_TRAIN.ts")
    test_x, test_y = load_from_tsfile_to_dataframe(dataset_prefix + "_TEST.ts")

    # A single-class training set cannot be used to fit the classifiers.
    if len(set(train_y)) == 1:
        return None

    try:
        data_train = func(train_x)
        data_test = func(test_x)
    except Exception as e:
        print("Exception while evaluating classifier:", str(e))
        return None

    def evaluate_classifier(clf):
        try:
            clf.fit(data_train, train_y)
            pred = clf.predict(data_test)
        except Exception as e:
            print("Exception while evaluating classifier:", str(e))
            return float('nan'), float('nan')
        try:
            return (accuracy_score(test_y, pred),
                    f1_score(test_y, pred, average='macro'))
        except Exception as e:
            print("Exception while evaluating classifier:", str(e))
            return float('nan'), float('nan')

    results = []
    for clf in classifiers:
        results.extend(evaluate_classifier(clf))

    # Drop the references to the loaded data and force a collection before
    # returning.
    del train_x, train_y, test_x, test_y, data_train, data_test
    gc.collect()
    return results, dst
def __build_class_mapping(name, kind):
    """Map every class label of a dataset to an integer id.

    Only the training split is read; the ids follow the sorted order of the
    unique labels as returned by ``np.unique``.

    Args:
        name: Dataset name.
        kind: Kind of dataset, forwarded to ``_build_UEA_UCR_data_path``.

    Return:
        dict with dict[class_label] = class_id
    """
    _, train_labels = load_from_tsfile_to_dataframe(
        _build_UEA_UCR_data_path(name, kind, True))
    return {
        label: class_id
        for class_id, label in enumerate(np.unique(train_labels))
    }
def load_gunpoint_dataframe(split='TRAIN', return_X_y=False):
    """Load the GunPoint time series classification problem.

    Dimensionality:     univariate
    Series length:      150
    Train cases:        50
    Test cases:         150
    Number of classes:  2

    The dataset records one female and one male actor making a motion with
    their hand. The two classes are Gun-Draw and Point. For Gun-Draw the
    actors start with their hands by their sides, draw a replicate gun from
    a hip-mounted holster, point it at a target for approximately one
    second, then return the gun to the holster and their hands to their
    sides. For Point the actors point with their index fingers at a target
    for approximately one second and then return their hands to their sides.
    For both classes the centroid of the actor's right hand was tracked in
    both X- and Y-axes, which appear to be highly correlated; the data in
    the archive is just the X-axis.

    Dataset details:
    http://timeseriesclassification.com/description.php?Dataset=GunPoint

    Parameters
    ----------
    split: string (either "TRAIN" or "TEST", default = 'TRAIN')
        Which partition of the problem to load.
    return_X_y: bool (default = False)
        When falsy, ``(X, y)`` is returned. When truthy, a single DataFrame
        is returned with the labels stored in an extra ``'class_val'``
        column.
        NOTE(review): this looks inverted relative to the parameter name --
        confirm against callers before changing it.

    Returns
    -------
    X: pandas DataFrame with m rows and c columns
        The time series data for the problem with m cases and c dimensions
    y: numpy array
        The class labels for each case in X (only when ``return_X_y`` is
        falsy)
    """
    problem = 'GunPoint'
    file_name = '_'.join((problem, split)) + '.ts'
    ts_file = path.join(path.dirname(__file__), 'data', problem, file_name)

    X, y = load_from_tsfile_to_dataframe(ts_file)
    if not return_X_y:
        return X, y
    X['class_val'] = pd.Series(y)
    return X
def fit(self, luck_average_windows, assessment_windows, until=None, max_horizon=9 * 6):
    """Fit one column-ensemble classifier per assessment window.

    For every assessment window that does not exceed ``max_horizon`` a
    temporary ``.ts`` file is written with ``self.prepare_ts_file``, loaded
    again, and used to fit a ``ColumnEnsembleClassifier`` that holds one
    ``TimeSeriesForestClassifier`` per entry of ``luck_average_windows``
    (estimator ``i`` is bound to column ``i``). Each fitted ensemble is
    appended to ``self.classifiers``.

    Args:
        luck_average_windows: Sequence; only its length is used here, to
            decide how many per-column estimators are built.
        assessment_windows: Iterable of window sizes. Iteration stops at the
            first window larger than ``max_horizon``.
        until: Optional end index into ``self.data_points``. ``None`` means
            all data points. Values outside ``[0, len(data_points))`` are
            logged as an error and nothing is fitted.
        max_horizon: Stored as ``self.horizon``.

    Returns:
        None
    """
    logger("MODEL-FIT").debug(
        "max_horizon: {} / avg windows: {} / assmnt windows: {} / until: {} / total_data_size: {}".format(
            max_horizon, str(luck_average_windows), str(assessment_windows),
            until, len(self.data_points)))

    until_given = until is not None
    if until_given and (until < 0 or until >= len(self.data_points)):
        logger("MODEL-FIT").error(
            "Parameter until is too large for the given data points: {}".format(until))
        return

    self.horizon = max_horizon
    n_columns = len(luck_average_windows)

    for window_index, window in enumerate(assessment_windows):
        if window > self.horizon:
            break

        # Write the data for this window in the format expected by sktime
        # and parse it back into data frames.
        end_index = until if until_given else len(self.data_points)
        fit_file = self.prepare_ts_file(
            0, end_index, self.case_observation_size, window_index, window)
        X, y = load_from_tsfile_to_dataframe(
            fit_file, replace_missing_vals_with="-100")

        # The class-weight dict depends on which label comes first.
        true_index = 1 if y[0] == "false" else 0
        class_weights = self.create_class_weight_dict(true_index=true_index)

        estimators = [
            ("TSF{}".format(column),
             TimeSeriesForestClassifier(
                 n_estimators=int(self.no_estimators),
                 n_jobs=16,
                 max_depth=self.max_depth,
                 class_weight=class_weights,
                 criterion=self.criterion,
                 min_samples_split=self.min_samples_split,
                 min_samples_leaf=self.min_samples_leaf,
                 oob_score=self.oob_score,
                 bootstrap=self.bootstrap),
             [column])
            for column in range(0, n_columns)
        ]

        ensemble = ColumnEnsembleClassifier(estimators=estimators)
        ensemble.fit(X, y)
        self.classifiers.append(ensemble)
def load_arrow_head_dataframe(split='TRAIN', return_X_y=False):
    """Load the ArrowHead time series classification problem.

    Dimensionality:     univariate
    Series length:      251
    Train cases:        36
    Test cases:         175
    Number of classes:  3

    The arrowhead data consists of outlines of the images of arrowheads. The
    shapes of the projectile points are converted into a time series using
    the angle-based method. The classification of projectile points is an
    important topic in anthropology. The classes are based on shape
    distinctions such as the presence and location of a notch in the arrow.
    The problem in the repository is a length normalised version of that
    used in Ye09shapelets. The three classes are called "Avonlea", "Clovis"
    and "Mix".

    Dataset details:
    http://timeseriesclassification.com/description.php?Dataset=ArrowHead

    Parameters
    ----------
    split: string (either "TRAIN" or "TEST", default = 'TRAIN')
        Which partition of the problem to load.
    return_X_y: bool (default = False)
        When falsy, ``(X, y)`` is returned. When truthy, a single DataFrame
        is returned with the labels stored in an extra ``'class_val'``
        column.
        NOTE(review): this looks inverted relative to the parameter name --
        confirm against callers before changing it.

    Returns
    -------
    X: pandas DataFrame with m rows and c columns
        The time series data for the problem with m cases and c dimensions
    y: numpy array
        The class labels for each case in X (only when ``return_X_y`` is
        falsy)
    """
    problem = 'ArrowHead'
    file_name = '_'.join((problem, split)) + '.ts'
    ts_file = path.join(path.dirname(__file__), 'data', problem, file_name)

    X, y = load_from_tsfile_to_dataframe(ts_file)
    if not return_X_y:
        return X, y
    X['class_val'] = pd.Series(y)
    return X
def __init__(self, name, kind, train=True):
    """Datasets from the UCR time series archive.

    Args:
        name: Name of the dataset.
        kind: Kind of dataset, univariate or multivariate.
        train: Return train split when True, test split when False.
    """
    split_path = _build_UEA_UCR_data_path(name, kind, train)
    features, labels = load_from_tsfile_to_dataframe(split_path)
    self.data_x = features
    self.data_y = labels

    # Time series with time stamps are not supported yet. Time stamps seem
    # to be stored in the index of each individual series, so a regularly
    # sampled series without time stamps is recognised by its RangeIndex.
    # Only the first series of the first column is inspected.
    first_series = self.data_x.iloc[0, 0]
    assert isinstance(first_series.index, pd.RangeIndex)

    self.class_mapping = self.__build_class_mapping(name, kind)
    self._n_classes = len(self.class_mapping)
def load_italy_power_demand_dataframe(split='TRAIN', return_X_y=False):
    """Load the ItalyPowerDemand time series classification problem.

    Dimensionality:     univariate
    Series length:      24
    Train cases:        67
    Test cases:         1029
    Number of classes:  2

    The data was derived from twelve monthly electrical power demand time
    series from Italy and first used in the paper "Intelligent Icons:
    Integrating Lite-Weight Data Mining and Visualization into GUI Operating
    Systems". The classification task is to distinguish days from Oct to
    March (inclusive) from April to September.

    Dataset details:
    http://timeseriesclassification.com/description.php?Dataset=ItalyPowerDemand

    Parameters
    ----------
    split: string (either "TRAIN" or "TEST", default = 'TRAIN')
        Which partition of the problem to load.
    return_X_y: bool (default = False)
        When falsy, ``(X, y)`` is returned. When truthy, a single DataFrame
        is returned with the labels stored in an extra ``'class_val'``
        column.
        NOTE(review): this looks inverted relative to the parameter name --
        confirm against callers before changing it.

    Returns
    -------
    X: pandas DataFrame with m rows and c columns
        The time series data for the problem with m cases and c dimensions
    y: numpy array
        The class labels for each case in X (only when ``return_X_y`` is
        falsy)
    """
    problem = 'ItalyPowerDemand'
    file_name = '_'.join((problem, split)) + '.ts'
    ts_file = path.join(path.dirname(__file__), 'data', problem, file_name)

    X, y = load_from_tsfile_to_dataframe(ts_file)
    if not return_X_y:
        return X, y
    X['class_val'] = pd.Series(y)
    return X
def read_ts(filepath, **kwargs):
    """Read a ts file into Functional Data.

    Build a DenseFunctionalData or IrregularFunctionalData object from the
    ts file passed as parameter. The dense reader is used when every series
    in column ``'dim_0'`` has the same length, the irregular reader
    otherwise.

    Notes
    -----
    It is assumed that the data are unidimensional; this is not checked and
    only the column ``'dim_0'`` is inspected.

    Parameters
    ----------
    filepath: str
        Any valid string path is acceptable.
    **kwargs:
        Keyword arguments passed to the load_from_tsfile_to_dataframe
        function.

    Returns
    -------
    obj: DenseFunctionalData or IrregularFunctionalData
        The loaded ts file.
    labels: np.ndarray
        Labels

    """
    data, labels = load_from_tsfile_to_dataframe(filepath, **kwargs)

    # Measure only the column that is actually used. Series.map also avoids
    # DataFrame.applymap, which newer pandas versions deprecate.
    series_lengths = data['dim_0'].map(len).unique()
    if len(series_lengths) == 1:
        obj = read_ts_dense(data)
    else:
        obj = read_ts_irregular(data)
    return obj, labels
KNeighborsTimeSeriesClassifier(1, 'uniform', 'brute', 'dtw', None), "DTW-1NN" ], [ KNeighborsTimeSeriesClassifier(4, 'uniform', 'brute', 'dtw', None), "DTW-4NN" ]] # --------------- MAIN PROGRAM --------------------------------- # Load data # [ ((train_data, train_class), (test_data, test_class), dataset name), ...] data = [] for train_path, test_path, name in datasets_path: data += [ (load_from_tsfile_to_dataframe(os.path.join(DATA_PATH, train_path)), load_from_tsfile_to_dataframe(os.path.join(DATA_PATH, test_path)), name) ] for classifier, classifier_name in classifiers: print("|---" + classifier_name) for ((train_data, train_class), (test_data, test_class), name) in data: # Training classifier.fit(train_data, train_class) # Predicting class prediction = classifier.predict(test_data) # Computing accuracy accuracy = accuracy_score(test_class, prediction)
def get_UCR_data(dsid, path='.', parent_dir='data/UCR', verbose=False,
                 on_disk=True, return_split=True):
    """Return a UCR dataset as numpy arrays, downloading it when not cached.

    The arrays are cached as ``.npy`` files in ``<path>/<parent_dir>/<dsid>``.
    When any of the six cache files is missing, the dataset zip is downloaded
    and decompressed, the ``.ts`` files are converted and saved, and every
    non-``.npy`` file in the target directory is deleted.

    Parameters
    ----------
    dsid : str
        Dataset id; must be in the univariate or multivariate UCR list
        (checked with ``assert``).
    path : str
        Base directory.
    parent_dir : str
        Directory below ``path`` that holds one folder per dataset.
    verbose : bool
        Print progress and shape information.
    on_disk : bool
        When truthy the arrays are opened with ``mmap_mode='r+'`` (a
        read/write memory map, so in-place edits reach the cache files);
        otherwise they are loaded into memory.
    return_split : bool
        Selects the shape of the result, see below.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid)`` when ``return_split`` is
        truthy, else ``(X, y, splits)``. For the datasets whose archive is
        known to be broken, a tuple of ``None`` with the same length is
        returned.
    """
    if verbose:
        print('Dataset:', dsid)
    assert dsid in get_UCR_univariate_list() + get_UCR_multivariate_list(), \
        f'{dsid} is not a UCR dataset'

    full_parent_dir = Path(path) / parent_dir
    full_tgt_dir = full_parent_dir / dsid
    cache_names = ['X_train', 'X_valid', 'y_train', 'y_valid', 'X', 'y']

    if not all(os.path.isfile(f'{full_parent_dir}/{dsid}/{fn}.npy')
               for fn in cache_names):
        if dsid in ['InsectWingbeat', 'DuckDuckGeese']:
            if verbose:
                print('There are problems with the original zip file and data cannot correctly downloaded')
            # Match the arity of the normal return so that callers that
            # unpack the result do not fail when return_split is False.
            if return_split:
                return None, None, None, None
            return None, None, None

        src_website = 'http://www.timeseriesclassification.com/Downloads'
        if verbose:
            print(f'Downloading and decompressing data to {full_tgt_dir}...')
        decompress_from_url(f'{src_website}/{dsid}.zip',
                            target_dir=full_tgt_dir, verbose=verbose)
        if verbose:
            print('...data downloaded and decompressed')

        X_train_df, y_train = load_from_tsfile_to_dataframe(
            full_tgt_dir / f'{dsid}_TRAIN.ts')
        X_valid_df, y_valid = load_from_tsfile_to_dataframe(
            full_tgt_dir / f'{dsid}_TEST.ts')

        X_train_ = []
        X_valid_ = []
        for i in range(X_train_df.shape[-1]):
            # stack arrays even if they have different lengths
            X_train_.append(stack_pad(X_train_df[f'dim_{i}']))
            X_valid_.append(stack_pad(X_valid_df[f'dim_{i}']))

        # Dimensions are stacked on the last axis and then moved to axis 1.
        X_train = np.transpose(np.stack(X_train_, axis=-1),
                               (0, 2, 1)).astype(np.float32)
        X_valid = np.transpose(np.stack(X_valid_, axis=-1),
                               (0, 2, 1)).astype(np.float32)

        np.save(f'{full_tgt_dir}/X_train.npy', X_train)
        np.save(f'{full_tgt_dir}/y_train.npy', y_train)
        np.save(f'{full_tgt_dir}/X_valid.npy', X_valid)
        np.save(f'{full_tgt_dir}/y_valid.npy', y_valid)
        np.save(f'{full_tgt_dir}/X.npy', concat(X_train, X_valid))
        np.save(f'{full_tgt_dir}/y.npy', concat(y_train, y_valid))
        del X_train, X_valid, y_train, y_valid
        delete_all_in_dir(full_tgt_dir, exception='.npy')

    mmap_mode = 'r+' if on_disk else None
    X_train = np.load(f'{full_tgt_dir}/X_train.npy', mmap_mode=mmap_mode)
    y_train = np.load(f'{full_tgt_dir}/y_train.npy', mmap_mode=mmap_mode)
    X_valid = np.load(f'{full_tgt_dir}/X_valid.npy', mmap_mode=mmap_mode)
    y_valid = np.load(f'{full_tgt_dir}/y_valid.npy', mmap_mode=mmap_mode)

    if return_split:
        if verbose:
            print('X_train:', X_train.shape)
            print('y_train:', y_train.shape)
            print('X_valid:', X_valid.shape)
            print('y_valid:', y_valid.shape, '\n')
        return X_train, y_train, X_valid, y_valid

    X = np.load(f'{full_tgt_dir}/X.npy', mmap_mode=mmap_mode)
    y = np.load(f'{full_tgt_dir}/y.npy', mmap_mode=mmap_mode)
    splits = get_predefined_splits(X_train, X_valid)
    if verbose:
        print('X :', X.shape)
        print('y :', y.shape)
        print('splits :', splits, '\n')
    return X, y, splits
def test_load_from_tsfile_to_dataframe(): """Test the load_from_tsfile_to_dataframe() function.""" # Test that an empty file is classed an invalid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = "" tmp_file.write(file_contents) tmp_file.flush() # Parse the file and assert that it is invalid np.testing.assert_raises(TsFileParseException, load_from_tsfile_to_dataframe, path) finally: os.remove(path) # Test that a file with an incomplete set of metadata is invalid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "true\n@univariate true\n") tmp_file.write(file_contents) tmp_file.flush() # Parse the file and assert that it is invalid np.testing.assert_raises(TsFileParseException, load_from_tsfile_to_dataframe, path) finally: os.remove(path) # Test that a file with a complete set of metadata but no data is invalid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ( "@problemName Test Problem\n@timeStamps " "true\n@univariate true\n@classLabel false\n@data") tmp_file.write(file_contents) tmp_file.flush() # Parse the file and assert that it is invalid np.testing.assert_raises(TsFileParseException, load_from_tsfile_to_dataframe, path) finally: os.remove(path) # Test that a file with a complete set of metadata and no data but # invalid metadata values is invalid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName\n@timeStamps\n@univariate " "true\n@classLabel false\n@data") tmp_file.write(file_contents) tmp_file.flush() # Parse the file and assert that it is invalid np.testing.assert_raises(TsFileParseException, load_from_tsfile_to_dataframe, path) finally: os.remove(path) # Test that a file with a complete set of metadata and a single # 
case/dimension parses correctly fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "true\n@univariate true\n@classLabel " "false\n@data\n") file_contents += "(0, 1), (1, 2)" tmp_file.write(file_contents) tmp_file.flush() # Parse the file df = load_from_tsfile_to_dataframe(path) # Test the DataFrame returned accurately reflects the data in # the file np.testing.assert_equal(len(df), 1) np.testing.assert_equal(len(df.columns), 1) series = df["dim_0"] np.testing.assert_equal(len(series), 1) series = df["dim_0"][0] np.testing.assert_equal(series[0], 1.0) np.testing.assert_equal(series[1], 2.0) finally: os.remove(path) # Test that a file with a complete set of metadata and 2 cases with 3 # dimensions parses correctly fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "true\n@univariate true\n@classLabel " "false\n@data\n") file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6)\n" file_contents += ("(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, " "16) \n") tmp_file.write(file_contents) tmp_file.flush() # Parse the file df = load_from_tsfile_to_dataframe(path) # Test the DataFrame returned accurately reflects the data in # the file np.testing.assert_equal(len(df), 2) np.testing.assert_equal(len(df.columns), 3) series = df["dim_0"] np.testing.assert_equal(len(series), 2) series = df["dim_0"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 1.0) np.testing.assert_equal(series[1], 2.0) series = df["dim_0"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 11.0) np.testing.assert_equal(series[1], 12.0) series = df["dim_1"] np.testing.assert_equal(len(series), 2) series = df["dim_1"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 3.0) 
np.testing.assert_equal(series[1], 4.0) series = df["dim_1"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 13.0) np.testing.assert_equal(series[1], 14.0) series = df["dim_2"] np.testing.assert_equal(len(series), 2) series = df["dim_2"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 5.0) np.testing.assert_equal(series[1], 6.0) series = df["dim_2"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 15.0) np.testing.assert_equal(series[1], 16.0) finally: os.remove(path) # Test that a file with a complete set of metadata and time-series of # different length parses correctly fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "true\n@univariate true\n@classLabel " "false\n@data\n") file_contents += "(0, 1), (1, 2):(0, 3):(0, 5), (1, 6)\n" file_contents += "(0, 11), (1, 12):(0, 13), (1,14):(0, 15)\n" tmp_file.write(file_contents) tmp_file.flush() # Parse the file df = load_from_tsfile_to_dataframe(path) # Test the DataFrame returned accurately reflects the data in # the file np.testing.assert_equal(len(df), 2) np.testing.assert_equal(len(df.columns), 3) series = df["dim_0"] np.testing.assert_equal(len(series), 2) series = df["dim_0"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 1.0) np.testing.assert_equal(series[1], 2.0) series = df["dim_0"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 11.0) np.testing.assert_equal(series[1], 12.0) series = df["dim_1"] np.testing.assert_equal(len(series), 2) series = df["dim_1"][0] np.testing.assert_equal(len(series), 1) np.testing.assert_equal(series[0], 3.0) series = df["dim_1"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 13.0) np.testing.assert_equal(series[1], 14.0) series = df["dim_2"] np.testing.assert_equal(len(series), 2) series = 
df["dim_2"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 5.0) np.testing.assert_equal(series[1], 6.0) series = df["dim_2"][1] np.testing.assert_equal(len(series), 1) np.testing.assert_equal(series[0], 15.0) finally: os.remove(path) # Test that a file with a complete set of metadata and data but an # inconsistent number of dimensions across cases is classed as invalid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "true\n@univariate true\n@classLabel " "false\n@data\n") file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6)\n" file_contents += "(0, 11), (1, 12):(0, 13), (1,14) \n" tmp_file.write(file_contents) tmp_file.flush() # Parse the file and assert that it is invalid np.testing.assert_raises(TsFileParseException, load_from_tsfile_to_dataframe, path) finally: os.remove(path) # Test that a file with a complete set of metadata and data but missing # values after a tuple is classed as invalid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "true\n@univariate true\n@classLabel " "false\n@data\n") file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5),\n" tmp_file.write(file_contents) tmp_file.flush() # Parse the file and assert that it is invalid np.testing.assert_raises(TsFileParseException, load_from_tsfile_to_dataframe, path) finally: os.remove(path) # Test that a file with a complete set of metadata and data and some # empty dimensions is classed as valid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "true\n@univariate true\n@classLabel " "false\n@data\n") file_contents += "(0, 1), (1, 2): :(0, 5), (1, 6)\n" file_contents += "(0, 11), (1, 12):(0, 13), (1,14) : \n" 
file_contents += ("(0, 21), (1, 22):(0, 23), (1,24) : (0," "25), (1, 26) \n") tmp_file.write(file_contents) tmp_file.flush() # Parse the file df = load_from_tsfile_to_dataframe(path) # Test the DataFrame returned accurately reflects the data in # the file np.testing.assert_equal(len(df), 3) np.testing.assert_equal(len(df.columns), 3) series = df["dim_0"] np.testing.assert_equal(len(series), 3) series = df["dim_0"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 1.0) np.testing.assert_equal(series[1], 2.0) series = df["dim_0"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 11.0) np.testing.assert_equal(series[1], 12.0) series = df["dim_0"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 21.0) np.testing.assert_equal(series[1], 22.0) series = df["dim_1"] np.testing.assert_equal(len(series), 3) series = df["dim_1"][0] np.testing.assert_equal(len(series), 0) series = df["dim_1"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 13.0) np.testing.assert_equal(series[1], 14.0) series = df["dim_1"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 23.0) np.testing.assert_equal(series[1], 24.0) series = df["dim_2"] np.testing.assert_equal(len(series), 3) series = df["dim_2"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 5.0) np.testing.assert_equal(series[1], 6.0) series = df["dim_2"][1] np.testing.assert_equal(len(series), 0) series = df["dim_2"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 25.0) np.testing.assert_equal(series[1], 26.0) finally: os.remove(path) # Test that a file with a complete set of metadata and data that # contains datetimes as timestamps and has some empty dimensions is # classed as valid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test 
Problem\n@timeStamps " "true\n@univariate true\n@classLabel " "false\n@data\n") file_contents += ("(01/01/2019 00:00:00, 1), (01/02/2019 " "00:00:00, 2) : " " : (01/05/2019 00:00:00, " "5), (01/06/2019 00:00:00, 6)\n") file_contents += ("(01/01/2020 00:00:00, 11), (01/02/2020 " "00:00:00, 12) : (01/03/2020 00:00:00, 13), " "(01/04/2020 00:00:00, 14) : \n") file_contents += ("(01/01/2021 00:00:00, 21), (01/02/2021 " "00:00:00, 22) : (01/03/2021 00:00:00, 23), " "(01/04/2021 00:00:00, 24) : \n") tmp_file.write(file_contents) tmp_file.flush() # Parse the file df = load_from_tsfile_to_dataframe(path) # Test the DataFrame returned accurately reflects the data in # the file np.testing.assert_equal(len(df), 3) np.testing.assert_equal(len(df.columns), 3) series = df["dim_0"] np.testing.assert_equal(len(series), 3) series = df["dim_0"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series["01/01/2019"], 1.0) np.testing.assert_equal(series["01/02/2019"], 2.0) series = df["dim_0"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series["01/01/2020"], 11.0) np.testing.assert_equal(series["01/02/2020"], 12.0) series = df["dim_0"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series["01/01/2021"], 21.0) np.testing.assert_equal(series["01/02/2021"], 22.0) series = df["dim_1"] np.testing.assert_equal(len(series), 3) series = df["dim_1"][0] np.testing.assert_equal(len(series), 0) series = df["dim_1"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series["01/03/2020"], 13.0) np.testing.assert_equal(series["01/04/2020"], 14.0) series = df["dim_1"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series["01/03/2021"], 23.0) np.testing.assert_equal(series["01/04/2021"], 24.0) series = df["dim_2"] np.testing.assert_equal(len(series), 3) series = df["dim_2"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series["01/05/2019"], 5.0) 
np.testing.assert_equal(series["01/06/2019"], 6.0) series = df["dim_2"][1] np.testing.assert_equal(len(series), 0) series = df["dim_2"][2] np.testing.assert_equal(len(series), 0) finally: os.remove(path) # Test that a file that mixes timestamp conventions is invalid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "true\n@univariate true\n@classLabel " "false\n@data\n") file_contents += ("(01/01/2019 00:00:00, 1), (01/02/2019 " "00:00:00, 2) : " " : (01/05/2019 00:00:00, " "5), (01/06/2019 00:00:00, 6)\n") file_contents += ("(00, 11), (1, 12) : (01/03/2020 00:00:00, 13), " "(01/04/2020 00:00:00, 14) : \n") file_contents += ("(01/01/2021 00:00:00, 21), (01/02/2021 " "00:00:00, 22) : (01/03/2021 00:00:00, 23), " "(01/04/2021 00:00:00, 24) : \n") tmp_file.write(file_contents) tmp_file.flush() # Parse the file and assert that it is invalid np.testing.assert_raises(TsFileParseException, load_from_tsfile_to_dataframe, path) finally: os.remove(path) # Test that a file with a complete set of metadata and data but missing # classes is classed as invalid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "true\n@univariate true\n@classLabel true 0 1 " "2\n@data\n") file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, 6)\n" file_contents += ("(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, " "16) \n") tmp_file.write(file_contents) tmp_file.flush() # Parse the file and assert that it is invalid np.testing.assert_raises(TsFileParseException, load_from_tsfile_to_dataframe, path) finally: os.remove(path) # Test that a file with a complete set of metadata and data but invalid # classes is classed as invalid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test 
Problem\n@timeStamps " "true\n@univariate true\n@classLabel true 0 1 " "2\n@data\n") file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, " "6) : 0 \n" file_contents += ("(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, " "16) : 3 \n") tmp_file.write(file_contents) tmp_file.flush() # Parse the file and assert that it is invalid np.testing.assert_raises(TsFileParseException, load_from_tsfile_to_dataframe, path) finally: os.remove(path) # Test that a file with a complete set of metadata and data with classes # is classed as valid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "true\n@univariate true\n@classLabel true 0 1 " "2\n@data\n") file_contents += "(0, 1), (1, 2):(0, 3), (1, 4):(0, 5), (1, " "6): 0\n" file_contents += ("(0, 11), (1, 12):(0, 13), (1,14):(0, 15), (1, " "16): 2 \n") tmp_file.write(file_contents) tmp_file.flush() # Parse the file df, y = load_from_tsfile_to_dataframe(path) # Test the DataFrame of X values returned accurately reflects # the data in the file np.testing.assert_equal(len(df), 2) np.testing.assert_equal(len(df.columns), 3) series = df["dim_0"] np.testing.assert_equal(len(series), 2) series = df["dim_0"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 1.0) np.testing.assert_equal(series[1], 2.0) series = df["dim_0"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 11.0) np.testing.assert_equal(series[1], 12.0) series = df["dim_1"] np.testing.assert_equal(len(series), 2) series = df["dim_1"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 3.0) np.testing.assert_equal(series[1], 4.0) series = df["dim_1"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 13.0) np.testing.assert_equal(series[1], 14.0) series = df["dim_2"] np.testing.assert_equal(len(series), 2) series = df["dim_2"][0] 
np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 5.0) np.testing.assert_equal(series[1], 6.0) series = df["dim_2"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 15.0) np.testing.assert_equal(series[1], 16.0) # Test that the class values are as expected np.testing.assert_equal(len(y), 2) np.testing.assert_equal(y[0], "0") np.testing.assert_equal(y[1], "2") finally: os.remove(path) # Test that a file with a complete set of metadata and data, with no # timestamps, is classed as valid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "false\n@univariate true\n@classLabel " "false\n@data\n") file_contents += "1,2:3,4:5,6\n" file_contents += "11,12:13,14:15,16\n" file_contents += "21,22:23,24:25,26\n" tmp_file.write(file_contents) tmp_file.flush() # Parse the file df = load_from_tsfile_to_dataframe(path) # Test the DataFrame returned accurately reflects the data in # the file np.testing.assert_equal(len(df), 3) np.testing.assert_equal(len(df.columns), 3) series = df["dim_0"] np.testing.assert_equal(len(series), 3) series = df["dim_0"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 1.0) np.testing.assert_equal(series[1], 2.0) series = df["dim_0"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 11.0) np.testing.assert_equal(series[1], 12.0) series = df["dim_0"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 21.0) np.testing.assert_equal(series[1], 22.0) series = df["dim_1"] np.testing.assert_equal(len(series), 3) series = df["dim_1"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 3.0) np.testing.assert_equal(series[1], 4.0) series = df["dim_1"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 13.0) np.testing.assert_equal(series[1], 14.0) series = 
df["dim_1"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 23.0) np.testing.assert_equal(series[1], 24.0) series = df["dim_2"] np.testing.assert_equal(len(series), 3) series = df["dim_2"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 5.0) np.testing.assert_equal(series[1], 6.0) series = df["dim_2"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 15.0) np.testing.assert_equal(series[1], 16.0) series = df["dim_2"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 25.0) np.testing.assert_equal(series[1], 26.0) finally: os.remove(path) # Test that a file with a complete set of metadata and data, with no # timestamps and some empty dimensions, is classed as valid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "false\n@univariate true\n@classLabel " "false\n@data\n") file_contents += "1,2::5,6\n" file_contents += "11,12:13,14:15,16\n" file_contents += "21,22:23,24:\n" tmp_file.write(file_contents) tmp_file.flush() # Parse the file df = load_from_tsfile_to_dataframe(path) # Test the DataFrame returned accurately reflects the data in # the file np.testing.assert_equal(len(df), 3) np.testing.assert_equal(len(df.columns), 3) series = df["dim_0"] np.testing.assert_equal(len(series), 3) series = df["dim_0"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 1.0) np.testing.assert_equal(series[1], 2.0) series = df["dim_0"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 11.0) np.testing.assert_equal(series[1], 12.0) series = df["dim_0"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 21.0) np.testing.assert_equal(series[1], 22.0) series = df["dim_1"] np.testing.assert_equal(len(series), 3) series = df["dim_1"][0] np.testing.assert_equal(len(series), 0) series 
= df["dim_1"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 13.0) np.testing.assert_equal(series[1], 14.0) series = df["dim_1"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 23.0) np.testing.assert_equal(series[1], 24.0) series = df["dim_2"] np.testing.assert_equal(len(series), 3) series = df["dim_2"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 5.0) np.testing.assert_equal(series[1], 6.0) series = df["dim_2"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 15.0) np.testing.assert_equal(series[1], 16.0) series = df["dim_2"][2] np.testing.assert_equal(len(series), 0) finally: os.remove(path) # Test that a file with a complete set of metadata and data, with no # timestamps and some empty dimensions and classes, is classed as valid fd, path = tempfile.mkstemp() try: with os.fdopen(fd, "w") as tmp_file: # Write the contents of the file file_contents = ("@problemName Test Problem\n@timeStamps " "false\n@univariate true\n@classLabel true cat " "bear dog\n@data\n") file_contents += "1,2::5,6:cat \n" file_contents += "11,12:13,14:15,16: dog\n" file_contents += "21,22:23,24:: bear \n" tmp_file.write(file_contents) tmp_file.flush() # Parse the file df, y = load_from_tsfile_to_dataframe(path) # Test the DataFrame of X values returned accurately reflects # the data in the file np.testing.assert_equal(len(df), 3) np.testing.assert_equal(len(df.columns), 3) series = df["dim_0"] np.testing.assert_equal(len(series), 3) series = df["dim_0"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 1.0) np.testing.assert_equal(series[1], 2.0) series = df["dim_0"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 11.0) np.testing.assert_equal(series[1], 12.0) series = df["dim_0"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 21.0) np.testing.assert_equal(series[1], 22.0) series = 
df["dim_1"] np.testing.assert_equal(len(series), 3) series = df["dim_1"][0] np.testing.assert_equal(len(series), 0) series = df["dim_1"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 13.0) np.testing.assert_equal(series[1], 14.0) series = df["dim_1"][2] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 23.0) np.testing.assert_equal(series[1], 24.0) series = df["dim_2"] np.testing.assert_equal(len(series), 3) series = df["dim_2"][0] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 5.0) np.testing.assert_equal(series[1], 6.0) series = df["dim_2"][1] np.testing.assert_equal(len(series), 2) np.testing.assert_equal(series[0], 15.0) np.testing.assert_equal(series[1], 16.0) series = df["dim_2"][2] np.testing.assert_equal(len(series), 0) # Test that the class values are as expected np.testing.assert_equal(len(y), 3) np.testing.assert_equal(y[0], "cat") np.testing.assert_equal(y[1], "dog") np.testing.assert_equal(y[2], "bear") finally: os.remove(path)
if eval(self.paa): paas_ = [] for seg in self.segs_: s = int((dim.shape[0]) * seg) if s < 1: continue #print(f"Compression: {seg}") paa_per_seg = PiecewiseAggregateApproximation(n_segments=s)\ .fit_transform(dim).flatten() paas_.extend(extract_stats(paa_per_seg)) temp.extend(paas_) else: temp.extend(extract_stats(dim)) x_new.append(temp) x_new = np.asarray(x_new) imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') imp_mean.fit(x_new) x_new = imp_mean.transform(x_new) return np.asarray(x_new) if __name__ == '__main__': paa = PAAStat(paa_='True', seg_=0.75) train_x, train_y = load_from_tsfile_to_dataframe( "../mtsc/data/LSST/LSST_TRAIN.ts") s = paa.transform(train_x.values) print(s.shape)
def getData(self, set_type):
    """Load or generate the pool/test windows for the requested data set.

    Parameters
    ----------
    set_type : str
        One of ``"generator"``, ``"GunPoint"``, ``"Crop"``, ``"FaceAll"``
        or ``"InsectWingbeat"``.

    Returns
    -------
    tuple
        ``(windows_pool, windows_test, y_pool, y_test, class_names,
        windowLength, n_classes, samplesPerWindow)``.

    Raises
    ------
    ValueError
        If ``set_type`` is not one of the supported names.
    """
    # Directory holding the ``<name>_TRAIN.ts`` / ``<name>_TEST.ts`` files.
    dataset_dir = "/home/tob/Datasets/"

    # Data sets from http://www.timeseriesclassification.com
    # (description.php?Dataset=<name>), all loaded the same way:
    # name -> (samplesPerWindow, n_classes, class-name prefix)
    tsc_datasets = {
        "Crop": (46, 24, "Class"),
        "FaceAll": (131, 14, "Student"),
        "InsectWingbeat": (30, 10, "Insect"),
    }

    if set_type == "generator":
        # Synthetic data produced by the project's generator.
        windowLength = 1
        samplesPerWindow = 100
        n_classes = 6
        n_windows = 1000
        Gen = ImbalancedDataGenerator(
            n_samples=samplesPerWindow,
            resolution=windowLength / samplesPerWindow,
            SNR_dB=50,
            variation=1,
            n_classes=n_classes,
            useseed=False,
            seed=5)
        windows_pool, windows_test, y_pool, y_test = Gen.GeneratePool(n_windows)
        class_names = Gen.class_names
    elif set_type == "GunPoint":
        windowLength = 1  # unspecified !
        samplesPerWindow = 50
        n_classes = 2
        from pyts.datasets import load_gunpoint
        windows_pool, windows_test, y_pool, y_test = load_gunpoint(return_X_y=True)
        class_names = ['gun', 'point']
        # Shift the labels down by one when neither... rather: when either
        # split has no label 0, so that the first class is encoded as 0.
        if y_pool.min() > 0 or y_test.min() > 0:
            print("1st class is decoded as zero (was 1)")
            y_pool -= 1  # class 1 = class 0
            y_test -= 1
    elif set_type in tsc_datasets:
        from sktime.utils.load_data import load_from_tsfile_to_dataframe
        windowLength = 1  # unspecified !
        samplesPerWindow, n_classes, name_prefix = tsc_datasets[set_type]
        windows_pool, y_pool = load_from_tsfile_to_dataframe(
            dataset_dir + set_type + "_TRAIN.ts")
        windows_test, y_test = load_from_tsfile_to_dataframe(
            dataset_dir + set_type + "_TEST.ts")
        # Always go through ``self``: the original called the bare name in
        # two of the three branches, which is inconsistent with "Crop".
        windows_pool, windows_test, y_pool, y_test = self.postProcessSetfromTSCcom(
            windows_pool, windows_test, y_pool, y_test, samplesPerWindow)
        class_names = [name_prefix + ' ' + str(i)
                       for i in range(1, n_classes + 1)]
    else:
        raise ValueError(
            "Unknown set_type {!r}; expected 'generator', 'GunPoint', "
            "'Crop', 'FaceAll' or 'InsectWingbeat'".format(set_type))

    # Make the data set imbalanced: classes 0 to 4 matter, class 5 is the
    # "don't care" class that absorbs all remaining classes.
    # NOTE(review): the original source had lost its indentation; this
    # section is assumed to apply to every data set, not only to
    # "InsectWingbeat" - confirm against the original file.
    n_classes = 6
    y_pool = np.clip(y_pool, 0, n_classes - 1)
    y_test = np.clip(y_test, 0, n_classes - 1)
    class_names = class_names[:n_classes]
    class_names[-1] = "don't care"
    print("imbalanced classes:")
    print(class_names)

    return (windows_pool, windows_test, y_pool, y_test, class_names,
            windowLength, n_classes, samplesPerWindow)
this_probs = this_probs * self.cv_accs[c] if output_probs is None: output_probs = this_probs else: output_probs = [[ output_probs[x][y] + this_probs[x][y] for y in range(0, len(output_probs[x])) ] for x in range(0, len(output_probs))] output_probs /= self.cv_sum return output_probs def predict(self, X): probs = self.predict_proba(X) labels = self.classifiers[0].classes_ preds = [labels[np.argmax(probs[x])] for x in range(0, len(probs))] return preds # Example usage with a limited amount of the GunPoint problem if __name__ == "__main__": dataset = "GunPoint" train_x, train_y = load_from_tsfile_to_dataframe( file_path="C:/temp/sktime_temp_data/" + dataset + "/", file_name=dataset + "_TRAIN.ts") ee = ElasticEnsemble() ee.fit(train_x.iloc[0:10], train_y[0:10]) preds = ee.predict(train_x.iloc[10:15]) print(preds)
str(j.start_pos) + "," + str(j.length) + "\n") f.write(",".join( map(str, data[j.series_id, j.start_pos:j.start_pos + j.length])) + "\n") f.write(",".join(map(str, j.data)) + "\n") f.close() if __name__ == "__main__": dataset = "GunPoint" # load_from_arff_to_tsfile("/home/david/arff-datasets/" + dataset + "/" + dataset + "_TRAIN.arff", # "/home/david/sktime-datasets/" + dataset + "/" + dataset + "_TRAIN.ts") # load_from_arff_to_tsfile("/home/david/arff-datasets/" + dataset + "/" + dataset + "_TEST.arff", # "/home/david/sktime-datasets/" + dataset + "/" + dataset + "_TEST.ts") train_x, train_y = load_from_tsfile_to_dataframe( "/home/david/sktime-datasets/" + dataset + "/" + dataset + "_TRAIN.ts") test_x, test_y = load_from_tsfile_to_dataframe( "/home/david/sktime-datasets/" + dataset + "/" + dataset + "_TEST.ts") a = RandomShapeletTransform(type_shapelet="Random", min_shapelet_length=3, max_shapelet_length=12, num_cases_to_sample=8, num_shapelets_to_sample_per_case=10, trim_shapelets=False, remove_self_similar=False, verbose=True) # a = RandomShapeletTransform(type_shapelet="Contracted", time_limit_in_mins=0.3, min_shapelet_length=3, max_shapelet_length=300, # num_shapelets_to_sample_per_case=5, trim_shapelets = False, remove_self_similar = False, verbose=True) # a = RandomShapeletTransform(type_shapelet="Full", verbose=True)
idx[1] += 1 constraints.append(idx) return constraints # change here the name of the extr archive = "Univariate2018_ts" for root, dirs, files in os.walk("./" + archive + "/"): for x in dirs: dataset = "./" + archive + "/" + x + "/" + x print(x) if not os.path.isdir("./" + archive + "_a2cnes/" + x): print(' --- > create') train_x, train_y = load_from_tsfile_to_dataframe(dataset + "_TRAIN.ts") test_x, test_y = load_from_tsfile_to_dataframe(dataset + "_TEST.ts") # compute min, max, for, normalization df_max = dict([(i, sys.float_info.min) for i in train_x.iloc[0].index]) df_min = dict([(i, sys.float_info.max) for i in train_x.iloc[0].index]) max_length = 0 for index, row in train_x.iterrows(): for id in row.index: max_ = row[id].max() if max_ > df_max[id]: df_max[id] = max_ min_ = row[id].min()
from sktime.utils.load_data import load_from_tsfile_to_dataframe
import os
import sktime
import numpy as np
import pandas as pd
import csv

# Global settings - adjust these for your environment.
DATA_PATH = os.path.join(os.path.dirname(sktime.__file__), "F:/BasicMotions/")
datasetname = 'BasicMotions'

train_x, train_y = load_from_tsfile_to_dataframe(
    os.path.join(DATA_PATH, datasetname + "_TRAIN.ts"))
test_x, test_y = load_from_tsfile_to_dataframe(
    os.path.join(DATA_PATH, datasetname + "_TEST.ts"))

# Process the training set: every sample is written to its own CSV file
# (train1.csv, train2.csv, ...). Transposing with zip() turns the
# per-dimension series of a sample into one CSV row per time step.
list_train_x = train_x.values.tolist()
for sample_no, dimensions in enumerate(list_train_x, start=1):
    csv_name = DATA_PATH + '1.original/train/train' + str(sample_no) + '.csv'
    with open(csv_name, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(zip(*dimensions))

# Encode the labels uniformly as 0, 1, 2, ... (the code given to a label is
# not necessarily related to the order in which the labels first appear).
list_train_y = pd.Categorical(train_y).codes  # numpy.ndarray
np.savetxt(DATA_PATH + '1.original/train/train_label.csv',
           list_train_y, fmt='%d', delimiter=',')
] n_datasets = len(datasets) # Run the fit and predict for i, dataset in enumerate(datasets): print(f'Dataset: {i + 1}/{n_datasets} {dataset}') # pre-allocate results results = np.zeros(3) # load data train_file = os.path.join(data_path, f'{dataset}/{dataset}_TRAIN.ts') test_file = os.path.join(data_path, f'{dataset}/{dataset}_TEST.ts') x_train, y_train = load_from_tsfile_to_dataframe(train_file) x_test, y_test = load_from_tsfile_to_dataframe(test_file) tsf = TimeSeriesForest() # fit try: s = time.time() tsf.fit(x_train, y_train) results[0] = time.time() - s # predict s = time.time() y_pred = tsf.predict(x_test) results[1] = time.time() - s
import os
from sktime.utils.load_data import load_from_tsfile_to_dataframe
import constants as const

# Location of the training split, taken from the project's constants module.
data_dir = const.DATA_DIR
file_name_train = const.FILE_NAME_TRAIN

# Parse the .ts file into the feature frame and the label array.
train_path = os.path.join(data_dir, file_name_train)
train_x, train_y = load_from_tsfile_to_dataframe(train_path)
'KNeighborsTimeSeriesClassifier', 'ShapeDTW', 'RandomIntervalSpectralForest', 'TimeSeriesForest' ] metamodels = [ 'MetaKNeighbors', 'MetaRandomForest', 'MetaLogisticRegression', 'MetaLSTM' ] dataset = sys.argv[1] _, y_test = load_from_tsfile_to_dataframe('../datasets/Univariate_ts/'+dataset+'/'+dataset+'_TEST.ts') y_test = pd.Series(y_test, dtype='float64', name='y_true_test') classifiers_test_predictions = pd.DataFrame() best_individual_classifier = {} for classifier in classifiers: test_predictions = pd.read_csv('../datasets/Univariate_ts/'+dataset+'/'+classifier+'_PREDICTION_TEST.csv') individual_metric = get_metrics(y_test.astype(int), test_predictions) if 'acc' not in best_individual_classifier.keys(): best_individual_classifier['classifier'] = classifier best_individual_classifier['metrics'] = individual_metric
import numpy as np
from pandas import DataFrame
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sktime.classification.compose import (
    ColumnEnsembleClassifier,
    TimeSeriesForestClassifier,
)
from sktime.datasets import load_airline, load_arrow_head
from sktime.forecasting.theta import ThetaForecaster
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.performance_metrics.forecasting import smape_loss
from sktime.utils.load_data import load_from_tsfile_to_dataframe

from db import block_data
from main import prepare_pools, prepare_average_assessment_windows, prepare_average_luck_windows
from prediction import predictor

if __name__ == "__main__":
    # Sample rows [10, -1, 2], [10, +1, 3], ..., [10, +1, 9]; the sign
    # alternates with the last column. Not used further in this script.
    data_2d_list = [[10, 1 if k % 2 else -1, k] for k in range(2, 10)]

    # Load the nested time-series frame and its labels, then split them.
    X, y = load_from_tsfile_to_dataframe(
        '/home/jamshid/PycharmProjects/pool-analysis/prediction/test_pandas_data.ts')
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # One time-series forest, fitted on column index 1 only.
    column_estimators = [
        ("TSF1", TimeSeriesForestClassifier(n_estimators=100), [1]),
    ]
    classifier = ColumnEnsembleClassifier(estimators=column_estimators)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Show the training frame, the predictions and the test accuracy.
    print(X_train)
    print(y_pred)
    print(accuracy_score(y_test, y_pred))
def _dataframe_to_numpy_list(df):
    """Turn each row of a nested DataFrame into one ``(n_columns, length)`` array.

    Every cell must expose ``to_numpy()`` and ``len()``. Within a row with
    more than one column, shorter channels are zero-padded at the end to the
    longest channel of that row so they can be stacked.
    """
    columns = list(df)
    numpy_list = []
    print('Processing dataset of size {}'.format(len(df)))
    for i, row in df.iterrows():
        if i % 100 == 0:
            print("Done {} / {}".format(i, len(df)))

        channel_arrays = [row[c].to_numpy() for c in columns]

        # Pad to same length (only needed when there are several channels)
        if len(channel_arrays) > 1:
            longest = max(len(ch) for ch in channel_arrays)
            channel_arrays = [
                np.pad(ch, [(0, longest - len(ch))], mode='constant')
                for ch in channel_arrays
            ]

        numpy_list.append(np.stack(channel_arrays, axis=0))
    return numpy_list


def transform_ts_to_npy_format(source_root_directory, archive_name, output_root_directory):
    """Convert every ``.ts`` data set of an archive into ``.npy`` files.

    For each name in ``dataset_names_for_archive[archive_name]`` the
    ``<name>_TRAIN.ts`` and ``<name>_TEST.ts`` files are read from
    ``<source_root_directory>/archives/<archive_name>_ts/<name>/`` and
    ``x_train.npy``, ``y_train.npy``, ``x_test.npy`` and ``y_test.npy`` are
    written to ``<output_root_directory>/archives/<archive_name>_npy/<name>/``.
    A data set is skipped when ``create_directory`` returns ``None`` for its
    output directory, which this code treats as "already converted".

    Parameters
    ----------
    source_root_directory : str
        Root directory containing the ``archives`` folder with the ``.ts`` files.
    archive_name : str
        Key into ``dataset_names_for_archive``; also used in the folder names.
    output_root_directory : str
        Root directory under which the ``.npy`` files are written.
    """
    for dataset_name in dataset_names_for_archive[archive_name]:
        root_dir_dataset = (source_root_directory + '/archives/' +
                            archive_name + '_ts/' + dataset_name + '/')
        out_dir = (output_root_directory + '/archives/' +
                   archive_name + '_npy/' + dataset_name + '/')

        if create_directory(out_dir) is None:
            # The source format is TS; the old message wrongly said "MAT".
            print('TS to NPY transformation was already done for dataset {}'.
                  format(dataset_name))
            continue

        df_train_x, y_train = load_from_tsfile_to_dataframe(
            os.path.join(root_dir_dataset, dataset_name + '_TRAIN.ts'))
        df_test_x, y_test = load_from_tsfile_to_dataframe(
            os.path.join(root_dir_dataset, dataset_name + '_TEST.ts'))

        x_train = _dataframe_to_numpy_list(df_train_x)
        x_test = _dataframe_to_numpy_list(df_test_x)

        # Number of variables, read from the first training sample.
        n_var = x_train[0].shape[0]

        max_length = get_func_length(x_train, x_test, func=max)
        min_length = get_func_length(x_train, x_test, func=min)

        print(dataset_name, 'max', max_length, 'min', min_length)
        print()

        print("Train data shape: ", x_train[0].shape)
        print("Test data shape: ", x_test[0].shape)

        # Bring all samples of both splits to the common maximum length.
        x_train = transform_to_same_length(x_train, n_var, max_length)
        x_test = transform_to_same_length(x_test, n_var, max_length)

        print("Train data shape: ", x_train.shape)
        print("Test data shape: ", x_test.shape)

        # save them
        np.save(out_dir + 'x_train.npy', x_train)
        np.save(out_dir + 'y_train.npy', y_train)
        np.save(out_dir + 'x_test.npy', x_test)
        np.save(out_dir + 'y_test.npy', y_test)

        print('Successfully transformed dataset {} from TS to NPY.'.format(
            dataset_name))
def __init__(self, filepath: pathlib.Path, transform=None) -> None:
    """Load a ``.ts`` file and keep its features, labels and transform.

    Args:
        filepath: Path of the ``.ts`` file; converted to ``str`` before it
            is handed to ``load_from_tsfile_to_dataframe``.
        transform: Optional object stored unchanged on ``self.transform``.
    """
    self.transform = transform
    features, labels = load_from_tsfile_to_dataframe(str(filepath))
    self.x = features
    self.y = labels