def fit(self, data): if data.kfold > 1: cv_eval = {} for k, cv_fold in enumerate(data.Xy_train.keys()): # print(' cv_fold: ', cv_fold) [(X_train, y_train), (X_val, y_val)] = data.Xy_train[cv_fold] X_train, X_val = from_2d_array_to_nested(X_train), from_2d_array_to_nested(X_val) knn = KNeighborsTimeSeriesClassifier(n_neighbors=5, distance="dtw", n_jobs=-1) knn.fit(X_train, y_train) eval_metrics = weareval.eval_output(knn.predict(X_val), y_val, tasktype=data.tasktype) cv_eval[cv_fold] = {'model': knn, # 'data': [(X_train, y_train), (X_val, y_val)], # store just IDs? 'metric': eval_metrics['mae'] if data.tasktype=='regression' else eval_metrics['balanced_acc_adj'], 'metrics': eval_metrics} # retain only best model tmp = {cv_fold:cv_eval[cv_fold]['metric'] for cv_fold in cv_eval.keys()} bst_fold = min(tmp, key=tmp.get) if data.tasktype=='regression' else max(tmp, key=tmp.get) self.knn = cv_eval[bst_fold]['model'] return {'model': self.knn, 'metrics': cv_eval[bst_fold]['metrics']} else: X_train, y_train = data.Xy_train X_val, y_val = data.Xy_val X_train, X_val = from_2d_array_to_nested(X_train), from_2d_array_to_nested(X_val) self.knn = knn = KNeighborsTimeSeriesClassifier(n_neighbors=5, distance="dtw", n_jobs=-1) self.knn.fit(X_train, y_train) eval_metrics = weareval.eval_output(self.knn.predict(X_val), y_val, tasktype=data.tasktype) return {'model': self.knn, 'metrics': eval_metrics}
class ShapeDTW(BaseClassifier): """ShapeDTW classifier. ShapeDTW[1] works by initially extracting a set of subsequences describing local neighbourhoods around each data point in a time series. These subsequences are then passed into a shape descriptor function that transforms these local neighbourhoods into a new representation. This new representation is then sent into DTW with 1-NN. Parameters ---------- n_neighbours : int, int, set k for knn (default =1). subsequence_length : int, defines the length of the subsequences(default=sqrt(n_timepoints)). shape_descriptor_function : string, defines the function to describe the set of subsequences (default = 'raw'). The possible shape descriptor functions are as follows: - 'raw' : use the raw subsequence as the shape descriptor function. - params = None - 'paa' : use PAA as the shape descriptor function. - params = num_intervals_paa (default=8) - 'dwt' : use DWT (Discrete Wavelet Transform) as the shape descriptor function. - params = num_levels_dwt (default=3) - 'slope' : use the gradient of each subsequence fitted by a total least squares regression as the shape descriptor function. - params = num_intervals_slope (default=8) - 'derivative' : use the derivative of each subsequence as the shape descriptor function. - params = None - 'hog1d' : use a histogram of gradients in one dimension as the shape desciptor function. - params = num_intervals_hog1d (defualt=2) = num_bins_hod1d (default=8) = scaling_factor_hog1d (default=0.1) - 'compound' : use a combination of two shape descriptors simultaneously. - params = weighting_factor (default=None) Defines how to scale values of a shape descriptor. If a value is not given, this value is tuned by 10-fold cross-validation on the training data. shape_descriptor_functions : string list, only applicable when the shape_descriptor_function is set to 'compound'. Use a list of shape descriptor functions at the same time. (default = ['raw','derivative']) metric_params : dictionary for metric parameters (default = None). Notes ----- ..[1] Jiaping Zhao and Laurent Itti, "shapeDTW: Shape Dynamic Time Warping", Pattern Recognition, 74, pp 171-184, 2018 http://www.sciencedirect.com/science/article/pii/S0031320317303710, """ def __init__( self, n_neighbours=1, subsequence_length=30, shape_descriptor_function="raw", shape_descriptor_functions=["raw", "derivative"], # noqa from flake8 B006 metric_params=None, ): self.n_neighbors = n_neighbours self.subsequence_length = subsequence_length self.shape_descriptor_function = shape_descriptor_function self.shape_descriptor_functions = shape_descriptor_functions self.metric_params = metric_params super(ShapeDTW, self).__init__() def _fit(self, X, y): """Train the classifier. Parameters ---------- X - pandas dataframe of training data of shape [n_instances,1]. y - list of class labels of shape [n_instances]. Returns ------- self : the shapeDTW object """ # Perform preprocessing on params. if not (isinstance(self.shape_descriptor_function, str)): raise TypeError("shape_descriptor_function must be an 'str'. \ Found '" + type(self.shape_descriptor_function).__name__ + "' instead.") if self.metric_params is None: self.metric_params = {} # If the shape descriptor is 'compound', # calculate the appropriate weighting_factor if self.shape_descriptor_function == "compound": self._calculate_weighting_factor_value(X, y) # Fit the SlidingWindowSegmenter sw = SlidingWindowSegmenter(self.subsequence_length) sw.fit(X) self.sw = sw # Transform the training data. X = self._preprocess(X) # Fit the kNN classifier self.knn = KNeighborsTimeSeriesClassifier(n_neighbors=self.n_neighbors) self.knn.fit(X, y) self.classes_ = self.knn.classes_ return self def _calculate_weighting_factor_value(self, X, y): """Calculate the appropriate weighting_factor. Check for the compound shape descriptor. If a value is given, the weighting_factor is set as the given value. If not, its tuned via a 10-fold cross-validation on the training data. Parameters ---------- X - training data in a dataframe of shape [n_instances,1] y - training data classes of shape [n_instances]. """ self.metric_params = { k.lower(): v for k, v in self.metric_params.items() } # Get the weighting_factor if one is provided if self.metric_params.get("weighting_factor") is not None: self.weighting_factor = self.metric_params.get("weighting_factor") else: # Tune it otherwise self._param_matrix = { "metric_params": [ { "weighting_factor": 0.1 }, { "weighting_factor": 0.125 }, { "weighting_factor": (1 / 6) }, { "weighting_factor": 0.25 }, { "weighting_factor": 0.5 }, { "weighting_factor": 1 }, { "weighting_factor": 2 }, { "weighting_factor": 4 }, { "weighting_factor": 6 }, { "weighting_factor": 8 }, { "weighting_factor": 10 }, ] } n = self.n_neighbors sl = self.subsequence_length sdf = self.shape_descriptor_function sdfs = self.shape_descriptor_functions if sdfs is None or not (len(sdfs) == 2): raise ValueError("When using 'compound', " + "shape_descriptor_functions must be a " + "string array of length 2.") mp = self.metric_params grid = GridSearchCV( estimator=ShapeDTW( n_neighbours=n, subsequence_length=sl, shape_descriptor_function=sdf, shape_descriptor_functions=sdfs, metric_params=mp, ), param_grid=self._param_matrix, cv=KFold(n_splits=10, shuffle=True), scoring="accuracy", ) grid.fit(X, y) self.weighting_factor = grid.best_params_["metric_params"][ "weighting_factor"] def _preprocess(self, X): # private method for performing the transformations on # the test/training data. It extracts the subsequences # and then performs the shape descriptor function on # each subsequence. X = self.sw.transform(X) # Feed X into the appropriate shape descriptor function X = self._generate_shape_descriptors(X) return X def _predict_proba(self, X): """Perform predictions on the testing data X. This function returns the probabilities for each class. Parameters ---------- X - pandas dataframe of testing data of shape [n_instances,1]. Returns ------- output : numpy array of shape = [n_instances, num_classes] of probabilities """ # Transform the test data in the same way as the training data. X = self._preprocess(X) # Classify the test data return self.knn.predict_proba(X) def _predict(self, X): """Find predictions for all cases in X. Parameters ---------- X : The testing input samples of shape [n_instances,1]. Returns ------- output : numpy array of shape = [n_instances] """ # Transform the test data in the same way as the training data. X = self._preprocess(X) # Classify the test data return self.knn.predict(X) def _generate_shape_descriptors(self, data): """Generate shape descriptors. This function is used to convert a list of subsequences into a list of shape descriptors to be used for classification. """ # Get the appropriate transformer objects if self.shape_descriptor_function != "compound": self.transformer = [ self._get_transformer(self.shape_descriptor_function) ] else: self.transformer = [] for x in self.shape_descriptor_functions: self.transformer.append(self._get_transformer(x)) if not (len(self.transformer) == 2): raise ValueError("When using 'compound', " + "shape_descriptor_functions must be a " + "string array of length 2.") # To hold the result of each transformer dataFrames = [] col_names = [x for x in range(len(data.columns))] # Apply each transformer on the set of subsequences for t in self.transformer: if t is None: # Do no transformations dataFrames.append(data) else: # Do the transformation and extract the resulting data frame. t.fit(data) newData = t.transform(data) dataFrames.append(newData) # Combine the arrays into one dataframe if self.shape_descriptor_function == "compound": result = self._combine_data_frames(dataFrames, self.weighting_factor, col_names) else: result = dataFrames[0] result.columns = col_names return result def _get_transformer(self, tName): """Extract the appropriate transformer. Parameters ---------- self : the ShapeDTW object. tName : the name of the required transformer. Returns ------- output : Base Transformer object corresponding to the class (or classes if its a compound transformer) of the required transformer. The transformer is configured with the parameters given in self.metric_params. throws : ValueError if a shape descriptor doesn't exist. """ parameters = self.metric_params tName = tName.lower() if parameters is None: parameters = {} parameters = {k.lower(): v for k, v in parameters.items()} self._check_metric_params(parameters) if tName == "raw": return None elif tName == "paa": num_intervals = parameters.get("num_intervals_paa") if num_intervals is None: return PAA() return PAA(num_intervals) elif tName == "dwt": num_levels = parameters.get("num_levels_dwt") if num_levels is None: return DWTTransformer() return DWTTransformer(num_levels) elif tName == "slope": num_intervals = parameters.get("num_intervals_slope") if num_intervals is None: return SlopeTransformer() return SlopeTransformer(num_intervals) elif tName == "derivative": return DerivativeSlopeTransformer() elif tName == "hog1d": num_intervals = parameters.get("num_intervals_hog1d") num_bins = parameters.get("num_bins_hog1d") scaling_factor = parameters.get("scaling_factor_hog1d") # All 3 paramaters are None if num_intervals is None and num_bins is None and scaling_factor is None: return HOG1DTransformer() # 2 parameters are None if num_intervals is None and num_bins is None: return HOG1DTransformer(scaling_factor=scaling_factor) if num_intervals is None and scaling_factor is None: return HOG1DTransformer(num_bins=num_bins) if num_bins is None and scaling_factor is None: return HOG1DTransformer(num_intervals=num_intervals) # 1 parameter is None if num_intervals is None: return HOG1DTransformer(scaling_factor=scaling_factor, num_bins=num_bins) if scaling_factor is None: return HOG1DTransformer(num_intervals=num_intervals, num_bins=num_bins) if num_bins is None: return HOG1DTransformer(scaling_factor=scaling_factor, num_intervals=num_intervals) # All parameters are given return HOG1DTransformer( num_intervals=num_intervals, num_bins=num_bins, scaling_factor=scaling_factor, ) else: raise ValueError("Invalid shape desciptor function.") def _check_metric_params(self, parameters): """Check for an invalid metric_params.""" valid_metric_params = [ "num_intervals_paa", "num_levels_dwt", "num_intervals_slope", "num_intervals_hog1d", "num_bins_hog1d", "scaling_factor_hog1d", "weighting_factor", ] names = list(parameters.keys()) for x in names: if not (x in valid_metric_params): raise ValueError(x + " is not a valid metric parameter." + "Make sure the shape descriptor function" + " name is at the end of the metric " + "parameter name.") def _combine_data_frames(self, dataFrames, weighting_factor, col_names): """Combine two dataframes together into a single dataframe. Used when the shape_descriptor_function is set to "compound". """ first_desc = dataFrames[0] second_desc = dataFrames[1] first_desc_array = [] second_desc_array = [] # Convert the dataframes into arrays for x in first_desc.columns: first_desc_array.append( from_nested_to_2d_array(first_desc[x], return_numpy=True)) for x in second_desc.columns: second_desc_array.append( from_nested_to_2d_array(second_desc[x], return_numpy=True)) # Concatenate the arrays together res = [] for x in range(len(first_desc_array)): dim1 = [] for y in range(len(first_desc_array[x])): dim2 = [] dim2.extend(first_desc_array[x][y]) dim2.extend(second_desc_array[x][y] * weighting_factor) dim1.append(dim2) res.append(dim1) res = np.asarray(res) # Convert to pandas dataframe df = pd.DataFrame() for col in col_names: colToAdd = [] for row in range(len(res[col])): inst = res[col][row] colToAdd.append(pd.Series(inst)) df[col] = colToAdd return df
test_file = os.path.join(data_path, f'{dataset}/{dataset}_TEST.ts') x_train, y_train = load_from_tsfile_to_dataframe(train_file) x_test, y_test = load_from_tsfile_to_dataframe(test_file) tsf = KNeighborsTimeSeriesClassifier() # fit try: s = time.time() tsf.fit(x_train, y_train) results[0] = time.time() - s # predict s = time.time() y_pred = tsf.predict(x_test) results[1] = time.time() - s # catch and raise user exceptions except (KeyboardInterrupt, SystemExit): raise # skip over all other exception except: print("error - skipping: ", dataset) continue # score results[2] = accuracy_score(y_test, y_pred) print('{},{},{},{}\n'.format(dataset, results[0], results[1], results[2]))