def test_teaser_near_classification_points():
    """Test of TEASER with incremental time stamps outside defined class points."""
    X_train, y_train, X_test, y_test, indices = load_unit_data()

    # train TEASER
    teaser = TEASER(
        random_state=0,
        classification_points=[6, 10, 14, 18, 24],
        estimator=TimeSeriesForestClassifier(n_estimators=10, random_state=0),
    )
    teaser.fit(X_train, y_train)

    # use test_points that are not within the list above
    test_points = [7, 11, 19, 20]

    X_test = from_nested_to_3d_numpy(X_test)
    states = None
    for i in test_points:
        X = X_test[indices, :, :i]

        if i == 20:
            with pytest.raises(ValueError):
                probas, decisions, states = teaser.predict_proba(X, state_info=states)
        else:
            probas, decisions, states = teaser.predict_proba(X, state_info=states)
def _transform_words(self, X):
    if self.use_first_order_differences:
        X = self._add_first_order_differences(X)

    bag_all_words = [dict() for _ in range(len(X))]

    # On each dimension, perform SFA
    for ind, column in enumerate(self.col_names):
        X_dim = X[[column]]
        X_dim = from_nested_to_3d_numpy(X_dim)

        for i, window_size in enumerate(self.window_sizes[ind]):
            # SFA transform
            sfa_words = self.SFA_transformers[ind][i].transform(X_dim)
            bag = sfa_words[0]

            # merging bag-of-patterns of different window sizes
            # into a single bag-of-patterns with a prefix indicating
            # the used window length
            highest = np.int32(self.highest_bits[ind])
            for j in range(len(bag)):
                for (key, value) in bag[j].items():
                    # append the prefixes to the words to distinguish
                    # between window sizes
                    word = MUSE._shift_left(
                        key, highest, ind, self.highest_dim_bit, window_size
                    )
                    bag_all_words[j][word] = value

    return bag_all_words
def wrapper(self, data, labels=None, **kwargs):
    # Check if pandas so we can convert back
    is_pandas = isinstance(data, pd.DataFrame)
    pd_idx = data.index if is_pandas else None

    # Fit checks
    if check_fitted:
        self.check_is_fitted()

    # First convert to pandas so everything is in the same format
    if labels is None:
        data = check_X(data, coerce_to_pandas=True)
    else:
        data, labels = check_X_y(data, labels, coerce_to_pandas=True)

    # Now convert it to a numpy array
    # Note sktime uses [N, C, L] whereas the signature code uses shape
    # [N, L, C] (C being channels), so we must transpose.
    data = np.transpose(from_nested_to_3d_numpy(data), [0, 2, 1])

    # Apply the function to the transposed array
    if labels is None:
        output = func(self, data, **kwargs)
    else:
        output = func(self, data, labels, **kwargs)

    # Convert back
    if all([is_pandas, isinstance(output, np.ndarray), not force_numpy]):
        output = pd.DataFrame(index=pd_idx, data=output)

    return output
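# Illustrative sketch (not part of the library) of the axis convention the
# wrapper above handles: sktime panels are laid out as [N, C, L]
# (instances, channels, length), while the signature code expects
# [N, L, C], so the last two axes are transposed on the way in and back on
# the way out. The toy array below is an assumption for demonstration only.
import numpy as np

panel_ncl = np.arange(2 * 3 * 4).reshape(2, 3, 4)  # [N=2, C=3, L=4]
panel_nlc = np.transpose(panel_ncl, [0, 2, 1])     # [N=2, L=4, C=3]
assert panel_nlc.shape == (2, 4, 3)
# transposing the same axes again recovers the original layout exactly
assert np.array_equal(np.transpose(panel_nlc, [0, 2, 1]), panel_ncl)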
def test_prob_threshold_on_unit_test_data():
    """Test of ProbabilityThresholdEarlyClassifier on unit test data."""
    # load unit test data
    X_train, y_train = load_unit_test(split="train", return_X_y=True)
    X_test, y_test = load_unit_test(split="test", return_X_y=True)
    indices = np.random.RandomState(0).choice(len(y_train), 10, replace=False)

    # train probability threshold
    pt = ProbabilityThresholdEarlyClassifier(
        random_state=0,
        classification_points=[6, 16, 24],
        probability_threshold=1,
        estimator=TimeSeriesForestClassifier(n_estimators=10, random_state=0),
    )
    pt.fit(X_train, y_train)

    final_probas = np.zeros((10, 2))
    final_decisions = np.zeros(10)
    X_test = from_nested_to_3d_numpy(X_test)
    states = None
    for i in pt.classification_points:
        X = X_test[indices, :, :i]
        probas = pt.predict_proba(X)
        decisions, states = pt.decide_prediction_safety(X, probas, states)

        for n in range(10):
            if decisions[n] and final_decisions[n] == 0:
                final_probas[n] = probas[n]
                final_decisions[n] = i

    testing.assert_array_equal(final_probas, pt_unit_test_probas)
def test_teaser_with_different_decision_maker():
    """Test of TEASER with a different one-class classifier."""
    X_train, y_train, X_test, y_test, indices = load_unit_data()

    # train TEASER
    teaser = TEASER(
        random_state=0,
        classification_points=[6, 10, 16, 24],
        estimator=TimeSeriesForestClassifier(n_estimators=10, random_state=0),
        one_class_classifier=IsolationForest(n_estimators=5),
        one_class_param_grid={"bootstrap": [True, False]},
    )
    teaser.fit(X_train, y_train)

    final_probas = np.zeros((10, 2))
    final_decisions = np.zeros(10)
    X_test = from_nested_to_3d_numpy(X_test)
    states = None
    for i in teaser.classification_points:
        X = X_test[indices, :, :i]
        probas, decisions, states = teaser.predict_proba(X, state_info=states)

        for n in range(10):
            if decisions[n] and final_decisions[n] == 0:
                final_probas[n] = probas[n]
                final_decisions[n] = i

    testing.assert_array_equal(final_probas, teaser_if_unit_test_probas)
def test_teaser_on_unit_test_data():
    """Test of TEASER on unit test data."""
    X_train, y_train, X_test, y_test, indices = load_unit_data()

    # train TEASER
    teaser = TEASER(
        random_state=0,
        classification_points=[6, 10, 16, 24],
        estimator=TimeSeriesForestClassifier(n_estimators=10, random_state=0),
    )
    teaser.fit(X_train, y_train)

    final_probas = np.zeros((10, 2))
    final_decisions = np.zeros(10)
    X_test = from_nested_to_3d_numpy(X_test)
    states = None
    for i in teaser.classification_points:
        X = X_test[indices, :, :i]
        probas, decisions, states = teaser.predict_proba(X, state_info=states)

        for n in range(10):
            if decisions[n] and final_decisions[n] == 0:
                final_probas[n] = probas[n]
                final_decisions[n] = i

    testing.assert_array_equal(final_probas, teaser_unit_test_probas)
def test_from_nested_to_3d_numpy(n_instances, n_columns, n_timepoints):
    """Test from_nested_to_3d_numpy for correctness."""
    nested, _ = make_classification_problem(n_instances, n_columns, n_timepoints)
    array = from_nested_to_3d_numpy(nested)

    # check types and shapes
    assert isinstance(array, np.ndarray)
    assert array.shape == (n_instances, n_columns, n_timepoints)

    # check values of a random series
    np.testing.assert_array_equal(nested.iloc[1, 0], array[1, 0, :])
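# Hand-rolled illustration (independent of sktime's converters) of the
# layout the test above verifies: each cell of the nested DataFrame holds
# one pd.Series, and cell (i, j) becomes array[i, j, :] in the 3D output.
# The tiny DataFrame below is an assumption for demonstration only.
import numpy as np
import pandas as pd

nested_demo = pd.DataFrame(
    {
        "dim_0": [pd.Series([1.0, 2.0, 3.0]), pd.Series([4.0, 5.0, 6.0])],
        "dim_1": [pd.Series([7.0, 8.0, 9.0]), pd.Series([0.0, 1.0, 2.0])],
    }
)
array_demo = np.stack(
    [np.stack([cell.to_numpy() for cell in row]) for _, row in nested_demo.iterrows()]
)
assert array_demo.shape == (2, 2, 3)  # (n_instances, n_columns, n_timepoints)
np.testing.assert_array_equal(nested_demo.iloc[1, 0], array_demo[1, 0, :])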
def _fit_local(
    self, local_X: pd.DataFrame, local_y: np.ndarray, clone: bool = False
) -> None:
    """Fit the local classifier.

    Args:
        local_X (pd.DataFrame): Training data (sensor windows, in sktime
            nested DataFrame format).
        local_y (np.ndarray): Training labels (0's and 1's).
        clone (bool, optional): Clone flag. True for internal cross
            validation. Defaults to False.
    """
    if len(local_X.columns) > 1:
        local_X = convert.from_nested_to_3d_numpy(local_X)
    clf = self.__local_clf2 if clone else self.local_clf
    clf.fit(local_X, local_y)
def _reproduce_early_classification_unit_test(estimator):
    X_train, y_train = load_unit_test(split="train")
    X_test, y_test = load_unit_test(split="test")
    indices = np.random.RandomState(0).choice(len(y_train), 10, replace=False)

    estimator.fit(X_train, y_train)

    final_probas = np.zeros((10, 2))
    final_decisions = np.zeros(10)
    X_test = from_nested_to_3d_numpy(X_test)
    states = None
    for i in estimator.classification_points:
        X = X_test[indices, :, :i]
        probas, decisions, states = estimator.predict_proba(X, state_info=states)

        for n in range(10):
            if decisions[n] and final_decisions[n] == 0:
                final_probas[n] = probas[n]
                final_decisions[n] = i

    return final_probas
def transform_single_feature(self, X, feature, case_id=None):
    """Transform data into a specified catch22 feature.

    Parameters
    ----------
    X : pandas DataFrame, input time series. Currently univariate only.
    feature : int, catch22 feature id or String, catch22 feature name.
    case_id : int, identifier for the current set of cases. If the case_id
        is not None and the same as the previously used case_id,
        calculations from previous features will be reused.

    Returns
    -------
    Numpy array containing a catch22 feature for each input series.
    """
    if isinstance(feature, (int, np.integer, float, np.floating)):
        if feature > 21 or feature < 0:
            raise ValueError("Invalid catch22 feature ID")
    elif isinstance(feature, str):
        if feature in feature_names:
            feature = feature_names.index(feature)
        else:
            raise ValueError("Invalid catch22 feature name")
    else:
        raise ValueError("catch22 feature name or ID required")

    if isinstance(X, pd.DataFrame):
        X = from_nested_to_3d_numpy(X)

    if len(X.shape) > 2:
        n_instances, n_dims, series_length = X.shape
        if n_dims > 1:
            raise ValueError(
                "transform_single_feature can only handle univariate series "
                "currently."
            )
        X = np.reshape(X, (n_instances, -1))
    else:
        n_instances, series_length = X.shape

    if case_id is not None:
        if case_id != self._case_id:
            self._case_id = case_id
            self._st_n_instances = n_instances
            self._st_series_length = series_length
            self._outlier_series = [None] * n_instances
            self._smin = [None] * n_instances
            self._smax = [None] * n_instances
            self._smean = [None] * n_instances
            self._fft = [None] * n_instances
            self._ac = [None] * n_instances
            self._acfz = [None] * n_instances
        else:
            if (
                n_instances != self._st_n_instances
                or series_length != self._st_series_length
            ):
                raise ValueError(
                    "Catch22: case_id is the same, but n_instances and "
                    "series_length do not match those last seen for the "
                    "single feature transform."
                )

    c22_list = Parallel(n_jobs=self.n_jobs)(
        delayed(self._transform_case_single)(
            X[i],
            feature,
            case_id,
            i,
        )
        for i in range(n_instances)
    )

    if self.replace_nans:
        c22_list = np.nan_to_num(c22_list, copy=False, nan=0, posinf=0, neginf=0)

    return np.asarray(c22_list)
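# Standalone sketch of the case_id caching contract documented above
# (an illustrative class, not the sktime implementation): per-series
# intermediates survive only while consecutive calls pass the same
# case_id, so several single-feature transforms over one dataset can
# share work, and a new case_id drops the stale results.
class _CaseCacheSketch:
    def __init__(self):
        self._case_id = None
        self._store = {}

    def get(self, case_id, key, compute):
        if case_id != self._case_id:  # new set of cases: reset the cache
            self._case_id = case_id
            self._store = {}
        if key not in self._store:
            self._store[key] = compute()
        return self._store[key]


_cache = _CaseCacheSketch()
_series = [3.0, 1.0, 2.0]
_first = _cache.get(case_id=7, key="min", compute=lambda: min(_series))
_second = _cache.get(case_id=7, key="min", compute=lambda: min(_series))  # cached
assert _first == _second == 1.0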
def _fit(self, X, y):
    """Build a WEASEL+MUSE classifier from the training set (X, y).

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_dims]
        Nested dataframe with multivariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    y = np.asarray(y)
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # add first order differences in each dimension to TS
    if self.use_first_order_differences:
        X = self._add_first_order_differences(X)

    # Window length parameter space dependent on series length
    self.col_names = X.columns

    rng = check_random_state(self.random_state)

    self.n_dims = len(self.col_names)
    self.highest_dim_bit = (math.ceil(math.log2(self.n_dims))) + 1
    self.highest_bits = np.zeros(self.n_dims)

    self.SFA_transformers = [[] for _ in range(self.n_dims)]

    # the words of all dimensions and all time series
    all_words = [dict() for _ in range(X.shape[0])]

    # On each dimension, perform SFA
    for ind, column in enumerate(self.col_names):
        X_dim = X[[column]]
        X_dim = from_nested_to_3d_numpy(X_dim)
        series_length = X_dim.shape[-1]  # TODO compute minimum over all ts?

        # increment window size in steps of 'win_inc'
        win_inc = self._compute_window_inc(series_length)

        self.max_window = int(min(series_length, self.max_window))
        if self.min_window > self.max_window:
            raise ValueError(
                f"Error in MUSE: min_window = {self.min_window} is bigger "
                f"than max_window = {self.max_window}, series length is "
                f"{series_length}. Try setting min_window to be smaller "
                f"than the series length in the constructor, but note the "
                f"classifier may not work at all with very short series."
            )

        self.window_sizes.append(
            list(range(self.min_window, self.max_window, win_inc))
        )

        self.highest_bits[ind] = math.ceil(math.log2(self.max_window)) + 1

        for window_size in self.window_sizes[ind]:
            transformer = SFA(
                word_length=rng.choice(self.word_lengths),
                alphabet_size=self.alphabet_size,
                window_size=window_size,
                norm=rng.choice(self.norm_options),
                anova=self.anova,
                binning_method=rng.choice(self.binning_strategies),
                bigrams=self.bigrams,
                remove_repeat_words=False,
                lower_bounding=False,
                save_words=False,
            )

            sfa_words = transformer.fit_transform(X_dim, y)

            self.SFA_transformers[ind].append(transformer)
            bag = sfa_words[0]

            # chi-squared test to keep only relevant features
            relevant_features = {}
            apply_chi_squared = self.p_threshold < 1
            if apply_chi_squared:
                vectorizer = DictVectorizer(sparse=True, dtype=np.int32, sort=False)
                bag_vec = vectorizer.fit_transform(bag)

                chi2_statistics, p = chi2(bag_vec, y)
                relevant_features_idx = np.where(p <= self.p_threshold)[0]
                relevant_features = set(
                    np.array(vectorizer.feature_names_)[relevant_features_idx]
                )

            # merging bag-of-patterns of different window sizes
            # into a single bag-of-patterns with a prefix indicating
            # the used window length
            highest = np.int32(self.highest_bits[ind])
            for j in range(len(bag)):
                for (key, value) in bag[j].items():
                    # chi-squared test
                    if (not apply_chi_squared) or (key in relevant_features):
                        # append the prefixes to the words to
                        # distinguish between window sizes
                        word = MUSE._shift_left(
                            key, highest, ind, self.highest_dim_bit, window_size
                        )
                        all_words[j][word] = value

    self.clf = make_pipeline(
        DictVectorizer(sparse=True, sort=False),
        # StandardScaler(with_mean=True, copy=False),
        LogisticRegression(
            max_iter=5000,
            solver="liblinear",
            dual=True,
            # class_weight="balanced",
            penalty="l2",
            random_state=self.random_state,
        ),
    )

    self.clf.fit(all_words, y)
    return self
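# Illustrative sketch of the word-prefixing idea behind MUSE._shift_left:
# the SFA word sits in the low bits, and the window size and dimension
# index are packed into the bits above it, so identical words from
# different window sizes or dimensions map to distinct keys in the shared
# bag-of-patterns. The exact bit layout below is an assumption for
# demonstration, not necessarily MUSE's.
def _shift_left_sketch(word, word_bits, dim, dim_bits, window_size):
    return ((window_size << dim_bits | dim) << word_bits) | word


_a = _shift_left_sketch(0b1011, word_bits=8, dim=0, dim_bits=2, window_size=10)
_b = _shift_left_sketch(0b1011, word_bits=8, dim=0, dim_bits=2, window_size=12)
assert _a != _b  # same word, different window sizes -> distinct keys
assert _a & 0xFF == 0b1011  # the low bits still hold the original word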
def load_from_tsfile(
    full_file_path_and_name,
    replace_missing_vals_with="NaN",
    return_y=True,
):
    """Load time series data from a .ts file into X and (optionally) y.

    Data is returned as a 2D (univariate) or 3D (multivariate) numpy array
    if the series are of equal length, or as a nested pandas DataFrame if
    they are of unequal length. If present, y is loaded into a 1D array.

    Parameters
    ----------
    full_file_path_and_name : str
        The full pathname of the .ts file to read.
    replace_missing_vals_with : str, default NaN
        The value that missing values in the text file should be replaced
        with prior to parsing.
    return_y : boolean, default True
        Whether to return the y variable, if it is present.

    Returns
    -------
    X : DataFrame or ndarray
    y (optional) : ndarray
    """
    # Initialize flags and variables used when parsing the file
    is_first_case = True
    instance_list = []
    class_val_list = []
    line_num = 0
    num_dimensions = 0
    num_cases = 0
    with open(full_file_path_and_name, "r", encoding="utf-8") as file:
        _meta_data = _read_header(file, full_file_path_and_name)
        for line in file:
            num_cases += 1
            line = line.replace("?", replace_missing_vals_with)
            dimensions = line.split(":")

            # If this is the first instance, note the number of dimensions.
            # This must be the same for all cases.
            if is_first_case:
                num_dimensions = len(dimensions)
                if _meta_data["has_class_labels"]:
                    num_dimensions -= 1
                instance_list = [[] for _ in range(num_dimensions)]
                is_first_case = False
                _meta_data["num_dimensions"] = num_dimensions

            # See how many dimensions this case has
            this_line_num_dim = len(dimensions)
            if _meta_data["has_class_labels"]:
                this_line_num_dim -= 1
            if this_line_num_dim != _meta_data["num_dimensions"]:
                raise IOError(
                    f"Error in input {full_file_path_and_name}: all cases "
                    f"must have {num_dimensions} dimensions, but case "
                    f"{num_cases} has {this_line_num_dim}."
                )

            # Process the data for each dimension
            for dim in range(0, _meta_data["num_dimensions"]):
                dimension = dimensions[dim].strip()
                if dimension:
                    data_series = dimension.split(",")
                    data_series = [float(i) for i in data_series]
                    instance_list[dim].append(pd.Series(data_series))
                else:
                    instance_list[dim].append(pd.Series(dtype="object"))

            if _meta_data["has_class_labels"]:
                class_val_list.append(
                    dimensions[_meta_data["num_dimensions"]].strip()
                )
            line_num += 1

    # Check that the file was not empty
    if line_num:
        # Create a DataFrame from the parsed data
        data = pd.DataFrame(dtype=np.float32)
        for dim in range(0, _meta_data["num_dimensions"]):
            data["dim_" + str(dim)] = instance_list[dim]
        if not _meta_data["has_timestamps"] and _meta_data["is_equal_length"]:
            if _meta_data["is_univariate"]:
                data = from_nested_to_2d_np_array(data)
            else:
                data = from_nested_to_3d_numpy(data)
        if return_y and not _meta_data["has_class_labels"]:
            raise IOError(
                f"class labels have been requested, but they are not "
                f"present in the file {full_file_path_and_name}"
            )
        if _meta_data["has_class_labels"] and return_y:
            return data, np.asarray(class_val_list)
        else:
            return data
    else:
        raise IOError(
            f"Empty file {full_file_path_and_name} with header info but no cases"
        )
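# Standalone sketch of how one .ts data line is parsed by the loop above:
# dimensions are separated by ":", values within a dimension by ",", and
# (when class labels are present) the final field is the label. The line
# below is illustrative, not from a real dataset.
_line = "1.0,2.0,3.0:4.0,5.0,6.0:classA"
_fields = _line.split(":")
_dims, _label = _fields[:-1], _fields[-1].strip()
_series = [[float(v) for v in dim.split(",")] for dim in _dims]
assert _series == [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
assert _label == "classA"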
def check_X(
    X,
    enforce_univariate=False,
    enforce_min_instances=1,
    enforce_min_columns=1,
    coerce_to_numpy=False,
    coerce_to_pandas=False,
):
    """Validate input data.

    Parameters
    ----------
    X : pd.DataFrame or np.array
        Input data
    enforce_univariate : bool, optional (default=False)
        Enforce that X is univariate.
    enforce_min_instances : int, optional (default=1)
        Enforce minimum number of instances.
    enforce_min_columns : int, optional (default=1)
        Enforce minimum number of columns (or time-series variables).
    coerce_to_numpy : bool, optional (default=False)
        If True, X will be coerced to a 3-dimensional numpy array.
    coerce_to_pandas : bool, optional (default=False)
        If True, X will be coerced to a nested pandas DataFrame.

    Returns
    -------
    X : pd.DataFrame or np.array
        Checked and possibly converted input data

    Raises
    ------
    ValueError
        If X is invalid input data
    """
    # check input type
    if coerce_to_pandas and coerce_to_numpy:
        raise ValueError(
            "`coerce_to_pandas` and `coerce_to_numpy` cannot both be set to True"
        )

    if not isinstance(X, VALID_X_TYPES):
        raise ValueError(
            f"X must be a pd.DataFrame or a np.array, but found: {type(X)}"
        )

    # check np.array
    # check first if we have the right number of dimensions, otherwise we
    # may not be able to get the shape of the second dimension below
    if isinstance(X, np.ndarray):
        if not X.ndim == 3:
            raise ValueError(
                f"If passed as a np.array, X must be a 3-dimensional "
                f"array, but found shape: {X.shape}"
            )
        if coerce_to_pandas:
            X = from_3d_numpy_to_nested(X)

    # enforce minimum number of columns
    n_columns = X.shape[1]
    if n_columns < enforce_min_columns:
        raise ValueError(
            f"X must contain at least: {enforce_min_columns} columns, "
            f"but found only: {n_columns}."
        )

    # enforce univariate data
    if enforce_univariate and n_columns > 1:
        raise ValueError(
            f"X must be univariate with X.shape[1] == 1, but found: "
            f"X.shape[1] == {n_columns}."
        )

    # enforce minimum number of instances
    if enforce_min_instances > 0:
        _enforce_min_instances(X, min_instances=enforce_min_instances)

    # check pd.DataFrame
    if isinstance(X, pd.DataFrame):
        if not is_nested_dataframe(X):
            raise ValueError(
                "If passed as a pd.DataFrame, X must be a nested "
                "pd.DataFrame, with pd.Series or np.arrays inside cells."
            )
        # convert pd.DataFrame
        if coerce_to_numpy:
            X = from_nested_to_3d_numpy(X)

    return X
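# Hedged usage sketch for check_X, assuming the surrounding module's
# imports (numpy as np and the converters above) are available: a
# (10, 1, 50) panel passes the shape checks and coerce_to_pandas turns it
# into the nested format, while a 2D array is rejected with a ValueError.
_X3d = np.zeros((10, 1, 50))
_X_nested = check_X(_X3d, enforce_univariate=True, coerce_to_pandas=True)
try:
    check_X(_X3d[:, 0, :])  # 2D input is not a valid panel
except ValueError:
    pass  # expected: X must be 3-dimensional when passed as a np.array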
def _fit(self, X, y):
    """Build an ensemble of 1-NN classifiers from the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed,
        it must have a single column; the ensemble is not configured to
        handle multivariate data.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    # Derivative DTW (DDTW) uses the regular DTW algorithm on data that
    # are transformed into derivatives.
    # To increase the efficiency of DDTW we can pre-transform the data
    # into derivatives, and then call the standard DTW algorithm on it,
    # rather than transforming each series every time a distance
    # calculation is made. Please note that using DDTW elsewhere will
    # not benefit from this speed enhancement.
    if "ddtw" in self.distance_measures or "wddtw" in self.distance_measures:
        der_X = DerivativeSlopeTransformer().fit_transform(X)
        # convert back to numpy
        if isinstance(der_X, pd.DataFrame):
            der_X = from_nested_to_3d_numpy(der_X)
    else:
        der_X = None

    self.train_accs_by_classifier = np.zeros(len(self.distance_measures))
    self.estimators_ = [None] * len(self.distance_measures)
    rand = np.random.RandomState(self.random_state)

    # The default EE uses all training instances for setting parameters,
    # and 100 parameter options per elastic measure. The
    # prop_train_in_param_finding and prop_of_param_options attributes of
    # this class can be used to control this, however, using fewer cases
    # to optimise parameters on the training data and/or using fewer
    # parameter options.
    #
    # For using fewer training instances the appropriate number of cases
    # must be sampled from the data. This is achieved through the use of
    # a deterministic StratifiedShuffleSplit.
    #
    # For using fewer parameter options a RandomizedSearchCV is used in
    # place of a GridSearchCV.
    param_train_x = None
    der_param_train_x = None
    param_train_y = None

    # If using fewer cases for parameter optimisation, use the
    # StratifiedShuffleSplit:
    if self.proportion_train_in_param_finding < 1:
        if self.verbose > 0:
            print(  # noqa: T001
                "Restricting training cases for parameter optimisation: ",
                end="",
            )
        sss = StratifiedShuffleSplit(
            n_splits=1,
            test_size=1 - self.proportion_train_in_param_finding,
            random_state=rand,
        )
        for train_index, _ in sss.split(X, y):
            param_train_x = X[train_index, :]
            param_train_y = y[train_index]
            if der_X is not None:
                der_param_train_x = der_X[train_index, :]
            if self.verbose > 0:
                print(  # noqa: T001
                    "using "
                    + str(len(param_train_x))
                    + " training cases instead of "
                    + str(len(X))
                    + " for parameter optimisation"
                )
    # else, use the full training data for optimising parameters
    else:
        if self.verbose > 0:
            print(  # noqa: T001
                "Using all training cases for parameter optimisation"
            )
        param_train_x = X
        param_train_y = y
        if der_X is not None:
            der_param_train_x = der_X

    self.constituent_build_times = []

    if self.verbose > 0:
        print(  # noqa: T001
            "Using "
            + str(100 * self.proportion_of_param_options)
            + " parameter options per measure"
        )
    for dm in range(0, len(self.distance_measures)):
        this_measure = self.distance_measures[dm]

        # uses the appropriate training data as required (either full or
        # smaller sample as per the StratifiedShuffleSplit)
        param_train_to_use = param_train_x
        full_train_to_use = X

        if this_measure == "ddtw" or this_measure == "wddtw":
            param_train_to_use = der_param_train_x
            full_train_to_use = der_X
            if this_measure == "ddtw":
                this_measure = "dtw"
            elif this_measure == "wddtw":
                this_measure = "wdtw"

        start_build_time = time.time()

        if self.verbose > 0:
            if (
                self.distance_measures[dm] == "ddtw"
                or self.distance_measures[dm] == "wddtw"
            ):
                print(  # noqa: T001
                    "Currently evaluating "
                    + str(self.distance_measures[dm])
                    + " (implemented as "
                    + str(this_measure)
                    + " with pre-transformed derivative data)"
                )
            else:
                print(  # noqa: T001
                    "Currently evaluating " + str(self.distance_measures[dm])
                )

        # If 100 parameter options are being considered per measure,
        # use a GridSearchCV
        if self.proportion_of_param_options == 1:
            grid = GridSearchCV(
                estimator=KNeighborsTimeSeriesClassifier(
                    distance=this_measure, n_neighbors=1
                ),
                param_grid=ElasticEnsemble._get_100_param_options(
                    self.distance_measures[dm], X
                ),
                cv=LeaveOneOut(),
                scoring="accuracy",
                n_jobs=self._threads_to_use,
                verbose=self.verbose,
            )
            grid.fit(param_train_to_use, param_train_y)
        # Else, use RandomizedSearchCV to randomly sample parameter
        # options for each measure
        else:
            grid = RandomizedSearchCV(
                estimator=KNeighborsTimeSeriesClassifier(
                    distance=this_measure, n_neighbors=1
                ),
                param_distributions=ElasticEnsemble._get_100_param_options(
                    self.distance_measures[dm], X
                ),
                n_iter=int(100 * self.proportion_of_param_options),
                cv=LeaveOneOut(),
                scoring="accuracy",
                n_jobs=self._threads_to_use,
                random_state=rand,
                verbose=self.verbose,
            )
            grid.fit(param_train_to_use, param_train_y)

        if self.majority_vote:
            acc = 1
        # once the best parameter option has been estimated on the
        # training data, perform a final pass with this parameter option
        # to get the individual predictions with cross_val_predict
        # (Note: optimisation potentially possible here if a GridSearchCV
        # was used previously. TO-DO: determine how to extract
        # predictions for the best param option from GridSearchCV)
        else:
            best_model = KNeighborsTimeSeriesClassifier(
                n_neighbors=1,
                distance=this_measure,
                distance_params=grid.best_params_["distance_params"],
                n_jobs=self._threads_to_use,
            )
            preds = cross_val_predict(
                best_model, full_train_to_use, y, cv=LeaveOneOut()
            )
            acc = accuracy_score(y, preds)

        if self.verbose > 0:
            print(  # noqa: T001
                "Training accuracy for "
                + str(self.distance_measures[dm])
                + ": "
                + str(acc)
                + " (with parameter setting: "
                + str(grid.best_params_["distance_params"])
                + ")"
            )

        # Finally, reset the classifier for this measure and parameter
        # option, ready to be called for test classification
        best_model = KNeighborsTimeSeriesClassifier(
            n_neighbors=1,
            distance=this_measure,
            distance_params=grid.best_params_["distance_params"],
        )
        best_model.fit(full_train_to_use, y)
        end_build_time = time.time()

        self.constituent_build_times.append(str(end_build_time - start_build_time))
        self.estimators_[dm] = best_model
        self.train_accs_by_classifier[dm] = acc

    return self
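# Minimal sketch of the DDTW speed-up described in the comments of _fit:
# difference each series once up front, then run plain DTW on the
# pre-transformed panel, instead of re-deriving the series inside every
# distance call. Simple first differences stand in for the transformer's
# actual slope-based derivative; the toy panel is an assumption.
import numpy as np


def _first_differences(panel):
    # panel: (n_instances, n_timepoints) -> (n_instances, n_timepoints - 1)
    return np.diff(panel, axis=1)


_panel = np.array([[1.0, 3.0, 6.0, 10.0], [2.0, 2.0, 2.0, 2.0]])
_der_panel = _first_differences(_panel)  # computed once, reused by every DTW call
assert _der_panel.shape == (2, 3)
assert np.array_equal(_der_panel[0], [2.0, 3.0, 4.0])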