def __init__(
    self,
    window_size=10,
    word_length=8,
    norm=False,
    alphabet_size=4,
    save_words=True,
    random_state=None,
):
    self.window_size = window_size
    self.word_length = word_length
    self.norm = norm
    self.alphabet_size = alphabet_size
    self.save_words = save_words
    self.random_state = random_state

    self.transformer = SFA(
        word_length=word_length,
        alphabet_size=alphabet_size,
        window_size=window_size,
        norm=norm,
        remove_repeat_words=True,
        bigrams=False,
        save_words=save_words,
    )
    self.transformed_data = []
    self.accuracy = 0

    self.class_vals = []
    self.num_classes = 0
    self.classes_ = []
    self.class_dictionary = {}
    super(IndividualBOSS, self).__init__()
def _parallel_fit(window_size):
    # note: defined as a closure inside fit, so self, X and y are captured
    # from the enclosing scope
    rng = check_random_state(window_size)
    all_words = [dict() for _ in range(len(X))]
    relevant_features_count = 0

    # for window_size in self.window_sizes:
    transformer = SFA(
        word_length=rng.choice(self.word_lengths),
        alphabet_size=self.alphabet_size,
        window_size=window_size,
        norm=rng.choice(self.norm_options),
        anova=self.anova,
        # levels=rng.choice([1, 2, 3]),
        binning_method=self.binning_strategy,
        bigrams=self.bigrams,
        remove_repeat_words=False,
        lower_bounding=False,
        save_words=False,
    )

    sfa_words = transformer.fit_transform(X, y)
    # self.SFA_transformers.append(transformer)

    bag = sfa_words[0]
    apply_chi_squared = self.p_threshold < 1

    # chi-squared test to keep only relevant features
    if apply_chi_squared:
        vectorizer = DictVectorizer(sparse=True, dtype=np.int32, sort=False)
        bag_vec = vectorizer.fit_transform(bag)

        chi2_statistics, p = chi2(bag_vec, y)
        relevant_features_idx = np.where(p <= self.p_threshold)[0]
        relevant_features = set(
            np.array(vectorizer.feature_names_)[relevant_features_idx]
        )
        relevant_features_count += len(relevant_features_idx)

    # merge bags-of-patterns of different window sizes into a single
    # bag-of-patterns, with a prefix indicating the window length used
    for j in range(len(bag)):
        for (key, value) in bag[j].items():
            # chi-squared test
            if (not apply_chi_squared) or (key in relevant_features):
                # append the prefixes to the words to
                # distinguish between window sizes
                word = WEASEL._shift_left(key, self.highest_bit, window_size)
                all_words[j][word] = value

    return all_words, transformer, relevant_features_count
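# A self-contained toy illustration of the chi-squared pruning step in
# _parallel_fit above. The bag contents, labels and threshold here are made
# up; only the DictVectorizer/chi2 mechanics mirror the code.
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2

toy_bag = [{"aab": 3, "bba": 1}, {"aab": 1, "bba": 4}]  # one dict per series
toy_y = np.array([0, 1])

toy_vectorizer = DictVectorizer(sparse=True, dtype=np.int32, sort=False)
toy_bag_vec = toy_vectorizer.fit_transform(toy_bag)

_, toy_p = chi2(toy_bag_vec, toy_y)
toy_relevant = set(
    np.array(toy_vectorizer.feature_names_)[np.where(toy_p <= 0.9)[0]]
)  # only words whose p-value clears the threshold survive the merge step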
def _select_dims(self, X, y):
    self.highest_dim_bit = (math.ceil(math.log2(self.n_dims))) + 1
    accs = []
    transformers = []

    # select dimensions based on reduced bag size accuracy
    for i in range(self.n_dims):
        self.dims.append(i)
        transformers.append(
            SFA(
                word_length=self.word_length,
                alphabet_size=self.alphabet_size,
                window_size=self.window_size,
                norm=self.norm,
                levels=self.levels,
                binning_method="information-gain" if self.igb else "equi-depth",
                bigrams=self.bigrams,
                remove_repeat_words=True,
                save_words=False,
                save_binning_dft=True,
                n_jobs=self.n_jobs,
            )
        )
        X_dim = X[:, i, :].reshape(self.n_instances, 1, self.series_length)

        transformers[i].fit(X_dim, y)
        sfa = transformers[i].transform(
            X_dim,
            y,
            transformers[i].binning_dft,
        )
        transformers[i].binning_dft = None

        correct = 0
        for n in range(self.n_instances):
            if self._train_predict(n, sfa[0]) == y[n]:
                correct = correct + 1
        accs.append(correct)

    max_acc = max(accs)

    dims = []
    fin_transformers = []
    for i in range(self.n_dims):
        if accs[i] >= max_acc * self.dim_threshold:
            dims.append(i)
            fin_transformers.append(transformers[i])

    if len(dims) > self.max_dims:
        idx = self.random_state.choice(
            len(dims),
            self.max_dims,
            replace=False,
        ).tolist()
        dims = [dims[i] for i in idx]
        fin_transformers = [fin_transformers[i] for i in idx]

    return dims, fin_transformers
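# Toy illustration (made-up numbers) of the selection rule in _select_dims:
# every dimension whose leave-one-out accuracy is within dim_threshold of the
# best dimension is kept.
toy_accs = [40, 35, 10]          # correct LOO predictions per dimension
toy_dim_threshold = 0.85
toy_max_acc = max(toy_accs)      # 40
toy_dims = [
    i for i, acc in enumerate(toy_accs) if acc >= toy_max_acc * toy_dim_threshold
]
assert toy_dims == [0, 1]        # dimension 2 falls below 40 * 0.85 = 34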
def _fit(self, X, y):
    """Fit a single base TDE classifier on n_instances cases (X,y).

    Parameters
    ----------
    X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
        The training data.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self :
        Reference to self.

    Notes
    -----
    Changes state by creating a fitted model that updates attributes
    ending in "_" and sets is_fitted flag to True.
    """
    self.n_instances_, self.n_dims_, self.series_length_ = X.shape
    self._class_vals = y

    # select dimensions using accuracy estimate if multivariate
    if self.n_dims_ > 1:
        self._dims, self._transformers = self._select_dims(X, y)

        words = [defaultdict(int) for _ in range(self.n_instances_)]

        for i, dim in enumerate(self._dims):
            X_dim = X[:, dim, :].reshape(
                self.n_instances_, 1, self.series_length_
            )
            dim_words = self._transformers[i].transform(X_dim, y)
            dim_words = dim_words[0]

            for n in range(self.n_instances_):
                for word, count in dim_words[n].items():
                    words[n][word << self._highest_dim_bit | dim] = count

        self._transformed_data = words
    else:
        self._transformers.append(
            SFA(
                word_length=self.word_length,
                alphabet_size=self.alphabet_size,
                window_size=self.window_size,
                norm=self.norm,
                levels=self.levels,
                binning_method="information-gain" if self.igb else "equi-depth",
                bigrams=self.bigrams,
                remove_repeat_words=True,
                lower_bounding=False,
                save_words=False,
                use_fallback_dft=True,
                n_jobs=self._threads_to_use,
            )
        )
        sfa = self._transformers[0].fit_transform(X, y)
        self._transformed_data = sfa[0]
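# Toy illustration of the word-packing line above
# (words[n][word << self._highest_dim_bit | dim] = count): reserving
# ceil(log2(n_dims)) + 1 low-order bits for the dimension index keeps the
# same SFA word from different dimensions distinct in the merged bag.
import math

toy_n_dims = 5
toy_highest_dim_bit = math.ceil(math.log2(toy_n_dims)) + 1  # 4 bits here

toy_word = 0b1011
toy_packed = {toy_word << toy_highest_dim_bit | dim for dim in range(toy_n_dims)}
assert len(toy_packed) == toy_n_dims  # one distinct key per dimension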
def fit(self, X, y):
    """Fit a single TDE classifier on n_instances cases (X,y).

    Parameters
    ----------
    X : pd.DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, coerce_to_numpy=True)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.class_vals = y
    self.num_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    # select dimensions using accuracy estimate if multivariate
    if self.n_dims > 1:
        self.dims, self.transformers = self._select_dims(X, y)

        words = [defaultdict(int) for _ in range(self.n_instances)]

        for i, dim in enumerate(self.dims):
            X_dim = X[:, dim, :].reshape(self.n_instances, 1, self.series_length)
            dim_words = self.transformers[i].transform(X_dim, y)
            dim_words = dim_words[0]

            for n in range(self.n_instances):
                for word, count in dim_words[n].items():
                    words[n][word << self.highest_dim_bit | dim] = count

        self.transformed_data = words
    else:
        self.transformers.append(
            SFA(
                word_length=self.word_length,
                alphabet_size=self.alphabet_size,
                window_size=self.window_size,
                norm=self.norm,
                levels=self.levels,
                binning_method="information-gain" if self.igb else "equi-depth",
                bigrams=self.bigrams,
                remove_repeat_words=True,
                save_words=False,
                n_jobs=self.n_jobs,
            )
        )
        sfa = self.transformers[0].fit_transform(X, y)
        self.transformed_data = sfa[0]

    self._is_fitted = True
    return self
def __init__(
    self,
    window_size=10,
    word_length=8,
    norm=False,
    levels=1,
    igb=False,
    alphabet_size=4,
    random_state=None,
):
    self.window_size = window_size
    self.word_length = word_length
    self.norm = norm
    self.levels = levels
    self.igb = igb
    self.alphabet_size = alphabet_size
    self.random_state = random_state

    binning_method = "information-gain" if igb else "equi-depth"
    self.transformer = SFA(
        word_length=word_length,
        alphabet_size=alphabet_size,
        window_size=window_size,
        norm=norm,
        levels=levels,
        binning_method=binning_method,
        bigrams=True,
        remove_repeat_words=True,
        save_words=False,
    )
    self.transformed_data = []
    self.accuracy = 0

    self.class_vals = []
    self.num_classes = 0
    self.classes_ = []
    self.class_dictionary = {}
    super(IndividualTDE, self).__init__()
def _fit(self, X, y):
    """Fit a single boss classifier on n_instances cases (X,y).

    Parameters
    ----------
    X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
        The training data.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self :
        Reference to self.

    Notes
    -----
    Changes state by creating a fitted model that updates attributes
    ending in "_" and sets is_fitted flag to True.
    """
    self._transformer = SFA(
        word_length=self.word_length,
        alphabet_size=self.alphabet_size,
        window_size=self.window_size,
        norm=self.norm,
        remove_repeat_words=True,
        bigrams=False,
        save_words=self.save_words,
        typed_dict=self.typed_dict,
        n_jobs=self._threads_to_use,
    )

    sfa = self._transformer.fit_transform(X)
    self._transformed_data = sfa[0]
    self._class_vals = y

    return self
def _fit(self, X, y):
    self._n_jobs = check_n_jobs(self.n_jobs)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self._class_vals = y
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self._class_dictionary[classVal] = index

    # select dimensions using accuracy estimate if multivariate
    if self.n_dims > 1:
        self._dims, self._transformers = self._select_dims(X, y)

        words = [defaultdict(int) for _ in range(self.n_instances)]

        for i, dim in enumerate(self._dims):
            X_dim = X[:, dim, :].reshape(self.n_instances, 1, self.series_length)
            dim_words = self._transformers[i].transform(X_dim, y)
            dim_words = dim_words[0]

            for n in range(self.n_instances):
                for word, count in dim_words[n].items():
                    words[n][word << self._highest_dim_bit | dim] = count

        self._transformed_data = words
    else:
        self._transformers.append(
            SFA(
                word_length=self.word_length,
                alphabet_size=self.alphabet_size,
                window_size=self.window_size,
                norm=self.norm,
                levels=self.levels,
                binning_method="information-gain" if self.igb else "equi-depth",
                bigrams=self.bigrams,
                remove_repeat_words=True,
                lower_bounding=False,
                save_words=False,
                use_fallback_dft=True,
                n_jobs=self._n_jobs,
            )
        )
        sfa = self._transformers[0].fit_transform(X, y)
        self._transformed_data = sfa[0]
class IndividualBOSS(BaseClassifier):
    """Single bag of Symbolic Fourier Approximation Symbols (IndividualBOSS).

    Implementation of the single BOSS model from Schäfer (2015) [1]_, the
    base classifier for each member of the BOSS ensemble.

    Overview: input "n" series of length "m" and IndividualBOSS performs an
    SFA transform to form a sparse dictionary of discretised words. The
    resulting dictionary is used with the BOSS distance function in a
    1-nearest-neighbor classifier.

    Fit involves finding "n" histograms.

    Predict uses 1 nearest neighbor with a bespoke BOSS distance function.

    Parameters
    ----------
    window_size : int
        Size of the window to use in the BOSS algorithm.
    word_length : int
        Length of word to use in the BOSS algorithm.
    norm : bool, default = False
        Whether to normalize words by dropping the first Fourier coefficient.
    alphabet_size : default = 4
        Number of possible letters (values) for each word.
    save_words : bool, default = False
        Whether to keep the NumPy array of words in the SFA transformation
        even after the dictionary of words is returned. If True, the array is
        saved, which can shorten the time to calculate dictionaries using a
        shorter `word_length` (since the last "n" letters can be removed).
    typed_dict : bool, default=True
        Use a numba TypedDict to store word counts. May increase memory usage,
        but will be faster for larger datasets.
    n_jobs : int, default=1
        The number of jobs to run in parallel for both `fit` and `predict`.
        ``-1`` means using all processors.
    random_state : int or None, default=None
        Seed for random, integer.

    Attributes
    ----------
    n_classes_ : int
        Number of classes. Extracted from the data.
    classes_ : list
        The class labels.

    See Also
    --------
    BOSSEnsemble, ContractableBOSS

    Notes
    -----
    For the Java version, see
    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/main/java/
    tsml/classifiers/dictionary_based/IndividualBOSS.java>`_.

    References
    ----------
    .. [1] Patrick Schäfer, "The BOSS is concerned with time series
       classification in the presence of noise", Data Mining and Knowledge
       Discovery, 29(6): 2015
       https://link.springer.com/article/10.1007/s10618-014-0377-7

    Examples
    --------
    >>> from sktime.classification.dictionary_based import IndividualBOSS
    >>> from sktime.datasets import load_unit_test
    >>> X_train, y_train = load_unit_test(split="train", return_X_y=True)
    >>> X_test, y_test = load_unit_test(split="test", return_X_y=True)
    >>> clf = IndividualBOSS()
    >>> clf.fit(X_train, y_train)
    IndividualBOSS(...)
    >>> y_pred = clf.predict(X_test)
    """

    _tags = {
        "capability:multithreading": True,
    }

    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=False,
        typed_dict=True,
        n_jobs=1,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size
        self.save_words = save_words
        self.typed_dict = typed_dict
        self.n_jobs = n_jobs
        self.random_state = random_state

        self._transformer = None
        self._transformed_data = []
        self._class_vals = []
        self._accuracy = 0
        self._subsample = []
        self._train_predictions = []

        super(IndividualBOSS, self).__init__()

    def __getstate__(self):
        """Return state as dictionary for pickling, required for typed Dict objects."""
        state = self.__dict__.copy()

        if self.typed_dict:
            nl = [None] * len(self._transformed_data)
            for i, ndict in enumerate(state["_transformed_data"]):
                pdict = dict()
                for key, val in ndict.items():
                    pdict[key] = val
                nl[i] = pdict
            state["_transformed_data"] = nl

        return state

    def __setstate__(self, state):
        """Set current state using input pickling, required for typed Dict objects."""
        self.__dict__.update(state)

        if self.typed_dict:
            nl = [None] * len(self._transformed_data)
            for i, pdict in enumerate(self._transformed_data):
                ndict = Dict.empty(key_type=types.int64, value_type=types.uint32)
                for key, val in pdict.items():
                    ndict[key] = val
                nl[i] = ndict
            self._transformed_data = nl

    def _fit(self, X, y):
        """Fit a single boss classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        self._transformer = SFA(
            word_length=self.word_length,
            alphabet_size=self.alphabet_size,
            window_size=self.window_size,
            norm=self.norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=self.save_words,
            typed_dict=self.typed_dict,
            n_jobs=self._threads_to_use,
        )

        sfa = self._transformer.fit_transform(X)
        self._transformed_data = sfa[0]
        self._class_vals = y

        return self

    def _predict(self, X):
        """Predict class values of all instances in X.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The data to make predictions for.

        Returns
        -------
        y : array-like, shape = [n_instances]
            Predicted class labels.
""" test_bags = self._transformer.transform(X) test_bags = test_bags[0] classes = Parallel(n_jobs=self._threads_to_use)( delayed(self._test_nn)( test_bag, ) for test_bag in test_bags ) return np.array(classes) def _test_nn(self, test_bag): rng = check_random_state(self.random_state) best_dist = sys.float_info.max nn = None for n, bag in enumerate(self._transformed_data): dist = boss_distance(test_bag, bag, best_dist) if dist < best_dist or (dist == best_dist and rng.random() < 0.5): best_dist = dist nn = self._class_vals[n] return nn def _train_predict(self, train_num): test_bag = self._transformed_data[train_num] best_dist = sys.float_info.max nn = None for n, bag in enumerate(self._transformed_data): if n == train_num: continue dist = boss_distance(test_bag, bag, best_dist) if dist < best_dist: best_dist = dist nn = self._class_vals[n] return nn def _shorten_bags(self, word_len): new_boss = IndividualBOSS( self.window_size, word_len, self.norm, self.alphabet_size, save_words=self.save_words, typed_dict=self.typed_dict, random_state=self.random_state, n_jobs=self.n_jobs, ) new_boss._transformer = self._transformer sfa = self._transformer._shorten_bags(word_len) new_boss._transformed_data = sfa[0] new_boss._class_vals = self._class_vals new_boss.n_classes_ = self.n_classes_ new_boss.classes_ = self.classes_ new_boss._class_dictionary = self._class_dictionary new_boss._threads_to_use = self._threads_to_use new_boss._is_fitted = True return new_boss def _clean(self): self._transformer.words = None self._transformer.save_words = False def _set_word_len(self, word_len): self.word_length = word_len self._transformer.word_length = word_len
def _fit(self, X, y):
    """Build a WEASEL+MUSE classifier from the training set (X, y).

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    y = np.asarray(y)
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # add first order differences in each dimension to TS
    if self.use_first_order_differences:
        X = self._add_first_order_differences(X)

    # Window length parameter space dependent on series length
    self.col_names = X.columns

    rng = check_random_state(self.random_state)

    self.n_dims = len(self.col_names)
    self.highest_dim_bit = (math.ceil(math.log2(self.n_dims))) + 1
    self.highest_bits = np.zeros(self.n_dims)

    self.SFA_transformers = [[] for _ in range(self.n_dims)]

    # the words of all dimensions and all time series
    all_words = [dict() for _ in range(X.shape[0])]

    # On each dimension, perform SFA
    for ind, column in enumerate(self.col_names):
        X_dim = X[[column]]
        X_dim = from_nested_to_3d_numpy(X_dim)
        series_length = X_dim.shape[-1]  # TODO compute minimum over all ts ?

        # increment window size in steps of 'win_inc'
        win_inc = self._compute_window_inc(series_length)

        self.max_window = int(min(series_length, self.max_window))
        if self.min_window > self.max_window:
            raise ValueError(
                f"Error in MUSE, min_window = {self.min_window} is bigger "
                f"than max_window = {self.max_window}; series length is "
                f"{series_length}. Try setting min_window to be smaller "
                f"than the series length in the constructor, but the "
                f"classifier may not work at all with very short series."
            )

        self.window_sizes.append(
            list(range(self.min_window, self.max_window, win_inc))
        )

        self.highest_bits[ind] = math.ceil(math.log2(self.max_window)) + 1

        for window_size in self.window_sizes[ind]:
            transformer = SFA(
                word_length=rng.choice(self.word_lengths),
                alphabet_size=self.alphabet_size,
                window_size=window_size,
                norm=rng.choice(self.norm_options),
                anova=self.anova,
                binning_method=rng.choice(self.binning_strategies),
                bigrams=self.bigrams,
                remove_repeat_words=False,
                lower_bounding=False,
                save_words=False,
            )
            sfa_words = transformer.fit_transform(X_dim, y)
            self.SFA_transformers[ind].append(transformer)
            bag = sfa_words[0]

            # chi-squared test to keep only relevant features
            relevant_features = {}
            apply_chi_squared = self.p_threshold < 1
            if apply_chi_squared:
                vectorizer = DictVectorizer(sparse=True, dtype=np.int32, sort=False)
                bag_vec = vectorizer.fit_transform(bag)

                chi2_statistics, p = chi2(bag_vec, y)
                relevant_features_idx = np.where(p <= self.p_threshold)[0]
                relevant_features = set(
                    np.array(vectorizer.feature_names_)[relevant_features_idx]
                )

            # merge bags-of-patterns of different window sizes into a single
            # bag-of-patterns, with a prefix indicating the window length used
            highest = np.int32(self.highest_bits[ind])
            for j in range(len(bag)):
                for (key, value) in bag[j].items():
                    # chi-squared test
                    if (not apply_chi_squared) or (key in relevant_features):
                        # append the prefixes to the words to
                        # distinguish between window sizes
                        word = MUSE._shift_left(
                            key, highest, ind, self.highest_dim_bit, window_size
                        )
                        all_words[j][word] = value

    self.clf = make_pipeline(
        DictVectorizer(sparse=True, sort=False),
        # StandardScaler(with_mean=True, copy=False),
        LogisticRegression(
            max_iter=5000,
            solver="liblinear",
            dual=True,
            # class_weight="balanced",
            penalty="l2",
            random_state=self.random_state,
        ),
    )

    self.clf.fit(all_words, y)
    return self
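# A hedged sketch of the word-prefixing idea behind MUSE._shift_left above
# (the exact bit layout in sktime may differ): the SFA word is shifted left
# far enough that the dimension index and the window size can be appended
# without overlapping the word bits, so identical words found in different
# dimensions or at different window sizes remain distinct keys in the
# merged bag.
def shift_left_sketch(key, highest_bit, dim, highest_dim_bit, window_size):
    return ((key << highest_dim_bit | dim) << highest_bit) | window_size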
class IndividualBOSS(BaseClassifier):
    """Single Bag of SFA Symbols (BOSS) classifier.

    Bag of SFA Symbols Ensemble: implementation of the single BOSS model
    from Schäfer (2015).
    """

    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=True,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size
        self.save_words = save_words
        self.random_state = random_state

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=save_words,
        )
        self.transformed_data = []
        self.accuracy = 0

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualBOSS, self).__init__()

    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X)
        self.transformed_data = sfa[0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        rng = check_random_state(self.random_state)

        classes = []
        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]

        for test_bag in test_bags:
            best_dist = sys.float_info.max
            nn = None

            for n, bag in enumerate(self.transformed_data):
                dist = boss_distance(test_bag, bag, best_dist)

                if dist < best_dist or (dist == best_dist and rng.random() < 0.5):
                    best_dist = dist
                    nn = self.class_vals[n]

            classes.append(nn)

        return np.array(classes)

    def predict_proba(self, X):
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _train_predict(self, train_num):
        test_bag = self.transformed_data[train_num]
        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist:
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _shorten_bags(self, word_len):
        new_boss = IndividualBOSS(
            self.window_size,
            word_len,
            self.norm,
            self.alphabet_size,
            save_words=self.save_words,
            random_state=self.random_state,
        )
        new_boss.transformer = self.transformer
        sfa = self.transformer._shorten_bags(word_len)
        new_boss.transformed_data = sfa[0]

        new_boss.class_vals = self.class_vals
        new_boss.num_classes = self.num_classes
        new_boss.classes_ = self.classes_
        new_boss.class_dictionary = self.class_dictionary
        new_boss._is_fitted = True

        return new_boss

    def _clean(self):
        self.transformer.words = None
        self.transformer.save_words = False

    def _set_word_len(self, word_len):
        self.word_length = word_len
        self.transformer.word_length = word_len
class IndividualBOSS(BaseClassifier):
    """Single Bag of SFA Symbols (BOSS) classifier.

    Implementation of a single BOSS classifier from Schäfer, the base
    classifier for the BOSS ensemble.
    """

    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=True,
        n_jobs=1,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size
        self.save_words = save_words
        self.n_jobs = n_jobs
        self.random_state = random_state

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=save_words,
            n_jobs=n_jobs,
        )
        self.transformed_data = []
        self.accuracy = 0
        self.subsample = []

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualBOSS, self).__init__()

    def fit(self, X, y):
        """Fit a single boss classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : pd.DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X)
        self.transformed_data = sfa[0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        """Predict class values of all instances in X.

        Parameters
        ----------
        X : pd.DataFrame of shape [n, 1]

        Returns
        -------
        array of shape [n, 1]
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]

        classes = Parallel(n_jobs=self.n_jobs)(
            delayed(self._test_nn)(test_bag) for test_bag in test_bags
        )

        return np.array(classes)

    def predict_proba(self, X):
        """Predict class probabilities for all instances in X.
        Parameters
        ----------
        X : pd.DataFrame of shape [n, 1]

        Returns
        -------
        array of shape [n, self.n_classes]
        """
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _test_nn(self, test_bag):
        rng = check_random_state(self.random_state)

        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist or (dist == best_dist and rng.random() < 0.5):
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _train_predict(self, train_num):
        test_bag = self.transformed_data[train_num]
        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist:
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _shorten_bags(self, word_len):
        new_boss = IndividualBOSS(
            self.window_size,
            word_len,
            self.norm,
            self.alphabet_size,
            save_words=self.save_words,
            random_state=self.random_state,
        )
        new_boss.transformer = self.transformer
        sfa = self.transformer._shorten_bags(word_len)
        new_boss.transformed_data = sfa[0]

        new_boss.class_vals = self.class_vals
        new_boss.num_classes = self.num_classes
        new_boss.classes_ = self.classes_
        new_boss.class_dictionary = self.class_dictionary
        new_boss._is_fitted = True

        return new_boss

    def _clean(self):
        self.transformer.words = None
        self.transformer.save_words = False

    def _set_word_len(self, word_len):
        self.word_length = word_len
        self.transformer.word_length = word_len
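# Usage sketch (hypothetical variables): the BOSS ensemble searches over word
# lengths by fitting one IndividualBOSS at the longest word length and then
# deriving shorter-word members cheaply via _shorten_bags, which reuses the
# already-fitted SFA transformer instead of re-transforming the series, e.g.:
#
#     boss16 = IndividualBOSS(window_size=20, word_length=16).fit(X, y)
#     boss14 = boss16._shorten_bags(14)   # shares boss16's transformer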
class IndividualTDE(BaseClassifier):
    """Single TDE classifier, based on the Bag of SFA Symbols (BOSS) model."""

    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        levels=1,
        igb=False,
        alphabet_size=4,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.levels = levels
        self.igb = igb
        self.alphabet_size = alphabet_size
        self.random_state = random_state

        binning_method = "information-gain" if igb else "equi-depth"
        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            levels=levels,
            binning_method=binning_method,
            bigrams=True,
            remove_repeat_words=True,
            save_words=False,
        )
        self.transformed_data = []
        self.accuracy = 0

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualTDE, self).__init__()

    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X, y)
        self.transformed_data = sfa[0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        rng = check_random_state(self.random_state)

        classes = []
        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]

        for test_bag in test_bags:
            best_sim = -1
            nn = None

            for n, bag in enumerate(self.transformed_data):
                sim = histogram_intersection(test_bag, bag)

                if sim > best_sim or (sim == best_sim and rng.random() < 0.5):
                    best_sim = sim
                    nn = self.class_vals[n]

            classes.append(nn)

        return np.array(classes)

    def predict_proba(self, X):
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _train_predict(self, train_num):
        test_bag = self.transformed_data[train_num]
        best_sim = -1
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            sim = histogram_intersection(test_bag, bag)

            if sim > best_sim:
                best_sim = sim
                nn = self.class_vals[n]

        return nn
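# A minimal sketch of the histogram intersection similarity used by
# IndividualTDE above, assuming the bags are plain dicts mapping
# word -> count (the histogram_intersection used by this module is a
# compiled equivalent): the similarity is the total overlap of the two
# count histograms, so larger is more similar.
def histogram_intersection_sketch(first, second):
    return sum(min(count, second.get(word, 0)) for word, count in first.items())


# e.g. histogram_intersection_sketch({1: 3, 2: 1}, {1: 2, 3: 5}) == 2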
class IndividualBOSS(BaseClassifier):
    """Single bag of Symbolic Fourier Approximation Symbols (IndividualBOSS).

    Implementation of the single BOSS model from Schäfer (2015) [1]_, the
    base classifier for each member of the BOSS ensemble.

    Overview: input "n" series of length "m" and IndividualBOSS performs an
    SFA transform to form a sparse dictionary of discretised words. The
    resulting dictionary is used with the BOSS distance function in a
    1-nearest-neighbor classifier.

    Fit involves finding "n" histograms.

    Predict uses 1 nearest neighbor with a bespoke BOSS distance function.

    Parameters
    ----------
    window_size : int
        Size of the window to use in the BOSS algorithm.
    word_length : int
        Length of word to use in the BOSS algorithm.
    norm : bool, default = False
        Whether to normalize words by dropping the first Fourier coefficient.
    alphabet_size : default = 4
        Number of possible letters (values) for each word.
    save_words : bool, default = True
        Whether to keep the NumPy array of words in the SFA transformation
        even after the dictionary of words is returned. If True, the array is
        saved, which can shorten the time to calculate dictionaries using a
        shorter `word_length` (since the last "n" letters can be removed).
    n_jobs : int, default=1
        The number of jobs to run in parallel for both `fit` and `predict`.
        ``-1`` means using all processors.
    random_state : int or None, default=None
        Seed for random, integer.

    Attributes
    ----------
    n_classes : int
        Number of classes. Extracted from the data.
    n_instances : int
        Number of instances. Extracted from the data.
    n_estimators : int
        The final number of classifiers used. Will be <= `max_ensemble_size`
        if `max_ensemble_size` has been specified.
    series_length : int
        Length of all series (assumed equal).
    class_dictionary : dict
        Dictionary of classes. Extracted from the data.

    See Also
    --------
    BOSSEnsemble, ContractableBOSS

    Notes
    -----
    For the Java version, see
    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/main/java/
    tsml/classifiers/dictionary_based/BOSS.java>`_.

    References
    ----------
    .. [1] Patrick Schäfer, "The BOSS is concerned with time series
       classification in the presence of noise", Data Mining and Knowledge
       Discovery, 29(6): 2015
       https://link.springer.com/article/10.1007/s10618-014-0377-7

    Examples
    --------
    >>> from sktime.classification.dictionary_based import IndividualBOSS
    >>> from sktime.datasets import load_italy_power_demand
    >>> X_train, y_train = load_italy_power_demand(split="train", return_X_y=True)
    >>> X_test, y_test = load_italy_power_demand(split="test", return_X_y=True)
    >>> clf = IndividualBOSS()
    >>> clf.fit(X_train, y_train)
    IndividualBOSS(...)
    >>> y_pred = clf.predict(X_test)
    """

    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=True,
        n_jobs=1,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size
        self.save_words = save_words
        self.n_jobs = n_jobs
        self.random_state = random_state

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=save_words,
            n_jobs=n_jobs,
        )
        self.transformed_data = []
        self.accuracy = 0
        self.subsample = []

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualBOSS, self).__init__()

    def fit(self, X, y):
        """Fit a single boss classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : pd.DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X)
        self.transformed_data = sfa[0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        """Predict class values of all instances in X.

        Parameters
        ----------
        X : pd.DataFrame of shape [n, 1]

        Returns
        -------
        array of shape [n, 1]
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]

        classes = Parallel(n_jobs=self.n_jobs)(
            delayed(self._test_nn)(test_bag) for test_bag in test_bags
        )

        return np.array(classes)

    def predict_proba(self, X):
        """Predict class probabilities for all instances in X.
        Parameters
        ----------
        X : pd.DataFrame of shape [n, 1]

        Returns
        -------
        dists : array of shape [n, self.n_classes]
        """
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _test_nn(self, test_bag):
        rng = check_random_state(self.random_state)

        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist or (dist == best_dist and rng.random() < 0.5):
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _train_predict(self, train_num):
        test_bag = self.transformed_data[train_num]
        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist:
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _shorten_bags(self, word_len):
        new_boss = IndividualBOSS(
            self.window_size,
            word_len,
            self.norm,
            self.alphabet_size,
            save_words=self.save_words,
            random_state=self.random_state,
        )
        new_boss.transformer = self.transformer
        sfa = self.transformer._shorten_bags(word_len)
        new_boss.transformed_data = sfa[0]

        new_boss.class_vals = self.class_vals
        new_boss.num_classes = self.num_classes
        new_boss.classes_ = self.classes_
        new_boss.class_dictionary = self.class_dictionary
        new_boss._is_fitted = True

        return new_boss

    def _clean(self):
        self.transformer.words = None
        self.transformer.save_words = False

    def _set_word_len(self, word_len):
        self.word_length = word_len
        self.transformer.word_length = word_len
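# Illustration of predict_proba's output above: a single BOSS classifier is a
# 1-nearest-neighbor model with no probability estimate of its own, so the
# returned distribution is simply a one-hot encoding of the predictions
# (toy values below).
import numpy as np

toy_preds = np.array(["b", "a"])
toy_class_dictionary = {"a": 0, "b": 1}
toy_dists = np.zeros((2, 2))
for i, pred in enumerate(toy_preds):
    toy_dists[i, toy_class_dictionary[pred]] += 1
# toy_dists == [[0., 1.], [1., 0.]]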