Example #1
    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=True,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size

        self.save_words = save_words
        self.random_state = random_state

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=save_words,
        )
        self.transformed_data = []
        self.accuracy = 0

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualBOSS, self).__init__()
Example #2
        def _parallel_fit(window_size):
            rng = check_random_state(window_size)
            all_words = [dict() for x in range(len(X))]
            relevant_features_count = 0

            # for window_size in self.window_sizes:
            transformer = SFA(
                word_length=rng.choice(self.word_lengths),
                alphabet_size=self.alphabet_size,
                window_size=window_size,
                norm=rng.choice(self.norm_options),
                anova=self.anova,
                # levels=rng.choice([1, 2, 3]),
                binning_method=self.binning_strategy,
                bigrams=self.bigrams,
                remove_repeat_words=False,
                lower_bounding=False,
                save_words=False,
            )

            sfa_words = transformer.fit_transform(X, y)

            # self.SFA_transformers.append(transformer)
            bag = sfa_words[0]
            apply_chi_squared = self.p_threshold < 1
            relevant_features = set()

            # chi-squared test to keep only relevant features
            if apply_chi_squared:
                vectorizer = DictVectorizer(sparse=True,
                                            dtype=np.int32,
                                            sort=False)
                bag_vec = vectorizer.fit_transform(bag)

                chi2_statistics, p = chi2(bag_vec, y)
                relevant_features_idx = np.where(p <= self.p_threshold)[0]
                relevant_features = set(
                    np.array(vectorizer.feature_names_)[relevant_features_idx])
                relevant_features_count += len(relevant_features_idx)

            # merging bag-of-patterns of different window_sizes
            # to single bag-of-patterns with prefix indicating
            # the used window-length
            for j in range(len(bag)):
                for (key, value) in bag[j].items():
                    # keep the word unless the chi-squared filter rejected it
                    if (not apply_chi_squared) or (key in relevant_features):
                        # append the prefixes to the words to
                        # distinguish between window-sizes
                        word = WEASEL._shift_left(key, self.highest_bit,
                                                  window_size)
                        all_words[j][word] = value

            return all_words, transformer, relevant_features_count
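The chi-squared filtering above can be reproduced in isolation with scikit-learn alone. A minimal sketch, assuming plain dict bags; the toy bags, labels, and `p_threshold` value are invented for illustration:

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2

# toy bags of words and class labels (made-up values)
bags = [{"aab": 3, "bba": 1}, {"aab": 1, "bba": 4}, {"aab": 3, "bba": 1}]
y = np.array([0, 1, 0])
p_threshold = 0.9

# vectorize the dicts into a sparse count matrix, then score each word
vectorizer = DictVectorizer(sparse=True, dtype=np.int32, sort=False)
bag_vec = vectorizer.fit_transform(bags)
chi2_statistics, p = chi2(bag_vec, y)

# keep only the words whose p-value clears the threshold
relevant = set(np.array(vectorizer.feature_names_)[np.where(p <= p_threshold)[0]])
print(relevant)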
Example #3
    def _select_dims(self, X, y):
        self.highest_dim_bit = (math.ceil(math.log2(self.n_dims))) + 1
        accs = []
        transformers = []

        # select dimensions based on reduced bag size accuracy
        for i in range(self.n_dims):
            self.dims.append(i)
            transformers.append(
                SFA(
                    word_length=self.word_length,
                    alphabet_size=self.alphabet_size,
                    window_size=self.window_size,
                    norm=self.norm,
                    levels=self.levels,
                    binning_method="information-gain"
                    if self.igb else "equi-depth",
                    bigrams=self.bigrams,
                    remove_repeat_words=True,
                    save_words=False,
                    save_binning_dft=True,
                    n_jobs=self.n_jobs,
                ))

            X_dim = X[:, i, :].reshape(self.n_instances, 1, self.series_length)

            transformers[i].fit(X_dim, y)
            sfa = transformers[i].transform(
                X_dim,
                y,
                transformers[i].binning_dft,
            )
            transformers[i].binning_dft = None

            correct = 0
            for n in range(self.n_instances):
                if self._train_predict(n, sfa[0]) == y[n]:
                    correct += 1

            accs.append(correct)

        max_acc = max(accs)

        dims = []
        fin_transformers = []
        for i in range(self.n_dims):
            if accs[i] >= max_acc * self.dim_threshold:
                dims.append(i)
                fin_transformers.append(transformers[i])

        if len(dims) > self.max_dims:
            idx = self.random_state.choice(
                len(dims),
                self.max_dims,
                replace=False,
            ).tolist()
            dims = [dims[i] for i in idx]
            fin_transformers = [fin_transformers[i] for i in idx]

        return dims, fin_transformers
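The selection rule at the end of `_select_dims` keeps every dimension whose accuracy estimate reaches `dim_threshold` times the best one, then randomly subsamples down to `max_dims`. A standalone sketch of just that rule, with invented accuracy counts:

import numpy as np

accs = [50, 48, 30, 49]  # per-dimension accuracy estimates (made up)
dim_threshold, max_dims = 0.85, 2
rng = np.random.RandomState(0)

# keep dimensions close enough to the best...
max_acc = max(accs)
dims = [i for i, acc in enumerate(accs) if acc >= max_acc * dim_threshold]

# ...then subsample without replacement if too many survive
if len(dims) > max_dims:
    idx = rng.choice(len(dims), max_dims, replace=False).tolist()
    dims = [dims[i] for i in idx]
print(dims)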
Example #4
    def _fit(self, X, y):
        """Fit a single base TDE classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        self.n_instances_, self.n_dims_, self.series_length_ = X.shape
        self._class_vals = y

        # select dimensions using accuracy estimate if multivariate
        if self.n_dims_ > 1:
            self._dims, self._transformers = self._select_dims(X, y)

            words = [defaultdict(int) for _ in range(self.n_instances_)]

            for i, dim in enumerate(self._dims):
                X_dim = X[:, dim, :].reshape(self.n_instances_, 1,
                                             self.series_length_)
                dim_words = self._transformers[i].transform(X_dim, y)
                dim_words = dim_words[0]

                for n in range(self.n_instances_):
                    for word, count in dim_words[n].items():
                        words[n][word << self._highest_dim_bit | dim] = count

            self._transformed_data = words
        else:
            self._transformers.append(
                SFA(
                    word_length=self.word_length,
                    alphabet_size=self.alphabet_size,
                    window_size=self.window_size,
                    norm=self.norm,
                    levels=self.levels,
                    binning_method="information-gain"
                    if self.igb else "equi-depth",
                    bigrams=self.bigrams,
                    remove_repeat_words=True,
                    lower_bounding=False,
                    save_words=False,
                    use_fallback_dft=True,
                    n_jobs=self._threads_to_use,
                ))
            sfa = self._transformers[0].fit_transform(X, y)
            self._transformed_data = sfa[0]
Example #5
    def fit(self, X, y):
        """Fit a single TD classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : pd.DataFrame of shape [n_instances, n_dims]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, coerce_to_numpy=True)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        # select dimensions using accuracy estimate if multivariate
        if self.n_dims > 1:
            self.dims, self.transformers = self._select_dims(X, y)

            words = [defaultdict(int) for _ in range(self.n_instances)]

            for i, dim in enumerate(self.dims):
                X_dim = X[:, dim, :].reshape(self.n_instances, 1,
                                             self.series_length)
                dim_words = self.transformers[i].transform(X_dim, y)
                dim_words = dim_words[0]

                for n in range(self.n_instances):
                    for word, count in dim_words[n].items():
                        words[n][word << self.highest_dim_bit | dim] = count

            self.transformed_data = words
        else:
            self.transformers.append(
                SFA(
                    word_length=self.word_length,
                    alphabet_size=self.alphabet_size,
                    window_size=self.window_size,
                    norm=self.norm,
                    levels=self.levels,
                    binning_method="information-gain"
                    if self.igb else "equi-depth",
                    bigrams=self.bigrams,
                    remove_repeat_words=True,
                    save_words=False,
                    n_jobs=self.n_jobs,
                ))
            sfa = self.transformers[0].fit_transform(X, y)
            self.transformed_data = sfa[0]

        self._is_fitted = True
        return self
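The update `words[n][word << self.highest_dim_bit | dim] = count` above packs the source dimension into the low bits of each word, so identical words coming from different dimensions remain distinct keys. A sketch of the encoding and its inverse (the word and dimension values are illustrative):

import math

n_dims = 5
highest_dim_bit = math.ceil(math.log2(n_dims)) + 1  # bits reserved for the dim

word, dim = 0b101101, 3
packed = word << highest_dim_bit | dim  # dim occupies the low bits

# the inverse split recovers both parts of the key
recovered_dim = packed & ((1 << highest_dim_bit) - 1)
recovered_word = packed >> highest_dim_bit
assert (recovered_word, recovered_dim) == (word, dim)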
Example #6
    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        levels=1,
        igb=False,
        alphabet_size=4,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.levels = levels
        self.igb = igb
        self.alphabet_size = alphabet_size

        self.random_state = random_state

        binning_method = "information-gain" if igb else "equi-depth"

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            levels=levels,
            binning_method=binning_method,
            bigrams=True,
            remove_repeat_words=True,
            save_words=False,
        )
        self.transformed_data = []
        self.accuracy = 0

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualTDE, self).__init__()
Example #7
    def _fit(self, X, y):
        """Fit a single boss classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        self._transformer = SFA(
            word_length=self.word_length,
            alphabet_size=self.alphabet_size,
            window_size=self.window_size,
            norm=self.norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=self.save_words,
            typed_dict=self.typed_dict,
            n_jobs=self._threads_to_use,
        )

        sfa = self._transformer.fit_transform(X)
        self._transformed_data = sfa[0]
        self._class_vals = y

        return self
Example #8
    def _fit(self, X, y):
        self._n_jobs = check_n_jobs(self.n_jobs)

        self.n_instances, self.n_dims, self.series_length = X.shape
        self._class_vals = y
        self.n_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self._class_dictionary[classVal] = index

        # select dimensions using accuracy estimate if multivariate
        if self.n_dims > 1:
            self._dims, self._transformers = self._select_dims(X, y)

            words = [defaultdict(int) for _ in range(self.n_instances)]

            for i, dim in enumerate(self._dims):
                X_dim = X[:, dim, :].reshape(self.n_instances, 1,
                                             self.series_length)
                dim_words = self._transformers[i].transform(X_dim, y)
                dim_words = dim_words[0]

                for n in range(self.n_instances):
                    for word, count in dim_words[n].items():
                        words[n][word << self._highest_dim_bit | dim] = count

            self._transformed_data = words
        else:
            self._transformers.append(
                SFA(
                    word_length=self.word_length,
                    alphabet_size=self.alphabet_size,
                    window_size=self.window_size,
                    norm=self.norm,
                    levels=self.levels,
                    binning_method="information-gain"
                    if self.igb else "equi-depth",
                    bigrams=self.bigrams,
                    remove_repeat_words=True,
                    lower_bounding=False,
                    save_words=False,
                    use_fallback_dft=True,
                    n_jobs=self._n_jobs,
                ))
            sfa = self._transformers[0].fit_transform(X, y)
            self._transformed_data = sfa[0]
Example #9
class IndividualBOSS(BaseClassifier):
    """Single bag of Symbolic Fourier Approximation Symbols (IndividualBOSS).

    Bag of SFA Symbols Ensemble: implementation of a single BOSS from
    Schäfer, the base classifier for the BOSS ensemble.

    Implementation of single BOSS model from Schäfer (2015). [1]_

    This is the underlying classifier for each classifier in the BOSS ensemble.

    Overview: input "n" series of length "m" and IndividualBoss performs a SFA
    transform to form a sparse dictionary of discretised words. The resulting
    dictionary is used with the BOSS distance function in a 1-nearest neighbor.

    Fit involves finding "n" histograms.

    Predict uses 1 nearest neighbor with a bespoke BOSS distance function.

    Parameters
    ----------
    window_size : int
        Size of the window to use in BOSS algorithm.
    word_length : int
        Length of word to use in BOSS algorithm.
    norm : bool, default = False
        Whether to normalize words by dropping the first Fourier coefficient.
    alphabet_size : int, default = 4
        Number of possible letters (values) for each word.
    save_words : bool, default = True
        Whether to keep NumPy array of words in SFA transformation even after
        the dictionary of words is returned. If True, the array is saved, which
        can shorten the time to calculate dictionaries using a shorter
        `word_length` (since the last "n" letters can be removed).
    typed_dict : bool, default=True
        Use a numba TypedDict to store word counts. May increase memory usage, but will
        be faster for larger datasets.
    n_jobs : int, default=1
        The number of jobs to run in parallel for both `fit` and `predict`.
        ``-1`` means using all processors.
    random_state : int or None, default=None
        Seed for the random number generator.

    Attributes
    ----------
    n_classes_ : int
        Number of classes. Extracted from the data.
    classes_ : list
        The class labels.

    See Also
    --------
    BOSSEnsemble, ContractableBOSS

    Notes
    -----
    For the Java version, see
    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/main/java/
    tsml/classifiers/dictionary_based/IndividualBOSS.java>`_.

    References
    ----------
    .. [1] Patrick Schäfer, "The BOSS is concerned with time series classification
       in the presence of noise", Data Mining and Knowledge Discovery, 29(6): 2015
       https://link.springer.com/article/10.1007/s10618-014-0377-7

    Examples
    --------
    >>> from sktime.classification.dictionary_based import IndividualBOSS
    >>> from sktime.datasets import load_unit_test
    >>> X_train, y_train = load_unit_test(split="train", return_X_y=True)
    >>> X_test, y_test = load_unit_test(split="test", return_X_y=True)
    >>> clf = IndividualBOSS()
    >>> clf.fit(X_train, y_train)
    IndividualBOSS(...)
    >>> y_pred = clf.predict(X_test)
    """

    _tags = {
        "capability:multithreading": True,
    }

    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=False,
        typed_dict=True,
        n_jobs=1,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size

        self.save_words = save_words
        self.typed_dict = typed_dict
        self.n_jobs = n_jobs
        self.random_state = random_state

        self._transformer = None
        self._transformed_data = []
        self._class_vals = []
        self._accuracy = 0
        self._subsample = []
        self._train_predictions = []

        super(IndividualBOSS, self).__init__()

    def __getstate__(self):
        """Return state as dictionary for pickling, required for typed Dict objects."""
        state = self.__dict__.copy()
        if self.typed_dict:
            nl = [None] * len(self._transformed_data)
            for i, ndict in enumerate(state["_transformed_data"]):
                pdict = dict()
                for key, val in ndict.items():
                    pdict[key] = val
                nl[i] = pdict
            state["_transformed_data"] = nl
        return state

    def __setstate__(self, state):
        """Set current state using input pickling, required for typed Dict objects."""
        self.__dict__.update(state)
        if self.typed_dict:
            nl = [None] * len(self._transformed_data)
            for i, pdict in enumerate(self._transformed_data):
                ndict = Dict.empty(key_type=types.int64, value_type=types.uint32)
                for key, val in pdict.items():
                    ndict[key] = val
                nl[i] = ndict
            self._transformed_data = nl

    def _fit(self, X, y):
        """Fit a single boss classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        self._transformer = SFA(
            word_length=self.word_length,
            alphabet_size=self.alphabet_size,
            window_size=self.window_size,
            norm=self.norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=self.save_words,
            typed_dict=self.typed_dict,
            n_jobs=self._threads_to_use,
        )

        sfa = self._transformer.fit_transform(X)
        self._transformed_data = sfa[0]
        self._class_vals = y

        return self

    def _predict(self, X):
        """Predict class values of all instances in X.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The data to make predictions for.

        Returns
        -------
        y : array-like, shape = [n_instances]
            Predicted class labels.
        """
        test_bags = self._transformer.transform(X)
        test_bags = test_bags[0]

        classes = Parallel(n_jobs=self._threads_to_use)(
            delayed(self._test_nn)(
                test_bag,
            )
            for test_bag in test_bags
        )

        return np.array(classes)

    def _test_nn(self, test_bag):
        rng = check_random_state(self.random_state)

        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self._transformed_data):
            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist or (dist == best_dist and rng.random() < 0.5):
                best_dist = dist
                nn = self._class_vals[n]

        return nn

    def _train_predict(self, train_num):
        test_bag = self._transformed_data[train_num]
        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self._transformed_data):
            if n == train_num:
                continue

            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist:
                best_dist = dist
                nn = self._class_vals[n]

        return nn

    def _shorten_bags(self, word_len):
        new_boss = IndividualBOSS(
            self.window_size,
            word_len,
            self.norm,
            self.alphabet_size,
            save_words=self.save_words,
            typed_dict=self.typed_dict,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
        )
        new_boss._transformer = self._transformer
        sfa = self._transformer._shorten_bags(word_len)
        new_boss._transformed_data = sfa[0]

        new_boss._class_vals = self._class_vals
        new_boss.n_classes_ = self.n_classes_
        new_boss.classes_ = self.classes_
        new_boss._class_dictionary = self._class_dictionary

        new_boss._threads_to_use = self._threads_to_use
        new_boss._is_fitted = True
        return new_boss

    def _clean(self):
        self._transformer.words = None
        self._transformer.save_words = False

    def _set_word_len(self, word_len):
        self.word_length = word_len
        self._transformer.word_length = word_len
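`boss_distance`, used by `_test_nn` and `_train_predict` above, is the non-symmetric BOSS distance: squared differences of word counts summed only over the words present in the first (query) bag, with the third argument enabling early abandoning. A plain-Python sketch under that reading, not sktime's actual implementation:

def boss_distance_sketch(first, second, best_dist=float("inf")):
    """Non-symmetric BOSS distance between two word-count dicts."""
    dist = 0
    for word, count in first.items():
        diff = count - second.get(word, 0)
        dist += diff * diff
        if dist > best_dist:  # early abandoning: this candidate cannot win
            return dist
    return dist

# boss_distance_sketch({"ab": 2, "ba": 1}, {"ab": 1}) == (2-1)**2 + 1**2 == 2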
Example #10
    def _fit(self, X, y):
        """Build a WEASEL+MUSE classifiers from the training set (X, y).

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, n_dims]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        y = np.asarray(y)
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

        # add first order differences in each dimension to TS
        if self.use_first_order_differences:
            X = self._add_first_order_differences(X)

        # Window length parameter space dependent on series length
        self.col_names = X.columns

        rng = check_random_state(self.random_state)

        self.n_dims = len(self.col_names)
        self.highest_dim_bit = (math.ceil(math.log2(self.n_dims))) + 1
        self.highest_bits = np.zeros(self.n_dims)

        self.SFA_transformers = [[] for _ in range(self.n_dims)]

        # the words of all dimensions and all time series
        all_words = [dict() for _ in range(X.shape[0])]

        # On each dimension, perform SFA
        for ind, column in enumerate(self.col_names):
            X_dim = X[[column]]
            X_dim = from_nested_to_3d_numpy(X_dim)
            # TODO compute minimum over all ts ?
            series_length = X_dim.shape[-1]

            # increment window size in steps of 'win_inc'
            win_inc = self._compute_window_inc(series_length)

            self.max_window = int(min(series_length, self.max_window))
            if self.min_window > self.max_window:
                raise ValueError(
                    f"Error in MUSE: min_window = {self.min_window} is bigger"
                    f" than max_window = {self.max_window}"
                    f" (series length is {series_length})."
                    f" Try setting min_window to be smaller than the series"
                    f" length in the constructor, but the classifier may not"
                    f" work at all with very short series."
                )
            self.window_sizes.append(
                list(range(self.min_window, self.max_window, win_inc)))

            self.highest_bits[ind] = math.ceil(math.log2(self.max_window)) + 1

            for window_size in self.window_sizes[ind]:

                transformer = SFA(
                    word_length=rng.choice(self.word_lengths),
                    alphabet_size=self.alphabet_size,
                    window_size=window_size,
                    norm=rng.choice(self.norm_options),
                    anova=self.anova,
                    binning_method=rng.choice(self.binning_strategies),
                    bigrams=self.bigrams,
                    remove_repeat_words=False,
                    lower_bounding=False,
                    save_words=False,
                )

                sfa_words = transformer.fit_transform(X_dim, y)

                self.SFA_transformers[ind].append(transformer)
                bag = sfa_words[0]

                # chi-squared test to keep only relevant features
                relevant_features = {}
                apply_chi_squared = self.p_threshold < 1
                if apply_chi_squared:
                    vectorizer = DictVectorizer(sparse=True,
                                                dtype=np.int32,
                                                sort=False)
                    bag_vec = vectorizer.fit_transform(bag)

                    chi2_statistics, p = chi2(bag_vec, y)
                    relevant_features_idx = np.where(p <= self.p_threshold)[0]
                    relevant_features = set(
                        np.array(
                            vectorizer.feature_names_)[relevant_features_idx])

                # merging bag-of-patterns of different window_sizes
                # to single bag-of-patterns with prefix indicating
                # the used window-length
                highest = np.int32(self.highest_bits[ind])
                for j in range(len(bag)):
                    for (key, value) in bag[j].items():
                        # chi-squared test
                        if (not apply_chi_squared) or (key
                                                       in relevant_features):
                            # append the prefixes to the words to
                            # distinguish between window-sizes
                            word = MUSE._shift_left(key, highest, ind,
                                                    self.highest_dim_bit,
                                                    window_size)
                            all_words[j][word] = value

        self.clf = make_pipeline(
            DictVectorizer(sparse=True, sort=False),
            # StandardScaler(with_mean=True, copy=False),
            LogisticRegression(
                max_iter=5000,
                solver="liblinear",
                dual=True,
                # class_weight="balanced",
                penalty="l2",
                random_state=self.random_state,
            ),
        )

        self.clf.fit(all_words, y)
        return self
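The final `self.clf` above is an ordinary scikit-learn pipeline over dictionaries of word counts. A toy run of the same construction, with made-up bags and labels:

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# invented bags of prefixed words and their class labels
all_words = [{"w1_aab": 3, "w2_ba": 1}, {"w1_aab": 1, "w2_ba": 5},
             {"w1_aab": 4}, {"w2_ba": 6}]
y = np.array([0, 1, 0, 1])

clf = make_pipeline(
    DictVectorizer(sparse=True, sort=False),
    LogisticRegression(max_iter=5000, solver="liblinear", dual=True,
                       penalty="l2", random_state=42),
)
clf.fit(all_words, y)
print(clf.predict([{"w1_aab": 2}]))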
Example #11
class IndividualBOSS(BaseClassifier):
    """Single Bag of SFA Symbols (BOSS) classifier

    Bag of SFA Symbols Ensemble: implementation of BOSS from Schaffer :
    @article
    """

    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=True,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size

        self.save_words = save_words
        self.random_state = random_state

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=save_words,
        )
        self.transformed_data = []
        self.accuracy = 0

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualBOSS, self).__init__()

    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X)
        self.transformed_data = sfa[0]  # .iloc[:, 0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        rng = check_random_state(self.random_state)

        classes = []
        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]  # .iloc[:, 0]

        for test_bag in test_bags:
            best_dist = sys.float_info.max
            nn = None

            for n, bag in enumerate(self.transformed_data):
                dist = boss_distance(test_bag, bag, best_dist)

                if dist < best_dist or (dist == best_dist and rng.random() < 0.5):
                    best_dist = dist
                    nn = self.class_vals[n]

            classes.append(nn)

        return np.array(classes)

    def predict_proba(self, X):
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _train_predict(self, train_num):
        test_bag = self.transformed_data[train_num]
        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist:
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _shorten_bags(self, word_len):
        new_boss = IndividualBOSS(
            self.window_size,
            word_len,
            self.norm,
            self.alphabet_size,
            save_words=self.save_words,
            random_state=self.random_state,
        )
        new_boss.transformer = self.transformer
        sfa = self.transformer._shorten_bags(word_len)
        new_boss.transformed_data = sfa[0]  # .iloc[:, 0]

        new_boss.class_vals = self.class_vals
        new_boss.num_classes = self.num_classes
        new_boss.classes_ = self.classes_
        new_boss.class_dictionary = self.class_dictionary

        new_boss._is_fitted = True
        return new_boss

    def _clean(self):
        self.transformer.words = None
        self.transformer.save_words = False

    def _set_word_len(self, word_len):
        self.word_length = word_len
        self.transformer.word_length = word_len
Example #12
class IndividualBOSS(BaseClassifier):
    """Single Bag of SFA Symbols (BOSS) classifier.

    Bag of SFA Symbols Ensemble: implementation of a single BOSS from
    Schäfer, the base classifier for the BOSS ensemble.
    """
    """
    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=True,
        n_jobs=1,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size

        self.save_words = save_words
        self.n_jobs = n_jobs
        self.random_state = random_state

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=save_words,
            n_jobs=n_jobs,
        )
        self.transformed_data = []
        self.accuracy = 0
        self.subsample = []

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualBOSS, self).__init__()

    def fit(self, X, y):
        """Fit a single boss classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : pd.DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X)
        self.transformed_data = sfa[0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        """Predict class values of all instances in X.

        Parameters
        ----------
        X : pd.DataFrame of shape [n, 1]

        Returns
        -------
        array of shape [n, 1]
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]

        classes = Parallel(n_jobs=self.n_jobs)(
            delayed(self._test_nn)(test_bag) for test_bag in test_bags)

        return np.array(classes)

    def predict_proba(self, X):
        """Predict class probabilities for all instances in X.

        Parameters
        ----------
        X : pd.DataFrame of shape [n, 1]

        Returns
        -------
        array of shape [n, self.n_classes]
        """
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _test_nn(self, test_bag):
        rng = check_random_state(self.random_state)

        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist or (dist == best_dist and rng.random() < 0.5):
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _train_predict(self, train_num):
        test_bag = self.transformed_data[train_num]
        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist:
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _shorten_bags(self, word_len):
        new_boss = IndividualBOSS(
            self.window_size,
            word_len,
            self.norm,
            self.alphabet_size,
            save_words=self.save_words,
            random_state=self.random_state,
        )
        new_boss.transformer = self.transformer
        sfa = self.transformer._shorten_bags(word_len)
        new_boss.transformed_data = sfa[0]

        new_boss.class_vals = self.class_vals
        new_boss.num_classes = self.num_classes
        new_boss.classes_ = self.classes_
        new_boss.class_dictionary = self.class_dictionary

        new_boss._is_fitted = True
        return new_boss

    def _clean(self):
        self.transformer.words = None
        self.transformer.save_words = False

    def _set_word_len(self, word_len):
        self.word_length = word_len
        self.transformer.word_length = word_len
Example #13
class IndividualTDE(BaseClassifier):
    """Single TDE classifier, based off the Bag of SFA Symbols (BOSS) model"""
    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        levels=1,
        igb=False,
        alphabet_size=4,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.levels = levels
        self.igb = igb
        self.alphabet_size = alphabet_size

        self.random_state = random_state

        binning_method = "information-gain" if igb else "equi-depth"

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            levels=levels,
            binning_method=binning_method,
            bigrams=True,
            remove_repeat_words=True,
            save_words=False,
        )
        self.transformed_data = []
        self.accuracy = 0

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualTDE, self).__init__()

    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X, y)
        self.transformed_data = sfa[0]  # .iloc[:, 0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        rng = check_random_state(self.random_state)

        classes = []
        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]  # .iloc[:, 0]

        for test_bag in test_bags:
            best_sim = -1
            nn = None

            for n, bag in enumerate(self.transformed_data):
                sim = histogram_intersection(test_bag, bag)

                if sim > best_sim or (sim == best_sim and rng.random() < 0.5):
                    best_sim = sim
                    nn = self.class_vals[n]

            classes.append(nn)

        return np.array(classes)

    def predict_proba(self, X):
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _train_predict(self, train_num):
        test_bag = self.transformed_data[train_num]
        best_sim = -1
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            sim = histogram_intersection(test_bag, bag)

            if sim > best_sim:
                best_sim = sim
                nn = self.class_vals[n]

        return nn
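`histogram_intersection`, the similarity used for the 1-NN above, is the standard intersection of two count histograms: the sum of per-word minima. A sketch under that reading, not sktime's actual implementation:

def histogram_intersection_sketch(first, second):
    """Histogram intersection of two word-count dicts: sum of minima."""
    return sum(
        min(count, second.get(word, 0)) for word, count in first.items()
    )

# histogram_intersection_sketch({"ab": 2, "ba": 3}, {"ab": 1, "ba": 5}) == 4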
Example #14
class IndividualBOSS(BaseClassifier):
    """Single bag of Symbolic Fourier Approximation Symbols (IndividualBOSS).

    Bag of SFA Symbols Ensemble: implementation of a single BOSS from
    Schäfer, the base classifier for the BOSS ensemble.

    Implementation of single BOSS model from Schäfer (2015). [1]_

    This is the underlying classifier for each classifier in the BOSS ensemble.

    Overview: input "n" series of length "m" and IndividualBOSS performs an SFA
    transform to form a sparse dictionary of discretised words. The resulting
    dictionary is used with the BOSS distance function in a 1-nearest neighbor.

    Fit involves finding "n" histograms.

    Predict uses 1 nearest neighbor with a bespoke BOSS distance function.

    Parameters
    ----------
    window_size : int
        Size of the window to use in BOSS algorithm.
    word_length : int
        Length of word to use in BOSS algorithm.
    norm : bool, default = False
        Whether to normalize words by dropping the first Fourier coefficient.
    alphabet_size : int, default = 4
        Number of possible letters (values) for each word.
    save_words : bool, default = True
        Whether to keep NumPy array of words in SFA transformation even after
        the dictionary of words is returned. If True, the array is saved, which
        can shorten the time to calculate dictionaries using a shorter
        `word_length` (since the last "n" letters can be removed).
    n_jobs : int, default=1
        The number of jobs to run in parallel for both `fit` and `predict`.
        ``-1`` means using all processors.
    random_state : int or None, default=None
        Seed for the random number generator.

    Attributes
    ----------
    n_classes : int
        Number of classes. Extracted from the data.
    n_instances : int
        Number of instances. Extracted from the data.
    n_estimators : int
        The final number of classifiers used. Will be <= `max_ensemble_size` if
        `max_ensemble_size` has been specified.
    series_length : int
        Length of all series (assumed equal).
    class_dictionary: dict
        Dictionary of classes. Extracted from the data.

    See Also
    --------
    BOSSEnsemble, ContractableBOSS

    Notes
    -----
    For the Java version, see
    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/main/java/
    tsml/classifiers/dictionary_based/BOSS.java>`_.

    References
    ----------
    .. [1] Patrick Schäfer, "The BOSS is concerned with time series classification
       in the presence of noise", Data Mining and Knowledge Discovery, 29(6): 2015
       https://link.springer.com/article/10.1007/s10618-014-0377-7

    Examples
    --------
    >>> from sktime.classification.dictionary_based import IndividualBOSS
    >>> from sktime.datasets import load_italy_power_demand
    >>> X_train, y_train = load_italy_power_demand(split="train", return_X_y=True)
    >>> X_test, y_test = load_italy_power_demand(split="test", return_X_y=True)
    >>> clf = IndividualBOSS()
    >>> clf.fit(X_train, y_train)
    IndividualBOSS(...)
    >>> y_pred = clf.predict(X_test)
    """
    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=True,
        n_jobs=1,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size

        self.save_words = save_words
        self.n_jobs = n_jobs
        self.random_state = random_state

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=save_words,
            n_jobs=n_jobs,
        )
        self.transformed_data = []
        self.accuracy = 0
        self.subsample = []

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualBOSS, self).__init__()

    def fit(self, X, y):
        """Fit a single boss classifier on n_instances cases (X,y).

        Parameters
        ----------
        X : pd.DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X)
        self.transformed_data = sfa[0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        """Predict class values of all instances in X.

        Parameters
        ----------
        X : pd.DataFrame of shape [n, 1]

        Returns
        -------
        array of shape [n, 1]
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]

        classes = Parallel(n_jobs=self.n_jobs)(
            delayed(self._test_nn)(test_bag) for test_bag in test_bags)

        return np.array(classes)

    def predict_proba(self, X):
        """Predict class probabilities for all instances in X.

        Parameters
        ----------
        X : pd.DataFrame of shape [n, 1]

        Returns
        -------
        dists : array of shape [n, self.n_classes]
        """
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _test_nn(self, test_bag):
        rng = check_random_state(self.random_state)

        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist or (dist == best_dist and rng.random() < 0.5):
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _train_predict(self, train_num):
        test_bag = self.transformed_data[train_num]
        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist:
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _shorten_bags(self, word_len):
        new_boss = IndividualBOSS(
            self.window_size,
            word_len,
            self.norm,
            self.alphabet_size,
            save_words=self.save_words,
            random_state=self.random_state,
        )
        new_boss.transformer = self.transformer
        sfa = self.transformer._shorten_bags(word_len)
        new_boss.transformed_data = sfa[0]

        new_boss.class_vals = self.class_vals
        new_boss.num_classes = self.num_classes
        new_boss.classes_ = self.classes_
        new_boss.class_dictionary = self.class_dictionary

        new_boss._is_fitted = True
        return new_boss

    def _clean(self):
        self.transformer.words = None
        self.transformer.save_words = False

    def _set_word_len(self, word_len):
        self.word_length = word_len
        self.transformer.word_length = word_len