Exemple #1
0
    def __init__(self,
                 window_size=10,
                 word_length=8,
                 norm=False,
                 alphabet_size=4,
                 save_words=True,
                 random_state=None
                 ):
        """Store the hyper-parameters and create the SFA transformer.

        Parameters
        ----------
        window_size : default=10
            Forwarded to the SFA transformer as ``window_size``.
        word_length : default=8
            Forwarded to the SFA transformer as ``word_length``.
        norm : default=False
            Forwarded to the SFA transformer as ``norm``.
        alphabet_size : default=4
            Forwarded to the SFA transformer as ``alphabet_size``.
        save_words : default=True
            Forwarded to the SFA transformer as ``save_words``.
        random_state : default=None
            Only stored here; not used by the constructor itself.
        """
        # Constructor arguments are kept verbatim as attributes.
        self.window_size, self.word_length = window_size, word_length
        self.norm, self.alphabet_size = norm, alphabet_size
        self.save_words, self.random_state = save_words, random_state

        # BOSS configuration of SFA: repeated words are removed and
        # bigrams are switched off.
        sfa_options = {
            "word_length": word_length,
            "alphabet_size": alphabet_size,
            "window_size": window_size,
            "norm": norm,
            "remove_repeat_words": True,
            "bigrams": False,
            "save_words": save_words,
        }
        self.transformer = SFA(**sfa_options)

        # Placeholders for state that is filled in outside the constructor.
        self.transformed_data = []
        self.accuracy = 0
        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}

        super(BOSSIndividual, self).__init__()
Exemple #2
0
    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        levels=1,
        igb=False,
        alphabet_size=4,
        random_state=None,
    ):
        """Store the hyper-parameters and create the SFA transformer.

        Parameters
        ----------
        window_size : default=10
            Forwarded to the SFA transformer as ``window_size``.
        word_length : default=8
            Forwarded to the SFA transformer as ``word_length``.
        norm : default=False
            Forwarded to the SFA transformer as ``norm``.
        levels : default=1
            Forwarded to the SFA transformer as ``levels``.
        igb : default=False
            Selects the SFA binning method: ``"information-gain"`` when
            truthy, ``"equi-depth"`` otherwise.
        alphabet_size : default=4
            Forwarded to the SFA transformer as ``alphabet_size``.
        random_state : default=None
            Only stored here; not used by the constructor itself.
        """
        # Constructor arguments are kept verbatim as attributes.
        self.window_size, self.word_length = window_size, word_length
        self.norm, self.levels, self.igb = norm, levels, igb
        self.alphabet_size = alphabet_size
        self.random_state = random_state

        if igb:
            binning_method = "information-gain"
        else:
            binning_method = "equi-depth"

        # TDE configuration of SFA: bigrams on, repeated words removed,
        # words not saved, and no pandas Series output requested.
        sfa_options = {
            "word_length": word_length,
            "alphabet_size": alphabet_size,
            "window_size": window_size,
            "norm": norm,
            "levels": levels,
            "binning_method": binning_method,
            "bigrams": True,
            "remove_repeat_words": True,
            "return_pandas_data_series": False,
            "save_words": False,
        }
        self.transformer = SFA(**sfa_options)

        # Placeholders for state that is filled in outside the constructor.
        self.transformed_data = []
        self.accuracy = 0
        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}

        super(IndividualTDE, self).__init__()
Exemple #3
0
    def fit(self, X, y):
        """Build a WEASEL+MUSE classifier from the training set (X, y).

        Every column of X is processed as one dimension: for each dimension
        and each window size an SFA transformer is fitted, its
        bag-of-patterns is optionally filtered with a chi-squared test, and
        the surviving words are merged into a single bag per instance. A
        scaled logistic-regression pipeline is then trained on those bags.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, n_dims]
            Nested dataframe with time-series in cells, one column per
            dimension.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """

        X, y = check_X_y(X, y, coerce_to_pandas=True)
        y = np.asarray(y)

        # add first order differences in each dimension to TS
        if self.use_first_order_differences:
            X = self.add_first_order_differences(X)

        self.col_names = X.columns

        rng = check_random_state(self.random_state)

        self.n_dims = len(self.col_names)
        self.highest_dim_bit = (math.ceil(math.log2(self.n_dims))) + 1
        self.highest_bits = np.zeros(self.n_dims)

        # Start from empty per-dimension state: window_sizes is indexed by
        # dimension below, so entries left over from an earlier call to
        # fit() would otherwise be picked up instead of the new ones.
        self.window_sizes = []
        self.SFA_transformers = [[] for _ in range(self.n_dims)]

        # the words of all dimensions and all time series
        all_words = [{} for _ in range(X.shape[0])]

        # chi-squared filtering is only active for a positive threshold
        apply_chi_squared = self.chi2_threshold > 0

        # On each dimension, perform SFA
        for ind, column in enumerate(self.col_names):
            X_dim = X[[column]]
            X_dim = from_nested_to_3d_numpy(X_dim)
            series_length = X_dim.shape[
                -1]  # TODO compute minimum over all ts ?

            # Window length parameter space dependent on series length:
            # increment window size in steps of 'win_inc'
            win_inc = self.compute_window_inc(series_length)

            self.max_window = int(min(series_length, self.max_window))
            self.window_sizes.append(
                list(range(self.min_window, self.max_window, win_inc)))

            self.highest_bits[ind] = math.ceil(math.log2(self.max_window)) + 1
            highest = np.int32(self.highest_bits[ind])

            for window_size in self.window_sizes[ind]:

                transformer = SFA(
                    word_length=rng.choice(self.word_lengths),
                    alphabet_size=self.alphabet_size,
                    window_size=window_size,
                    norm=rng.choice(self.norm_options),
                    anova=self.anova,
                    binning_method=rng.choice(self.binning_strategies),
                    bigrams=self.bigrams,
                    remove_repeat_words=False,
                    lower_bounding=False,
                    save_words=False,
                )

                sfa_words = transformer.fit_transform(X_dim, y)

                self.SFA_transformers[ind].append(transformer)
                bag = sfa_words[0]  # .iloc[:, 0]

                # chi-squared test to keep only relevant features
                relevant_features = set()
                if apply_chi_squared:
                    vectorizer = DictVectorizer(sparse=False)
                    bag_vec = vectorizer.fit_transform(bag)
                    chi2_statistics, _ = chi2(bag_vec, y)
                    relevant_idx = np.where(
                        chi2_statistics >= self.chi2_threshold)[0]
                    # np.where yields column positions of bag_vec; map
                    # them back to the words they stand for, because the
                    # membership test below is done on the words.
                    relevant_features = {
                        vectorizer.feature_names_[idx]
                        for idx in relevant_idx
                    }

                # merging bag-of-patterns of different window_sizes
                # to single bag-of-patterns with prefix indicating
                # the used window-length
                for j, word_counts in enumerate(bag):
                    for key, value in word_counts.items():
                        # chi-squared test
                        if apply_chi_squared and key not in relevant_features:
                            continue
                        # append the prefixes to the words to
                        # distinguish between window-sizes
                        word = MUSE.shift_left(key, highest, ind,
                                               self.highest_dim_bit,
                                               window_size)

                        all_words[j][word] = value

        self.clf = make_pipeline(
            DictVectorizer(sparse=False),
            StandardScaler(with_mean=True, copy=False),
            LogisticRegression(
                max_iter=5000,
                solver="liblinear",
                dual=True,
                # class_weight="balanced",
                penalty="l2",
                random_state=self.random_state,
            ),
        )

        self.clf.fit(all_words, y)
        self._is_fitted = True
        return self
Exemple #4
0
    def fit(self, X, y):
        """Build a WEASEL classifier from the training set (X, y).

        For each window size an SFA transformer is fitted, its
        bag-of-patterns is optionally filtered with a chi-squared test, and
        the surviving words are merged into a single bag per instance. A
        scaled logistic-regression pipeline is then trained on those bags.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        # Window length parameter space dependent on series length
        self.n_instances, self.series_length = X.shape[0], X.shape[-1]

        win_inc = self.compute_window_inc()

        self.max_window = int(min(self.series_length, self.max_window))
        self.window_sizes = list(
            range(self.min_window, self.max_window, win_inc))

        self.highest_bit = (math.ceil(math.log2(self.max_window))) + 1
        rng = check_random_state(self.random_state)

        # Drop transformers from any earlier call to fit(); they are only
        # appended to below and would otherwise no longer line up with
        # self.window_sizes.
        self.SFA_transformers = []

        all_words = [{} for _ in range(len(X))]

        # chi-squared filtering is only active for a positive threshold
        apply_chi_squared = self.chi2_threshold > 0

        for window_size in self.window_sizes:

            transformer = SFA(
                word_length=rng.choice(self.word_lengths),
                alphabet_size=self.alphabet_size,
                window_size=window_size,
                norm=rng.choice(self.norm_options),
                anova=self.anova,
                # levels=rng.choice([1, 2, 3]),
                binning_method=self.binning_strategy,
                bigrams=self.bigrams,
                remove_repeat_words=False,
                lower_bounding=False,
                save_words=False,
            )

            sfa_words = transformer.fit_transform(X, y)

            self.SFA_transformers.append(transformer)
            bag = sfa_words[0]  # .iloc[:, 0]

            # chi-squared test to keep only relevant features
            relevant_features = set()
            if apply_chi_squared:
                vectorizer = DictVectorizer(sparse=False)
                bag_vec = vectorizer.fit_transform(bag)
                chi2_statistics, _ = chi2(bag_vec, y)
                relevant_idx = np.where(
                    chi2_statistics >= self.chi2_threshold)[0]
                # np.where yields column positions of bag_vec; map them
                # back to the words they stand for, because the membership
                # test below is done on the words.
                relevant_features = {
                    vectorizer.feature_names_[idx] for idx in relevant_idx
                }

            # merging bag-of-patterns of different window_sizes
            # to single bag-of-patterns with prefix indicating
            # the used window-length
            for j, word_counts in enumerate(bag):
                for key, value in word_counts.items():
                    # chi-squared test
                    if apply_chi_squared and key not in relevant_features:
                        continue
                    # append the prefixes to the words to
                    # distinguish between window-sizes
                    if isinstance(key, tuple):
                        # two-part key: pack both parts into one integer
                        word = (((key[0] << self.highest_bit) | key[1]) <<
                                3) | window_size
                    else:
                        word = WEASEL.shift_left(key, self.highest_bit,
                                                 window_size)

                    all_words[j][word] = value

        self.clf = make_pipeline(
            DictVectorizer(sparse=False),
            StandardScaler(with_mean=True, copy=False),
            LogisticRegression(
                max_iter=5000,
                solver="liblinear",
                dual=True,
                # class_weight="balanced",
                penalty="l2",
                random_state=self.random_state,
            ),
        )

        self.clf.fit(all_words, y)
        self._is_fitted = True
        return self
Exemple #5
0
class IndividualTDE(BaseClassifier):
    """Single TDE classifier, based on the Bag of SFA Symbols (BOSS) model.

    Training series are turned into SFA word bags, and a query series is
    given the label of the training bag with the largest
    ``histogram_intersection`` similarity.
    """

    def __init__(self,
                 window_size=10,
                 word_length=8,
                 norm=False,
                 levels=1,
                 igb=False,
                 alphabet_size=4,
                 random_state=None):
        """Store the hyper-parameters and create the SFA transformer.

        ``igb`` selects the SFA binning method (``"information-gain"`` when
        truthy, ``"equi-depth"`` otherwise); the remaining arguments except
        ``random_state`` are forwarded to SFA under the same names.
        """
        # Constructor arguments are kept verbatim as attributes.
        self.window_size, self.word_length = window_size, word_length
        self.norm, self.levels, self.igb = norm, levels, igb
        self.alphabet_size = alphabet_size
        self.random_state = random_state

        if igb:
            binning_method = "information-gain"
        else:
            binning_method = "equi-depth"

        sfa_options = {
            "word_length": word_length,
            "alphabet_size": alphabet_size,
            "window_size": window_size,
            "norm": norm,
            "levels": levels,
            "binning_method": binning_method,
            "bigrams": True,
            "remove_repeat_words": True,
            "save_words": False,
        }
        self.transformer = SFA(**sfa_options)

        # Placeholders for state that fit() fills in.
        self.transformed_data = []
        self.accuracy = 0
        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}

        super(IndividualTDE, self).__init__()

    def fit(self, X, y):
        """Fit the SFA transformer and store the training bags and labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True)

        # First column of the transformer output holds one bag per series;
        # each is stored as a plain dict.
        bag_column = self.transformer.fit_transform(X, y).iloc[:, 0]
        self.transformed_data = [bag.to_dict() for bag in bag_column]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        # Map every class label to its position in classes_.
        self.class_dictionary.update(
            (label, position) for position, label in enumerate(self.classes_)
        )

        self._is_fitted = True
        return self

    def predict(self, X):
        """Return, for each series in X, the label of its most similar
        training bag; exact ties are broken with a random coin flip.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)

        rng = check_random_state(self.random_state)

        query_column = self.transformer.transform(X).iloc[:, 0]
        query_bags = [bag.to_dict() for bag in query_column]

        predictions = []
        for query_bag in query_bags:
            top_sim = -1
            label = None

            for n, train_bag in enumerate(self.transformed_data):
                sim = histogram_intersection(query_bag, train_bag)

                # The random draw happens only on an exact tie.
                if sim > top_sim:
                    accept = True
                elif sim == top_sim:
                    accept = rng.random() < 0.5
                else:
                    accept = False

                if accept:
                    top_sim = sim
                    label = self.class_vals[n]

            predictions.append(label)

        return np.array(predictions)

    def predict_proba(self, X):
        """Return a [n_instances, num_classes] array with a 1 in the column
        of each predicted class and 0 elsewhere.
        """
        preds = self.predict(X)
        n_cases = X.shape[0]
        dists = np.zeros((n_cases, self.num_classes))

        for row in range(n_cases):
            column = self.class_dictionary.get(preds[row])
            dists[row, column] += 1

        return dists

    def _train_predict(self, train_num):
        """Predict the label of training bag ``train_num`` from all other
        training bags (the bag itself is left out of the comparison).
        """
        query_bag = self.transformed_data[train_num]
        top_sim = -1
        label = None

        for n, train_bag in enumerate(self.transformed_data):
            if n != train_num:
                sim = histogram_intersection(query_bag, train_bag)

                # Strict comparison: the first best match wins on ties.
                if sim > top_sim:
                    top_sim = sim
                    label = self.class_vals[n]

        return label
Exemple #6
0
class BOSSIndividual(BaseClassifier):
    """Single Bag of SFA Symbols (BOSS) classifier.

    Building block of the Bag of SFA Symbols ensemble (BOSS, Schäfer).
    Training series are turned into SFA word bags, and a query series is
    given the label of the training bag with the smallest ``boss_distance``.
    """

    def __init__(self,
                 window_size=10,
                 word_length=8,
                 norm=False,
                 alphabet_size=4,
                 save_words=True,
                 random_state=None
                 ):
        """Store the hyper-parameters and create the SFA transformer.

        All arguments except ``random_state`` are forwarded to SFA under
        the same names; ``random_state`` is only stored.
        """
        # Constructor arguments are kept verbatim as attributes.
        self.window_size, self.word_length = window_size, word_length
        self.norm, self.alphabet_size = norm, alphabet_size
        self.save_words, self.random_state = save_words, random_state

        sfa_options = {
            "word_length": word_length,
            "alphabet_size": alphabet_size,
            "window_size": window_size,
            "norm": norm,
            "remove_repeat_words": True,
            "bigrams": False,
            "save_words": save_words,
        }
        self.transformer = SFA(**sfa_options)

        # Placeholders for state that fit() fills in.
        self.transformed_data = []
        self.accuracy = 0
        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}

        super(BOSSIndividual, self).__init__()

    def fit(self, X, y):
        """Fit the SFA transformer and store the training bags and labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, enforce_univariate=True)

        # The labels are not passed to the transformer here.
        word_bags = self.transformer.fit_transform(X)
        self.transformed_data = word_bags.iloc[:, 0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        # Map every class label to its position in classes_.
        self.class_dictionary.update(
            (label, position) for position, label in enumerate(self.classes_)
        )

        self._is_fitted = True
        return self

    def predict(self, X):
        """Return, for each series in X, the label of its closest training
        bag; exact ties are broken with a random coin flip.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)

        rng = check_random_state(self.random_state)

        query_bags = self.transformer.transform(X).iloc[:, 0]

        predictions = []
        for query_bag in query_bags:
            closest = sys.float_info.max
            label = None

            for n, train_bag in enumerate(self.transformed_data):
                # The current best distance is handed to boss_distance as
                # its third argument.
                dist = boss_distance(query_bag, train_bag, closest)

                # The random draw happens only on an exact tie.
                if dist < closest:
                    accept = True
                elif dist == closest:
                    accept = rng.random() < 0.5
                else:
                    accept = False

                if accept:
                    closest = dist
                    label = self.class_vals[n]

            predictions.append(label)

        return np.array(predictions)

    def predict_proba(self, X):
        """Return a [n_instances, num_classes] array with a 1 in the column
        of each predicted class and 0 elsewhere.
        """
        preds = self.predict(X)
        n_cases = X.shape[0]
        dists = np.zeros((n_cases, self.num_classes))

        for row in range(n_cases):
            column = self.class_dictionary.get(preds[row])
            dists[row, column] += 1

        return dists

    def _train_predict(self, train_num):
        """Predict the label of training bag ``train_num`` from all other
        training bags (the bag itself is left out of the comparison).
        """
        query_bag = self.transformed_data[train_num]
        closest = sys.float_info.max
        label = None

        for n, train_bag in enumerate(self.transformed_data):
            if n != train_num:
                dist = boss_distance(query_bag, train_bag, closest)

                # Strict comparison: the first best match wins on ties.
                if dist < closest:
                    closest = dist
                    label = self.class_vals[n]

        return label

    def _shorten_bags(self, word_len):
        """Return a fitted copy of this classifier whose bags use words of
        length ``word_len``.

        The copy shares this instance's transformer, labels and class
        mapping; only the bags come from the transformer's
        ``_shorten_bags``.
        """
        shortened = BOSSIndividual(window_size=self.window_size,
                                   word_length=word_len,
                                   norm=self.norm,
                                   alphabet_size=self.alphabet_size,
                                   save_words=self.save_words,
                                   random_state=self.random_state)
        # Replace the freshly built transformer with the fitted one.
        shortened.transformer = self.transformer
        shorter_bags = self.transformer._shorten_bags(word_len)
        shortened.transformed_data = shorter_bags.iloc[:, 0]

        shortened.class_vals = self.class_vals
        shortened.num_classes = self.num_classes
        shortened.classes_ = self.classes_
        shortened.class_dictionary = self.class_dictionary

        shortened._is_fitted = True
        return shortened

    def _clean(self):
        """Drop the transformer's saved words and stop saving new ones."""
        sfa = self.transformer
        sfa.words = None
        sfa.save_words = False

    def _set_word_len(self, word_len):
        """Set the word length on this classifier and on its transformer."""
        self.word_length = word_len
        self.transformer.word_length = word_len
Exemple #7
0
    def fit(self, X, y):
        """Build a WEASEL classifier from the training set (X, y).

        Every combination of ``norm_options`` and ``word_lengths`` is
        tried. For each one, an SFA transformer is fitted per window size,
        the resulting bags are merged into one bag per instance, and a
        logistic regression is scored with 5-fold cross-validation. The
        best-scoring combination is kept and its classifier is fitted on
        the full training bags.

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """

        X, y = check_X_y(X, y, enforce_univariate=True)
        y = y.values if isinstance(y, pd.Series) else y

        # Window length parameter space dependent on series length
        self.n_instances, self.series_length = X.shape[0], len(X.iloc[0, 0])
        self.max_window = min(self.series_length, self.max_window)
        self.window_sizes = list(
            range(self.min_window, self.max_window, self.win_inc))

        max_acc = -1
        self.highest_bit = (math.ceil(math.log2(self.max_window)) + 1)

        final_bag_vec = None

        for norm in self.norm_options:
            for word_length in self.word_lengths:
                all_words = [{} for _ in range(len(X))]
                transformers = []

                for window_size in self.window_sizes:
                    transformer = SFA(word_length=np.max(word_length),
                                      alphabet_size=self.alphabet_size,
                                      window_size=window_size,
                                      norm=norm,
                                      anova=self.anova,
                                      binning_method=self.binning_strategy,
                                      bigrams=self.bigrams,
                                      remove_repeat_words=False,
                                      lower_bounding=False,
                                      save_words=False)
                    sfa_words = transformer.fit_transform(X, y)
                    transformers.append(transformer)

                    # TODO refactor? dicts not really needed here ...
                    bag = sfa_words.iloc[:, 0]

                    # merging bag-of-patterns of different window_sizes
                    # to single bag-of-patterns with prefix indicating
                    # the used window-length
                    for j, word_counts in enumerate(bag):
                        for key, value in word_counts.items():
                            # append the prefixes to the words to
                            # distinguish between window-sizes
                            word = (key << self.highest_bit) | window_size
                            all_words[j][word] = value

                # TODO use CountVectorizer instead on actual words ... ???
                vectorizer = DictVectorizer(sparse=True)
                bag_vec = vectorizer.fit_transform(all_words)

                clf = LogisticRegression(max_iter=5000,
                                         solver="liblinear",
                                         dual=True,
                                         penalty="l2",
                                         random_state=self.random_state)
                current_acc = cross_val_score(clf, bag_vec, y, cv=5).mean()

                # Strict comparison: on equal accuracy the earlier
                # configuration is kept.
                if current_acc > max_acc:
                    max_acc = current_acc
                    self.vectorizer = vectorizer
                    self.clf = clf
                    self.SFA_transformers = transformers
                    self.best_word_length = word_length
                    final_bag_vec = bag_vec

                if max_acc == 1.0:
                    break  # there can be no better model than 1.0

            # Leave the norm loop as well: with a perfect score no later
            # configuration can pass the strict comparison above, so
            # continuing would only spend time fitting unused models.
            if max_acc == 1.0:
                break

        # clf was only cross-validated so far; fit it on all training bags
        self.clf.fit(final_bag_vec, y)
        self._is_fitted = True
        return self