Ejemplo n.º 1
0
    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=True,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size

        self.save_words = save_words
        self.random_state = random_state

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=save_words,
        )
        self.transformed_data = []
        self.accuracy = 0

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualBOSS, self).__init__()
Ejemplo n.º 2
0
        def _parallel_fit(window_size, ):
            rng = check_random_state(window_size)
            all_words = [dict() for x in range(len(X))]
            relevant_features_count = 0

            # for window_size in self.window_sizes:
            transformer = SFA(
                word_length=rng.choice(self.word_lengths),
                alphabet_size=self.alphabet_size,
                window_size=window_size,
                norm=rng.choice(self.norm_options),
                anova=self.anova,
                # levels=rng.choice([1, 2, 3]),
                binning_method=self.binning_strategy,
                bigrams=self.bigrams,
                remove_repeat_words=False,
                lower_bounding=False,
                save_words=False,
            )

            sfa_words = transformer.fit_transform(X, y)

            # self.SFA_transformers.append(transformer)
            bag = sfa_words[0]
            apply_chi_squared = self.p_threshold < 1

            # chi-squared test to keep only relevant features
            if apply_chi_squared:
                vectorizer = DictVectorizer(sparse=True,
                                            dtype=np.int32,
                                            sort=False)
                bag_vec = vectorizer.fit_transform(bag)

                chi2_statistics, p = chi2(bag_vec, y)
                relevant_features_idx = np.where(p <= self.p_threshold)[0]
                relevant_features = set(
                    np.array(vectorizer.feature_names_)[relevant_features_idx])
                relevant_features_count += len(relevant_features_idx)

                # merging bag-of-patterns of different window_sizes
                # to single bag-of-patterns with prefix indicating
                # the used window-length
                for j in range(len(bag)):
                    for (key, value) in bag[j].items():
                        # chi-squared test
                        if (not apply_chi_squared) or (key
                                                       in relevant_features):
                            # append the prefixes to the words to
                            # distinguish between window-sizes
                            word = WEASEL.shift_left(key, self.highest_bit,
                                                     window_size)
                            all_words[j][word] = value

                return all_words, transformer, relevant_features_count
Ejemplo n.º 3
0
    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        levels=1,
        igb=False,
        alphabet_size=4,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.levels = levels
        self.igb = igb
        self.alphabet_size = alphabet_size

        self.random_state = random_state

        binning_method = "information-gain" if igb else "equi-depth"

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            levels=levels,
            binning_method=binning_method,
            bigrams=True,
            remove_repeat_words=True,
            save_words=False,
        )
        self.transformed_data = []
        self.accuracy = 0

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualTDE, self).__init__()
Ejemplo n.º 4
0
class IndividualTDE(BaseClassifier):
    """Single TDE classifier, based off the Bag of SFA Symbols (BOSS) model"""
    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        levels=1,
        igb=False,
        alphabet_size=4,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.levels = levels
        self.igb = igb
        self.alphabet_size = alphabet_size

        self.random_state = random_state

        binning_method = "information-gain" if igb else "equi-depth"

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            levels=levels,
            binning_method=binning_method,
            bigrams=True,
            remove_repeat_words=True,
            save_words=False,
        )
        self.transformed_data = []
        self.accuracy = 0

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualTDE, self).__init__()

    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X, y)
        self.transformed_data = sfa[0]  # .iloc[:, 0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        rng = check_random_state(self.random_state)

        classes = []
        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]  # .iloc[:, 0]

        for test_bag in test_bags:
            best_sim = -1
            nn = None

            for n, bag in enumerate(self.transformed_data):
                sim = histogram_intersection(test_bag, bag)

                if sim > best_sim or (sim == best_sim and rng.random() < 0.5):
                    best_sim = sim
                    nn = self.class_vals[n]

            classes.append(nn)

        return np.array(classes)

    def predict_proba(self, X):
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _train_predict(self, train_num):
        test_bag = self.transformed_data[train_num]
        best_sim = -1
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            sim = histogram_intersection(test_bag, bag)

            if sim > best_sim:
                best_sim = sim
                nn = self.class_vals[n]

        return nn
Ejemplo n.º 5
0
    def fit(self, X, y):
        """Build a WEASEL+MUSE classifiers from the training set (X, y),

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_instances, 1]
            Nested dataframe with univariate time-series in cells.
        y : array-like, shape = [n_instances] The class labels.

        Returns
        -------
        self : object
        """

        X, y = check_X_y(X, y, coerce_to_pandas=True)
        y = np.asarray(y)

        # add first order differences in each dimension to TS
        if self.use_first_order_differences:
            X = self.add_first_order_differences(X)

        # Window length parameter space dependent on series length
        self.col_names = X.columns

        rng = check_random_state(self.random_state)

        self.n_dims = len(self.col_names)
        self.highest_dim_bit = (math.ceil(math.log2(self.n_dims))) + 1
        self.highest_bits = np.zeros(self.n_dims)

        self.SFA_transformers = [[] for _ in range(self.n_dims)]

        # the words of all dimensions and all time series
        all_words = [dict() for _ in range(X.shape[0])]

        # On each dimension, perform SFA
        for ind, column in enumerate(self.col_names):
            X_dim = X[[column]]
            X_dim = from_nested_to_3d_numpy(X_dim)
            series_length = X_dim.shape[
                -1]  # TODO compute minimum over all ts ?

            # increment window size in steps of 'win_inc'
            win_inc = self.compute_window_inc(series_length)

            self.max_window = int(min(series_length, self.max_window))
            self.window_sizes.append(
                list(range(self.min_window, self.max_window, win_inc)))

            self.highest_bits[ind] = math.ceil(math.log2(self.max_window)) + 1

            for window_size in self.window_sizes[ind]:

                transformer = SFA(
                    word_length=rng.choice(self.word_lengths),
                    alphabet_size=self.alphabet_size,
                    window_size=window_size,
                    norm=rng.choice(self.norm_options),
                    anova=self.anova,
                    binning_method=rng.choice(self.binning_strategies),
                    bigrams=self.bigrams,
                    remove_repeat_words=False,
                    lower_bounding=False,
                    save_words=False,
                )

                sfa_words = transformer.fit_transform(X_dim, y)

                self.SFA_transformers[ind].append(transformer)
                bag = sfa_words[0]

                # chi-squared test to keep only relevant features
                relevant_features = {}
                apply_chi_squared = self.p_threshold < 1
                if apply_chi_squared:
                    vectorizer = DictVectorizer(sparse=True,
                                                dtype=np.int32,
                                                sort=False)
                    bag_vec = vectorizer.fit_transform(bag)

                    chi2_statistics, p = chi2(bag_vec, y)
                    relevant_features_idx = np.where(p <= self.p_threshold)[0]
                    relevant_features = set(
                        np.array(
                            vectorizer.feature_names_)[relevant_features_idx])

                # merging bag-of-patterns of different window_sizes
                # to single bag-of-patterns with prefix indicating
                # the used window-length
                highest = np.int32(self.highest_bits[ind])
                for j in range(len(bag)):
                    for (key, value) in bag[j].items():
                        # chi-squared test
                        if (not apply_chi_squared) or (key
                                                       in relevant_features):
                            # append the prefices to the words to
                            # distinguish between window-sizes
                            word = MUSE.shift_left(key, highest, ind,
                                                   self.highest_dim_bit,
                                                   window_size)
                            all_words[j][word] = value

        self.clf = make_pipeline(
            DictVectorizer(sparse=True, sort=False),
            # StandardScaler(with_mean=True, copy=False),
            LogisticRegression(
                max_iter=5000,
                solver="liblinear",
                dual=True,
                # class_weight="balanced",
                penalty="l2",
                random_state=self.random_state,
            ),
        )

        self.clf.fit(all_words, y)
        self._is_fitted = True
        return self
Ejemplo n.º 6
0
class IndividualBOSS(BaseClassifier):
    """Single Bag of SFA Symbols (BOSS) classifier

    Bag of SFA Symbols Ensemble: implementation of BOSS from Schaffer :
    @article
    """
    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=True,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size

        self.save_words = save_words
        self.random_state = random_state

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=save_words,
        )
        self.transformed_data = []
        self.accuracy = 0

        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualBOSS, self).__init__()

    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X)
        self.transformed_data = sfa[0]  # .iloc[:, 0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        rng = check_random_state(self.random_state)

        classes = []
        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]  # .iloc[:, 0]

        for test_bag in test_bags:
            best_dist = sys.float_info.max
            nn = None

            for n, bag in enumerate(self.transformed_data):
                dist = boss_distance(test_bag, bag, best_dist)

                if dist < best_dist or (dist == best_dist
                                        and rng.random() < 0.5):
                    best_dist = dist
                    nn = self.class_vals[n]

            classes.append(nn)

        return np.array(classes)

    def predict_proba(self, X):
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _train_predict(self, train_num):
        test_bag = self.transformed_data[train_num]
        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist:
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _shorten_bags(self, word_len):
        new_boss = IndividualBOSS(
            self.window_size,
            word_len,
            self.norm,
            self.alphabet_size,
            save_words=self.save_words,
            random_state=self.random_state,
        )
        new_boss.transformer = self.transformer
        sfa = self.transformer._shorten_bags(word_len)
        new_boss.transformed_data = sfa[0]  # .iloc[:, 0]

        new_boss.class_vals = self.class_vals
        new_boss.num_classes = self.num_classes
        new_boss.classes_ = self.classes_
        new_boss.class_dictionary = self.class_dictionary

        new_boss._is_fitted = True
        return new_boss

    def _clean(self):
        self.transformer.words = None
        self.transformer.save_words = False

    def _set_word_len(self, word_len):
        self.word_length = word_len
        self.transformer.word_length = word_len