Example #1
import numpy as np
import pytest
from sklearn.model_selection import cross_validate
from tslearn.utils import to_time_series


def test_shapelets():
    # Skip this test entirely when keras is not installed
    pytest.importorskip('keras')
    from tslearn.shapelets import ShapeletModel

    n, sz, d = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    y = rng.randint(2, size=n)
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer="sgd",
                        random_state=0)
    clf.fit(time_series, y)
    np.testing.assert_allclose(clf.shapelets_[0],
                               np.array([[0.56373, 0.494684],
                                         [1.235707, 1.119235]]),
                               atol=1e-2)
    np.testing.assert_allclose(
        clf.predict(time_series),
        np.array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0]))

    cross_validate(clf, time_series, y, cv=2)

    model = ShapeletModel(n_shapelets_per_size={3: 2, 4: 1}, max_iter=1)
    model.fit(time_series, y)
    for shp, shp_bis in zip(model.shapelets_, model.shapelets_as_time_series_):
        np.testing.assert_allclose(shp,
                                   to_time_series(shp_bis, remove_nans=True))
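
The final assertion above relies on tslearn padding variable-length shapelets
with NaNs inside shapelets_as_time_series_; a minimal standalone sketch of that
relationship (values hypothetical):

import numpy as np
from tslearn.utils import to_time_series

# a length-2 univariate shapelet, NaN-padded to length 4
padded = np.array([[1.0], [2.0], [np.nan], [np.nan]])
trimmed = to_time_series(padded, remove_nans=True)
print(trimmed.shape)  # (2, 1)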
Example #2
import numpy as np
from tslearn.shapelets import ShapeletModel


def test_shapelets():
    n, sz, d = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    y = rng.randint(2, size=n)
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer="sgd",
                        random_state=0)
    clf.fit(time_series, y)
    np.testing.assert_allclose(clf.shapelets_[0],
                               np.array([[0.56373, 0.494684],
                                         [1.235707, 1.119235]]),
                               atol=1e-2)
    np.testing.assert_allclose(
        clf.predict(time_series),
        np.array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0]))

    from sklearn.model_selection import cross_validate
    cross_validate(clf, time_series, y, cv=2)
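
Since ShapeletModel follows the scikit-learn estimator API, cross_validate works
on it directly and returns a dict of per-fold arrays:

cv_results = cross_validate(clf, time_series, y, cv=2)
print(cv_results["test_score"])  # one score per fold
print(cv_results["fit_time"])    # seconds spent fitting each fold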
Example #3
# Imports needed by this fragment; n_ts, ts_sz, n_classes and the train/test
# splits are assumed to be defined earlier in the original script
import matplotlib.pyplot as plt
from keras.optimizers import Adagrad
from sklearn.metrics import accuracy_score
from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict
from tslearn.utils import ts_size

# Set the number of shapelets per size as done in the original paper
shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=n_ts,
                                                       ts_sz=ts_sz,
                                                       n_classes=n_classes,
                                                       l=0.1,
                                                       r=2)

# Define the model using parameters provided by the authors (except that we use
# fewer iterations here)
shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        optimizer=Adagrad(lr=.1),
                        weight_regularizer=.01,
                        max_iter=200,
                        verbose=0)
shp_clf.fit(X_train, y_train)
predicted_labels = shp_clf.predict(X_test)
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

plt.figure()
for i, sz in enumerate(shapelet_sizes.keys()):
    plt.subplot(len(shapelet_sizes), 1, i + 1)
    plt.title("%d shapelets of size %d" % (shapelet_sizes[sz], sz))
    for shp in shp_clf.shapelets_:
        if ts_size(shp) == sz:
            plt.plot(shp.ravel())
    plt.xlim([0, max(shapelet_sizes.keys()) - 1])

plt.tight_layout()
plt.show()

# The loss history is accessible via the `model` attribute, which is a keras model.
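# A hedged sketch of plotting that loss history, assuming the layout the
# comment above describes (keras records per-epoch losses in
# model.history.history["loss"]); this is not part of the original fragment:
import numpy as np
plt.figure()
loss_history = shp_clf.model.history.history["loss"]
plt.plot(np.arange(1, len(loss_history) + 1), loss_history)
plt.title("Evolution of loss during training")
plt.xlabel("Epochs")
plt.show()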
Example #4
def classify_with_shapelets():
    import operator
    import os
    import sys

    import numpy as np
    from keras.optimizers import Adagrad
    from sklearn.metrics import balanced_accuracy_score
    from tqdm import tqdm
    from tslearn.datasets import CachedDatasets
    from tslearn.preprocessing import TimeSeriesScalerMinMax
    from tslearn.shapelets import (ShapeletModel,
                                   grabocka_params_to_shapelet_size_dict)

    feat_path = sys.argv[1]
    label_type = sys.argv[2]
    results_base_path = sys.argv[3]
    data_path = sys.argv[4]

    pitch_path = os.path.join(data_path, 'pitch.txt')
    energy_path = os.path.join(data_path, 'energy.txt')

    # Raw pitch and energy (read_pitch_energy is defined elsewhere in the
    # original module)
    raw_pitch, raw_energy = read_pitch_energy(pitch_path, energy_path)

    # Tunable Parameters = shapelet length, threshold, shapelet redundancy value
    # sweep shapelet length
    pitch_shapelet = {}
    energy_shapelet = {}
    for shapelet_len in [10, 25, 50]:
        for spkr in raw_pitch:
            # Compute shapelets from raw frames, i.e. with no segmentation info
            # such as phone/word boundaries (compute_shapelet_frame is defined
            # elsewhere in the original module)
            pitch_shapelet[spkr] = compute_shapelet_frame(
                raw_pitch[spkr], shapelet_len, True)
            # energy_shapelet[spkr] = compute_shapelet_frame(raw_energy[spkr], shapelet_len)

            # pitch_shapelet[spkr] = np.array(raw_pitch[spkr])
            # print(len(raw_pitch[spkr]))
            # exit()

        acc = []
        for sim in range(10):
            y_true = []
            y_pred = []
            # late_balanced (a speaker -> label mapping) is defined elsewhere
            # in the original module
            for spkr in tqdm(late_balanced.keys()):
                test_spkr = [spkr]
                # copy the keys into a list: dict views have no .remove()
                train_spkrs = list(late_balanced.keys())
                train_spkrs.remove(test_spkr[0])

                X_train = np.array([
                    np.array(shapelet).reshape(shapelet_len, 1)
                    for x in train_spkrs for shapelet in pitch_shapelet[x]
                ])
                y_train = np.array([
                    late_balanced[x] for x in train_spkrs
                    for shapelet in pitch_shapelet[x]
                ])

                X_test = np.array([
                    np.array(shapelet).reshape(shapelet_len, 1)
                    for x in test_spkr for shapelet in pitch_shapelet[x]
                ])
                y_test = np.array([
                    late_balanced[x] for x in test_spkr
                    for shapelet in pitch_shapelet[x]
                ])

                # print('train data', X_train.shape)
                # #print('train data first', X_train[0])
                # print('train label', y_train.shape)
                # exit()

                shapelet_sizes = grabocka_params_to_shapelet_size_dict(
                    n_ts=X_train.shape[0],
                    ts_sz=X_train.shape[1],
                    n_classes=len(set(y_train)),
                    l=0.1,
                    r=2)

                shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                                        optimizer=Adagrad(lr=.1),
                                        weight_regularizer=.01,
                                        max_iter=50,
                                        verbose_level=0)
                shp_clf.fit(X_train, y_train)

                predicted_locations = shp_clf.locate(X_test)

                print('predicted_locations.shape', predicted_locations.shape)
                # test_ts_id = 0
                # plt.figure()
                # plt.title("Example locations of shapelet matches (%d shapelets extracted)" % sum(shapelet_sizes.values()))
                # plt.plot(X_test[test_ts_id].ravel())
                # for idx_shp, shp in enumerate(shp_clf.shapelets_):
                #     t0 = predicted_locations[test_ts_id, idx_shp]
                #     plt.plot(np.arange(t0, t0 + len(shp)), shp, linewidth=2)

                # plt.tight_layout()
                # plt.savefig(test_ts_id+'_test.png', format='png')
                # exit()

                prediction = shp_clf.predict(X_test)
                prediction_prob = shp_clf.predict_proba(X_test)

                y_pred += prediction.tolist()
                y_true += y_test.tolist()

            # After the leave-one-out (LOO) loop
            # test_ts_id = 0
            # plt.figure()
            # plt.title("Example locations of shapelet matches (%d shapelets extracted)" % sum(shapelet_sizes.values()))
            # plt.plot(X_test[test_ts_id].ravel())
            # for idx_shp, shp in enumerate(shp_clf.shapelets_):
            #     t0 = predicted_locations[test_ts_id, idx_shp]
            #     plt.plot(np.arange(t0, t0 + len(shp)), shp, linewidth=2)

            # plt.tight_layout()
            # plt.savefig('test.png', format='png')

            local_acc = balanced_accuracy_score(y_true, y_pred)
            acc.append(local_acc)
        # print('acc', acc)
        # print('final acc', np.mean(acc))
        # print('final acc std', np.std(acc))

        if not os.path.exists(os.path.join(results_base_path, 'regression')):
            os.makedirs(os.path.join(results_base_path, 'regression'))
        results_file = os.path.join(results_base_path, 'regression',
                                    'shapelet_' + str(shapelet_len) + '.txt')

        # acc_baseline and tot_importance are computed elsewhere in the
        # original module and are assumed to be in scope here
        with open(results_file, 'w') as w:
            # w.write("Confusion Matrix\n")
            # w.write('{}\n\n'.format(confusion_matrix(y_true, y_pred)))

            w.write('regression: {} ({})\n'.format(np.mean(acc),
                                                   np.std(acc)))
            w.write('baseline: {} ({})'.format(np.mean(acc_baseline),
                                               np.std(acc_baseline)))
            w.write("\nFeature Importance\n")
            for i in tot_importance:
                tot_importance[i] = np.mean(tot_importance[i])
            for i in sorted(tot_importance.items(),
                            key=operator.itemgetter(1),
                            reverse=True):
                w.write("{} = {}\n".format(i[0], i[1]))
Example #5
import matplotlib.pyplot as plt
import numpy as np
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from tslearn.preprocessing import TimeSeriesScalerMinMax
# note: this wrapper calls clear_session / generate_model / fit_transfer_model
# on the classifier, so it targets an extended variant of tslearn's ShapeletModel
from tslearn.shapelets import ShapeletModel
from tslearn.utils import ts_size


class Shapelets:
    def __init__(self,
                 epochs,
                 length,
                 num_shapelet_lengths,
                 num_shapelets,
                 learning_rate,
                 weight_regularizer,
                 batch_size=256,
                 optimizer=Adam):
        '''
            initialize shapelet hyperparameters

            hyperparameters:
                epochs                : number of training epochs
                length                : base shapelet length, expressed as a fraction
                                        of the time-series length
                num_shapelet_lengths  : number of different shapelet lengths
                num_shapelets         : number of unique shapelets to learn at each
                                        shapelet length, expressed as a fraction of
                                        the time-series length
                learning_rate         : learning rate of the Keras optimizer
                weight_regularizer    : weight regularization used when fitting the model
                batch_size            : training batch size
                optimizer             : Keras optimizer class, instantiated with learning_rate
        '''
        self.epochs = epochs
        self.length = length
        self.num_shapelet_lengths = num_shapelet_lengths
        self.num_shapelets = num_shapelets
        self.weight_regularizer = weight_regularizer
        self.batch_size = batch_size
        self.optimizer = optimizer(lr=learning_rate)
        self.shapelet_sizes = None
        self.shapelet_clf = None
        self.encoder = LabelEncoder()

    def clear_session(self):
        if self.shapelet_clf is None:
            raise ValueError(
                "Cannot clear session that has not been initialized")
        self.shapelet_clf.clear_session()

    def load_model(self, series_length, labels, checkpoint):
        '''
            Load model from checkpoint into Shapelet classifier
        '''
        if self.shapelet_clf is None:
            base_size = int(self.length * series_length)
            self.shapelet_sizes = {}
            for sz_idx in range(self.num_shapelet_lengths):
                shp_sz = base_size * (sz_idx + 1)
                self.shapelet_sizes[shp_sz] = int(self.num_shapelets *
                                                  series_length)
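            # For concreteness (hypothetical values): with length=0.1,
            # num_shapelet_lengths=2, num_shapelets=0.05 and series_length=100,
            # base_size is 10 and this loop yields shapelet_sizes == {10: 5, 20: 5}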
            self.shapelet_clf = ShapeletModel(
                n_shapelets_per_size=self.shapelet_sizes,
                optimizer=self.optimizer,
                weight_regularizer=self.weight_regularizer,
                max_iter=self.epochs,
                batch_size=self.batch_size)

        # first generate new model into which to load the weights
        self.encode(labels)
        self.shapelet_clf.generate_model(series_length,
                                         len(self.get_classes()))

        # load weights
        self.shapelet_clf.model.load_weights(checkpoint)

    def fit_transfer_model(self,
                           X_train,
                           y_train,
                           checkpoint,
                           nclasses_prior=2,
                           source_dir=None,
                           val_data=None):
        # encode training and validation labels (val_data is required here
        # despite its None default, since it is indexed below)
        y_train = self.encode(y_train)
        y_val = self.encode(val_data[1])

        # scale training and validation data to between 0 and 1
        X_train_scaled = self.__ScaleData(X_train)
        X_val_scaled = self.__ScaleData(val_data[0])

        if self.shapelet_clf is None:
            base_size = int(self.length * X_train.shape[1])
            self.shapelet_sizes = {}
            for sz_idx in range(self.num_shapelet_lengths):
                shp_sz = base_size * (sz_idx + 1)
                self.shapelet_sizes[shp_sz] = int(self.num_shapelets *
                                                  X_train.shape[1])
            self.shapelet_clf = ShapeletModel(
                n_shapelets_per_size=self.shapelet_sizes,
                optimizer=self.optimizer,
                weight_regularizer=self.weight_regularizer,
                max_iter=self.epochs,
                batch_size=self.batch_size)

        # fit shapelet classifier
        self.shapelet_clf.fit_transfer_model(X_train_scaled, y_train,
                                             nclasses_prior, checkpoint,
                                             source_dir, (X_val_scaled, y_val))

    def fit(self, X_train, y_train, source_dir=None, val_data=None):
        '''
            fit shapelet classifier on training data

            parameters:
                X_train                : training time series
                y_train                : training labels
                source_dir             : optional directory forwarded to the
                                         underlying classifier's fit method
                val_data               : (X_val, y_val) tuple; required despite
                                         the None default, since it is indexed below
        '''
        if self.shapelet_clf is None:
            base_size = int(self.length * X_train.shape[1])
            self.shapelet_sizes = {}
            for sz_idx in range(self.num_shapelet_lengths):
                shp_sz = base_size * (sz_idx + 1)
                self.shapelet_sizes[shp_sz] = int(self.num_shapelets *
                                                  X_train.shape[1])
            self.shapelet_clf = ShapeletModel(
                n_shapelets_per_size=self.shapelet_sizes,
                optimizer=self.optimizer,
                weight_regularizer=self.weight_regularizer,
                max_iter=self.epochs,
                batch_size=self.batch_size)

        # encode training and validation labels
        y_train = self.encode(y_train)
        y_val = self.encode(val_data[1])

        # scale training and validation data to between 0 and 1
        X_train_scaled = self.__ScaleData(X_train)
        X_val_scaled = self.__ScaleData(val_data[0])

        # fit classifier
        self.shapelet_clf.fit(X_train_scaled, y_train, source_dir,
                              (X_val_scaled, y_val))

    def __ScaleData(self, input_data):
        '''
            scale each input series independently to the range [0, 1]
            (TimeSeriesScalerMinMax operates per series, not with statistics
            shared across the data set)

            parameters:
                input_data        : input data to rescale
        '''

        return TimeSeriesScalerMinMax().fit_transform(input_data)

    def predict(self, X_test):
        '''
            classifications for time series in test data set

            parameters:
                X_test:     test time series on which to predict classes

            returns: classifications for test data set
        '''
        X_test_scaled = self.__ScaleData(X_test)
        return self.shapelet_clf.predict(X_test_scaled)

    def predict_proba(self, X_test):
        '''
            class probabilities for time series in test data set

            parameters:
                X_test:     test time series on which to predict classes

            returns: class probabilities for test data set
        '''
        X_test_scaled = self.__ScaleData(X_test)
        return self.shapelet_clf.predict_proba(X_test_scaled)

    def encode(self, categories):
        '''
            fit label encoder on input categories. returns transformed categories
        '''
        self.encoder.fit(categories)
        return self.encoder.transform(categories)

    def decode(self, y_probs, p_threshold):
        '''
            decode prediction probabilities y_probs into predictions and
            confidences, given p_threshold
        '''
        prob_max = np.amax(y_probs, axis=1)
        prediction_indices = prob_max > p_threshold
        y_pred = np.zeros(y_probs.shape[0])

        # reinterpret confidence in the binary case
        if y_probs.shape[1] == 1:
            y_pred[prediction_indices] = 1
            confidence = (prob_max - p_threshold) / (y_pred - p_threshold)
            confidence = 0.5 + confidence / 2
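            # numeric check (hypothetical numbers): with p_threshold = 0.6,
            #   prob_max = 0.9, y_pred = 1 -> (0.9 - 0.6) / (1 - 0.6) = 0.75 -> 0.875
            #   prob_max = 0.3, y_pred = 0 -> (0.3 - 0.6) / (0 - 0.6) = 0.50 -> 0.750
            # so confidence grows as prob_max moves away from the threshold
            # on either side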
        else:
            y_pred[prediction_indices] = np.argmax(y_probs,
                                                   axis=1)[prediction_indices]
            confidence = prob_max
        y_pred = y_pred.astype(int)
        y_preds = self.encoder.inverse_transform(y_pred)

        return y_preds, confidence

    def get_classes(self):
        '''
            get original classes from encoder
        '''
        # classes_ only exists once the encoder has been fit
        if not hasattr(self.encoder, 'classes_'):
            raise ValueError("Encoder has not been fit")
        return self.encoder.classes_

    def VisualizeShapelets(self):
        '''
            visualize all of the shapelets learned by the shapelet classifier
        '''
        plt.figure()
        for i, sz in enumerate(self.shapelet_sizes.keys()):
            plt.subplot(len(self.shapelet_sizes), 1, i + 1)
            plt.title("%d shapelets of size %d" %
                      (self.shapelet_sizes[sz], sz))
            for shapelet in self.shapelet_clf.shapelets_:
                if ts_size(shapelet) == sz:
                    plt.plot(shapelet.ravel())
            plt.xlim([0, max(self.shapelet_sizes.keys())])
        plt.tight_layout()
        plt.show()

    def VisualizeShapeletLocations(self,
                                   series_values,
                                   series_id,
                                   save_dir='visualizations',
                                   name='shp_1'):
        '''
            visualize shapelets superimposed on one of the test series

            parameters:
                series_values:      raw values on which to visualize shapelets
                series_id:          id of the time series to visualize
                save_dir:           directory in which to save visualizations
                name:               name under which to save the visualization
                                    (should be unique per call)
        '''

        plt.style.use("seaborn-whitegrid")

        # NK brand colors
        COLORS = [
            "#FA5655", "#F79690", "#B9BC2D", "#86B6B2", "#955B99", "#252B7A"
        ]
        # others? "#8D6B2C",
        # "#D0A826",
        # "#FEDB03",
        # "#000000",
        # "#454545",
        # "#FFFFFF",
        # "#F8F6F1"]
        n_rows, n_cols, _ = series_values.shape
        test_series = series_values[series_id].reshape(-1, )

        closest_inds = self.shapelet_clf.locate(test_series.reshape(1, -1,
                                                                    1))[0]
        closest_dists = []
        for ind, shp in zip(closest_inds, self.shapelet_clf.shapelets_):
            closest_dists.append(
                np.linalg.norm(test_series[ind:ind + shp.shape[0]] - shp))
        closest_dists = np.array(closest_dists)

        # convert distance to weight where dist=0 -> wt=1 and dist=inf -> wt=0
        sl_weights = 1 / (1 + closest_dists)
        # plot the signal with matching shapelets color overlayed
        plt.clf()
        plt.plot(range(n_cols), test_series, color="k")
        for ind, sl, wt, color in zip(closest_inds,
                                      self.shapelet_clf.shapelets_, sl_weights,
                                      COLORS):
            # find closest match
            t = range(ind, ind + sl.shape[0])
            match = test_series[ind:ind + sl.shape[0]]
            # plot shapelet on top of the signal, with width and alpha scaled
            # by the shapelet weight (alpha clipped to the valid [0, 1] range)
            plt.plot(t, match, alpha=min(1.0, 7 * wt), linewidth=35 * wt,
                     color=color)
        plt.ylabel('Email Density')
        plt.xlabel('Minute of the Hour')
        plt.show()
        #plt.savefig(save_dir + "/{}_signal_size_{}_id_{}.png".format(name, n_cols, series_id))

        # plot shapelets
        plt.clf()
        # to plot the shapelets, switch to dark background
        plt.style.use("seaborn-darkgrid")
        # ax = plt.axes()  # used below for sharex, sharey (if needed?)

        # arrange shapelets in a grid: find the greatest proper factor of the
        # number of shapelets
        gf = 0
        shp_t = self.shapelet_clf.shapelets_as_time_series_
        shp = self.shapelet_clf.shapelets_
        for i in range(1, shp.shape[0]):
            if shp.shape[0] % i == 0:
                gf = i
        of = int(shp.shape[0] / gf)
        for i in range(shp_t.shape[0]):
            ax_i = plt.subplot(gf, of, i + 1)
            # we could force them to share the same axes
            # ax_i = plt.subplot(n_rows, n_cols, i + 1, sharex=ax, sharey=ax)
            #ax_i.set_xticklabels([])
            ax_i.set_yticklabels([])
            # plot against each shapelet's own length: shapelets can be
            # shorter than the padded length shp_t.shape[1]
            plt.plot(shp[i].reshape(-1),
                     color=COLORS[i % len(COLORS)],
                     linewidth=3)
            plt.xlabel('Shapelet Length')
        plt.show()
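
A hedged end-to-end usage sketch for this wrapper (all names and values below
are hypothetical; note that fit() requires val_data despite its None default):

shp = Shapelets(epochs=100, length=0.1, num_shapelet_lengths=2,
                num_shapelets=0.05, learning_rate=0.01,
                weight_regularizer=0.01)
shp.fit(X_train, y_train, val_data=(X_val, y_val))  # X_*/y_* assumed given
print(shp.predict(X_test))
shp.VisualizeShapeletLocations(X_test, series_id=0)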