import numpy as np
import pandas as pd
from keras.optimizers import Adagrad
from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict

import uShapeletClustering  # local module used by the second variant below


def cluster(timeseries_df, data_labels):
    Shapelet_list = []
    D = '/home/abhilash/Datasets/UCRArchive_2018/TwoLeadECG/TXT_Files/'
    for i in range(1, max(data_labels) + 1):
        print('Class', i)
        ts_df = timeseries_df[timeseries_df['0'] == i]
        ts_df = ts_df.reset_index(drop=True)
        labels = ts_df['0']
        ts_df = ts_df.drop(ts_df.columns[0], axis=1)
        # Predicted labels for this class, loaded from disk
        S = 'class' + str(i) + 'labels.txt'
        pred_label = pd.read_csv(D + S, header=None)
        pred_label = np.ravel(np.array(pred_label))

        shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=ts_df.shape[0],
                                                               ts_sz=ts_df.shape[1],
                                                               n_classes=2,
                                                               l=0.36,
                                                               r=1)
        shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                                optimizer=Adagrad(lr=.1),
                                weight_regularizer=.01,
                                max_iter=50,
                                verbose=0)
        shp_clf.fit(ts_df, pred_label)
        shapelets = shp_clf.shapelets_
        temp_list = []
        for j in range(shapelets.shape[0]):  # j: avoid shadowing the class index i
            temp = shapelets[j].T
            temp_list.append(temp[0])
        Shapelet_list.append(temp_list)
    return Shapelet_list
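A minimal usage sketch for the function above (hypothetical: assumes a headerless, tab-separated UCR split whose first column holds integer class labels, with columns renamed to strings so the label column is named '0'):

df = pd.read_csv('TwoLeadECG_TRAIN.txt', sep='\t', header=None)  # hypothetical path
df.columns = df.columns.astype(str)  # rename columns so the label column is '0'
df['0'] = df['0'].astype(int)        # class labels 1..n_classes
shapelets_per_class = cluster(df, df['0'])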
# A second variant of cluster(): pseudo-labels each class via u-shapelet k-means
def cluster(shape, timeseries_df, data_labels, k):
    Shapelet_list = []
    for i in range(1, max(data_labels) + 1):
        ts_df = timeseries_df[timeseries_df['0'] == i]
        ts_df = ts_df.reset_index(drop=True)
        # cluster_list.append(extractU_Shapelets(shape))
        labels = ts_df['0']
        ts_df = ts_df.drop(ts_df.columns[0], axis=1)
        S = uShapeletClustering.extractU_Shapelets.extract_Shapelets(ts_df.copy(), shape, k)
        S = np.array(list(S))
        pred_label = uShapeletClustering.Kmeans.Kmeans(ts_df.copy(), S, k, labels)
        shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=ts_df.shape[0],
                                                               ts_sz=ts_df.shape[1],
                                                               n_classes=2,
                                                               l=0.5,
                                                               r=1)
        shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                                optimizer=Adagrad(lr=.1),
                                weight_regularizer=.01,
                                max_iter=50,
                                verbose=0)
        shp_clf.fit(ts_df, pred_label)
        shapelets = shp_clf.shapelets_
        temp_list = []
        for j in range(shapelets.shape[0]):  # j: avoid shadowing the class index i
            temp = shapelets[j].T
            temp_list.append(temp[0])
        Shapelet_list.append(temp_list)
    return Shapelet_list
Example #3
    def fit(self, x, y=None):
        clf = ShapeletModel(
            n_shapelets_per_size={self.input_span: self.state_num},
            weight_regularizer=.01,
            verbose_level=0)
        clf.fit(x, y)
        joblib.dump(clf, self.modelpath / 'states.m')
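The persisted model can later be restored with joblib; a minimal sketch, assuming the same modelpath attribute:

    def restore(self):
        # Reload the ShapeletModel persisted by fit() above
        return joblib.load(self.modelpath / 'states.m')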
Example #4
def test_shapelets():
    pytest.importorskip('keras')
    from sklearn.model_selection import cross_validate
    from tslearn.shapelets import ShapeletModel
    from tslearn.utils import to_time_series

    n, sz, d = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    y = rng.randint(2, size=n)
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer="sgd",
                        random_state=0)
    clf.fit(time_series, y)
    np.testing.assert_allclose(clf.shapelets_[0],
                               np.array([[0.56373, 0.494684],
                                         [1.235707, 1.119235]]),
                               atol=1e-2)
    np.testing.assert_allclose(
        clf.predict(time_series),
        np.array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0]))

    cross_validate(clf, time_series, y, cv=2)

    model = ShapeletModel(n_shapelets_per_size={3: 2, 4: 1}, max_iter=1)
    model.fit(time_series, y)
    for shp, shp_bis in zip(model.shapelets_, model.shapelets_as_time_series_):
        np.testing.assert_allclose(shp,
                                   to_time_series(shp_bis, remove_nans=True))
Example #5
def fit_lts(X_train, y_train, X_test, y_test, shap_dict, reg, max_it, shap_out_path, pred_out_path, timing_out_path):
    # Fit an LTS model, print metrics on the test set, and write out predictions and shapelets
    clf = ShapeletModel(n_shapelets_per_size=shap_dict, 
                        max_iter=max_it, verbose_level=0, batch_size=1,
                        optimizer='sgd', weight_regularizer=reg)

    start = time.time()
    clf.fit(
        np.reshape(
            X_train, 
            (X_train.shape[0], X_train.shape[1], 1)
        ), 
        y_train
    )
    learning_time = time.time() - start

    with open(shap_out_path, 'w+') as ofp:
        for shap in clf.shapelets_:
            ofp.write(str(np.reshape(shap, (-1))) + '\n')

    with open(timing_out_path, 'w+') as ofp:
        ofp.write(str(learning_time))

    X_distances_train = clf.transform(X_train)
    X_distances_test = clf.transform(X_test)

    fit_lr(X_distances_train, y_train, X_distances_test, y_test, pred_out_path)
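fit_lr is not shown in this example; a minimal sketch consistent with how fit_lts (and lts_discovery below) calls it, assuming scikit-learn:

def fit_lr(X_distances_train, y_train, X_distances_test, y_test, pred_out_path):
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    # Logistic regression on the shapelet-distance features
    lr = LogisticRegression()
    lr.fit(X_distances_train, y_train)
    preds = lr.predict(X_distances_test)
    print('Test accuracy: {}'.format(accuracy_score(y_test, preds)))
    # Write one prediction per line
    with open(pred_out_path, 'w+') as ofp:
        for p in preds:
            ofp.write(str(p) + '\n')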
Example #6
def test_shapelet_lengths():
    pytest.importorskip('tensorflow')
    from tslearn.shapelets import ShapeletModel

    # Test variable-length
    y = [0, 1]
    time_series = to_time_series_dataset([[1, 2, 3, 4, 5], [3, 2, 1]])
    clf = ShapeletModel(n_shapelets_per_size={3: 1},
                        max_iter=1,
                        verbose=0,
                        random_state=0)
    clf.fit(time_series, y)

    weights_shapelet = [np.array([[1, 2, 3]])]
    clf.set_weights(weights_shapelet, layer_name="shapelets_0_0")
    tr = clf.transform(time_series)
    np.testing.assert_allclose(tr,
                               np.array([[0.], [8. / 3]]))

    # Test max_size to predict longer series than those passed at fit time
    y = [0, 1]
    time_series = to_time_series_dataset([[1, 2, 3, 4, 5], [3, 2, 1]])
    clf = ShapeletModel(n_shapelets_per_size={3: 1},
                        max_iter=1,
                        verbose=0,
                        max_size=6,
                        random_state=0)
    clf.fit(time_series[:, :-1], y)  # Fit with size 4
    weights_shapelet = [np.array([[1, 2, 3]])]
    clf.set_weights(weights_shapelet, layer_name="shapelets_0_0")
    tr = clf.transform(time_series)
    np.testing.assert_allclose(tr,
                               np.array([[0.], [8. / 3]]))
Example #7
def lts_discovery(X_train, y_train, X_test, y_test,  nr_shap, l, r, reg, max_it, shap_out_path, pred_out_path, timing_out_path):
    # Fit an LTS model, print metrics on the test set, and write out predictions and shapelets
    shapelet_dict = grabocka_params_to_shapelet_size_dict(
            X_train.shape[0], X_train.shape[1], int(nr_shap*X_train.shape[1]), l, r
    )
    
    clf = ShapeletModel(n_shapelets_per_size=shapelet_dict, 
                        max_iter=max_it, verbose_level=0, batch_size=1,
                        optimizer='sgd', weight_regularizer=reg)

    start = time.time()
    clf.fit(
        np.reshape(
            X_train, 
            (X_train.shape[0], X_train.shape[1], 1)
        ), 
        y_train
    )
    learning_time = time.time() - start

    print('Learning shapelets took {}s'.format(learning_time))

    with open(shap_out_path, 'w+') as ofp:
        for shap in clf.shapelets_:
            ofp.write(str(np.reshape(shap, (-1))) + '\n')

    with open(timing_out_path, 'w+') as ofp:
        ofp.write(str(learning_time))

    X_distances_train = clf.transform(X_train)
    X_distances_test = clf.transform(X_test)

    fit_lr(X_distances_train, y_train, X_distances_test, y_test, pred_out_path)
Example #8
def learningShapeletClassifier(X_train, Y_train):

    shapelet_sizes = grabocka_params_to_shapelet_size_dict(
        n_ts=X_train.shape[0],
        ts_sz=X_train.shape[1],
        n_classes=len(set(Y_train)),
        l=0.1,
        r=2)
    shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                            optimizer=Adagrad(lr=.1),
                            weight_regularizer=.01,
                            max_iter=200,
                            verbose_level=0)

    shp_clf.fit(X_train, Y_train)
    return shp_clf
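A usage sketch for the helper above, with tslearn's bundled CachedDatasets (dataset choice and scaling are illustrative):

from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMinMax

X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
scaler = TimeSeriesScalerMinMax()
shp_clf = learningShapeletClassifier(scaler.fit_transform(X_train), y_train)
print(shp_clf.predict(scaler.fit_transform(X_test)))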
Example #9
def test_serialize_shapelets():
    def get_model_weights(model):
        return model.model_.get_weights()

    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    X = rng.randn(n, sz, d)

    for y in [rng.randint(low=0, high=3, size=n),
              rng.choice(["one", "two", "three"], size=n)]:

        shp = ShapeletModel(max_iter=1)
        _check_not_fitted(shp)
        shp.fit(X, y)
        _check_params_predict(shp, X, ['predict'],
                              check_params_fun=get_model_weights,
                              formats=["json", "pickle"])
Example #10
def test_shapelets():
    n, sz, d = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    y = rng.randint(2, size=n)
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer="sgd",
                        random_state=0)
    clf.fit(time_series, y)
    np.testing.assert_allclose(clf.shapelets_[0],
                               np.array([[0.56373, 0.494684],
                                         [1.235707, 1.119235]]),
                               atol=1e-2)
    np.testing.assert_allclose(
        clf.predict(time_series),
        np.array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0]))

    from sklearn.model_selection import cross_validate
    cross_validate(clf, time_series, y, cv=2)
Example #11
class ShapletStateRecognition(BaseMLModelTemplate):

    def build_model(self, **kwargs):
        self.his_len = kwargs['his_len']
        self.segment_dim = kwargs['segment_dim']

        self.model_obj = ShapeletModel(
            n_shapelets_per_size={self.segment_dim: self.param.n_state},
            weight_regularizer=.01,
            verbose_level=0)

    def fit(self, x, y=None):
        self.model_obj.fit(x, y)
        self.store(self.param.model_save_path)

    def predict(self, x):
        self.restore(self.param.model_save_path)

        shaplets = []
        for s in self.model_obj.shapelets_:
            shaplets.append(s)
        shaplets = np.reshape(shaplets, [self.param.n_state, self.segment_dim])
        print('shaplets:', shaplets.shape)
        state_pattern = shaplets

        tmpdata = np.reshape(x, [-1, self.his_len, self.segment_dim])
        state_proba = np.zeros([x.shape[0], self.his_len, self.param.n_state], dtype=float)
        for i in range(x.shape[0]):
            for j in range(self.his_len):
                for k in range(self.param.n_state):
                    # Euclidean distance between segment (i, j) and shapelet k
                    state_proba[i, j, k] = np.sqrt(np.sum((tmpdata[i, j] - shaplets[k]) ** 2))
                # Min-max normalize the distances across states
                state_proba[i, j] = ((state_proba[i, j] - min(state_proba[i, j])) /
                                     (max(state_proba[i, j]) - min(state_proba[i, j])))
        return np.reshape(state_proba, [-1, self.his_len, self.param.n_state]).astype(np.float32), np.array(state_pattern, dtype=np.float32)

    def store(self, path, **kwargs):
        save_model_name = "shaplet_{}_{}.state_model".format(self.param.data_name, self.param.n_state)
        joblib.dump(self.model_obj, os.path.join(path, save_model_name))

    def restore(self, path, **kwargs):
        save_model_name = "shaplet_{}_{}.state_model".format(self.param.data_name, self.param.n_state)
        self.model_obj = joblib.load(os.path.join(path, save_model_name))
Example #12
def test_shapelets():
    pytest.importorskip('tensorflow')
    from tslearn.shapelets import ShapeletModel
    import tensorflow as tf

    n, sz, d = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    y = rng.randint(2, size=n)
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer="sgd",
                        random_state=0)

    cross_validate(clf, time_series, y, cv=2)

    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer=tf.optimizers.Adam(.1),
                        random_state=0)
    cross_validate(clf, time_series, y, cv=2)

    model = ShapeletModel(n_shapelets_per_size={3: 2, 4: 1}, max_iter=1)
    model.fit(time_series, y)
    for shp, shp_bis in zip(model.shapelets_,
                            model.shapelets_as_time_series_):
        np.testing.assert_allclose(shp,
                                   to_time_series(shp_bis, remove_nans=True))

    # Test set_weights / get_weights
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        random_state=0)
    clf.fit(time_series, y)
    preds_before = clf.predict_proba(time_series)
    weights = clf.get_weights()
    # Change number of iterations, then refit, then set weights
    clf.max_iter *= 2
    clf.fit(time_series, y)
    clf.set_weights(weights)
    np.testing.assert_allclose(preds_before,
                               clf.predict_proba(time_series))
Example #13
# Set the number of shapelets per size as done in the original paper
shapelet_sizes = grabocka_params_to_shapelet_size_dict(n_ts=n_ts,
                                                       ts_sz=ts_sz,
                                                       n_classes=n_classes,
                                                       l=0.1,
                                                       r=2)

# Define the model using parameters provided by the authors (except that we use
# fewer iterations here)
shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        optimizer=Adagrad(lr=.1),
                        weight_regularizer=.01,
                        max_iter=200,
                        verbose=0)
shp_clf.fit(X_train, y_train)
predicted_labels = shp_clf.predict(X_test)
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

plt.figure()
for i, sz in enumerate(shapelet_sizes.keys()):
    plt.subplot(len(shapelet_sizes), 1, i + 1)
    plt.title("%d shapelets of size %d" % (shapelet_sizes[sz], sz))
    for shp in shp_clf.shapelets_:
        if ts_size(shp) == sz:
            plt.plot(shp.ravel())
    plt.xlim([0, max(shapelet_sizes.keys()) - 1])

plt.tight_layout()
plt.show()
Example #14
# Reconstructed opening (assumption): gendis' GeneticExtractor
ge = GeneticExtractor(population_size=100,
                      iterations=1000,
                      wait=100)
ge.fit(X, y)
gen_shapelet = ge.shapelets[0]

bfe = BruteForceExtractor()
bf_shapelet = bfe.extract(X, y)[0]

clf = ShapeletModel(n_shapelets_per_size={len(ts1): 1},
                    max_iter=5000,
                    verbose_level=0,
                    batch_size=1,
                    optimizer='sgd',
                    weight_regularizer=0)
clf.fit(np.reshape(X, (X.shape[0], X.shape[1], 1)), y)
lts_shapelet = clf.shapelets_[0]

# Plot the shapelets and orderline
fig, ax = plt.subplots(3, 3, sharey=True)

ax[0][0].axis('off')
ax[0][0].annotate('Brute Force', (0, 0.5), fontsize=24, va='center', ha='left')

ax[0][1].axis('off')
ax[0][1].plot(range(len(bf_shapelet)), bf_shapelet, c=cmap(0.))

# TODO: if dist_tsx too close to other dist_tsy, then change y-coordinate so that points do not overlap

dist_ts1 = util.sdist_no_norm(bf_shapelet, ts1)
dist_ts2 = util.sdist_no_norm(bf_shapelet, ts2)
Example #15
def classify_with_shapelets():
    from keras.optimizers import Adagrad
    from tslearn.datasets import CachedDatasets
    from tslearn.preprocessing import TimeSeriesScalerMinMax
    from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict

    feat_path = sys.argv[1]
    label_type = sys.argv[2]
    results_base_path = sys.argv[3]
    data_path = sys.argv[4]

    pitch_path = os.path.join(data_path, 'pitch.txt')
    energy_path = os.path.join(data_path, 'energy.txt')

    #Raw pitch and energy
    raw_pitch, raw_energy = read_pitch_energy(pitch_path, energy_path)

    # Tunable Parameters = shapelet length, threshold, shapelet redundancy value
    # sweep shapelet length
    pitch_shapelet = {}
    energy_shapelet = {}
    for shapelet_len in [10, 25, 50]:
        for spkr in raw_pitch:
            # Compute shapelets from raw frames (i.e. no segmented info like where phone/word is)
            pitch_shapelet[spkr] = compute_shapelet_frame(
                raw_pitch[spkr], shapelet_len, True)
            # energy_shapelet[spkr] = compute_shapelet_frame(raw_energy[spkr], shapelet_len)

            # pitch_shapelet[spkr] = np.array(raw_pitch[spkr])
            # print(len(raw_pitch[spkr]))
            # exit()

        acc = []
        for sim in range(10):
            y_true = []
            y_pred = []
            for spkr in tqdm(late_balanced.keys()):
                test_spkr = [spkr]
                train_spkrs = list(late_balanced.keys())  # list() so .remove works in Python 3
                train_spkrs.remove(test_spkr[0])

                X_train = np.array([
                    np.array(shapelet).reshape(shapelet_len, 1)
                    for x in train_spkrs for shapelet in pitch_shapelet[x]
                ])
                y_train = np.array([
                    late_balanced[x] for x in train_spkrs
                    for shapelet in pitch_shapelet[x]
                ])

                X_test = np.array([
                    np.array(shapelet).reshape(shapelet_len, 1)
                    for x in test_spkr for shapelet in pitch_shapelet[x]
                ])
                y_test = np.array([
                    late_balanced[x] for x in test_spkr
                    for shapelet in pitch_shapelet[x]
                ])

                # print('train data', X_train.shape)
                # #print('train data first', X_train[0])
                # print('train label', y_train.shape)
                # exit()

                shapelet_sizes = grabocka_params_to_shapelet_size_dict(
                    n_ts=X_train.shape[0],
                    ts_sz=X_train.shape[1],
                    n_classes=len(set(y_train)),
                    l=0.1,
                    r=2)

                shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                                        optimizer=Adagrad(lr=.1),
                                        weight_regularizer=.01,
                                        max_iter=50,
                                        verbose_level=0)
                shp_clf.fit(X_train, y_train)

                predicted_locations = shp_clf.locate(X_test)

                print('predicted_locations.shape', predicted_locations.shape)
                # test_ts_id = 0
                # plt.figure()
                # plt.title("Example locations of shapelet matches (%d shapelets extracted)" % sum(shapelet_sizes.values()))
                # plt.plot(X_test[test_ts_id].ravel())
                # for idx_shp, shp in enumerate(shp_clf.shapelets_):
                #     t0 = predicted_locations[test_ts_id, idx_shp]
                #     plt.plot(np.arange(t0, t0 + len(shp)), shp, linewidth=2)

                # plt.tight_layout()
                # plt.savefig(test_ts_id+'_test.png', format='png')
                # exit()

                prediction = shp_clf.predict(X_test)
                prediction_prob = shp_clf.predict_proba(X_test)

                y_pred += prediction.tolist()
                y_true += y_test.tolist()

            ###After LOO
            # test_ts_id = 0
            # plt.figure()
            # plt.title("Example locations of shapelet matches (%d shapelets extracted)" % sum(shapelet_sizes.values()))
            # plt.plot(X_test[test_ts_id].ravel())
            # for idx_shp, shp in enumerate(shp_clf.shapelets_):
            #     t0 = predicted_locations[test_ts_id, idx_shp]
            #     plt.plot(np.arange(t0, t0 + len(shp)), shp, linewidth=2)

            # plt.tight_layout()
            # plt.savefig('test.png', format='png')

            local_acc = balanced_accuracy_score(y_true, y_pred)
            acc.append(local_acc)
        # print('acc', acc)
        # print('final acc', np.mean(acc))
        # print('final acc std', np.std(acc))

        if not os.path.exists(os.path.join(results_base_path, 'regression')):
            os.makedirs(os.path.join(results_base_path, 'regression'))
        results_file = os.path.join(results_base_path, 'regression',
                                    'shapelet_' + str(shapelet_len) + '.txt')

        with open(results_file, 'w') as w:
            # w.write("Confusion Matrix\n")
            # # w.write(confusion_matrix(y_true, y_pred).tolist())
            # w.write('{}\n\n'.format(confusion_matrix(y_true, y_pred)))

            w.write('regression: {} ({})\n'.format(np.mean(acc), np.std(acc)))
            # acc_baseline and tot_importance are assumed to be computed elsewhere
            w.write('baseline: {} ({})'.format(np.mean(acc_baseline),
                                               np.std(acc_baseline)))
            w.write("\nFeature Importance\n")
            for i in tot_importance:
                tot_importance[i] = np.mean(tot_importance[i])
            for i in sorted(tot_importance.items(),
                            key=operator.itemgetter(1),
                            reverse=True):
                w.write("{} = {}\n".format(i[0], i[1]))
Example #16
    def fit(self, X, y):
        """Fit the model using X as training data and y as target values
        Parameters
        ----------
        X : {array-like}
            Training data. Shape [n_samples, n_features].
        y : {array-like, sparse matrix}
            Target values of shape = [n_samples] or [n_samples, n_outputs]
        """

        self.X = X
        self.y = y

        n_shapelets_per_size = self.shapelet_model_params.get(
            "n_shapelets_per_size", "heuristic")
        if n_shapelets_per_size == "heuristic":
            n_ts, ts_sz = X.shape[:2]
            n_classes = len(set(y))
            n_shapelets_per_size = grabocka_params_to_shapelet_size_dict(
                n_ts=n_ts,
                ts_sz=ts_sz,
                n_classes=n_classes,
                l=self.shapelet_model_params.get("l", 0.1),
                r=self.shapelet_model_params.get("r", 2))

        shp_clf = ShapeletModel(
            n_shapelets_per_size=n_shapelets_per_size,
            optimizer=self.shapelet_model_params.get("optimizer", "sgd"),
            weight_regularizer=self.shapelet_model_params.get(
                "weight_regularizer", .01),
            max_iter=self.shapelet_model_params.get("max_iter", 100),
            random_state=self.random_state,
            verbose=self.shapelet_model_params.get("verbose", 0))

        shp_clf.fit(X, y)
        X_transformed = shp_clf.transform(X)
        self.X_transformed = X_transformed

        if self.tau is not None:
            self.X_thresholded = 1 * (self.X_transformed < self.tau)
            clf = DecisionTreeClassifier()
            param_grid = self.decision_tree_grid_search_params
            grid = GridSearchCV(clf,
                                param_grid=param_grid,
                                scoring='accuracy',
                                n_jobs=-1,
                                verbose=0)
            grid.fit(self.X_thresholded, y)
        else:
            grids = []
            grids_scores = []
            for quantile in self.tau_quantiles:
                _X_thresholded = 1 * (self.X_transformed < (np.quantile(
                    self.X_transformed, quantile)))
                clf = DecisionTreeClassifier()
                param_grid = self.decision_tree_grid_search_params
                grid = GridSearchCV(clf,
                                    param_grid=param_grid,
                                    scoring='accuracy',
                                    n_jobs=-1,
                                    verbose=0)
                grid.fit(_X_thresholded, y)
                grids.append(grid)
                grids_scores.append(grid.best_score_)
            grid = grids[np.argmax(np.array(grids_scores))]
            best_quantile = self.tau_quantiles[np.argmax(
                np.array(grids_scores))]
            self.tau = np.quantile(self.X_transformed, best_quantile)
            self.X_thresholded = 1 * (self.X_transformed < self.tau)

        clf = DecisionTreeClassifier(**grid.best_params_)
        clf.fit(self.X_thresholded, y)
        if self.prune_duplicate_tree_leaves:
            prune_duplicate_leaves(
                clf)  # FIXME: does it influence the .tree properties?

        self.decision_tree = clf
        self.decision_tree_explorable = NewTree(clf)
        self.decision_tree_explorable.build_tree()
        self._shapelet_model = shp_clf
        self._build_tree_graph()

        return self
Example #17
        L = np.random.choice([0.025, 0.075, 0.125, 0.175, 0.2])
        R = np.random.choice([1, 2, 3])
        _lambda = np.random.choice([0.01, 0.1, 1])
        n_iterations = np.random.choice([2000, 5000, 10000])

        shapelet_dict = grabocka_params_to_shapelet_size_dict(
            X_train.shape[0], X_train.shape[1], int(K * X_train.shape[1]), L,
            R)
        clf = ShapeletModel(n_shapelets_per_size=shapelet_dict,
                            max_iter=n_iterations,
                            verbose_level=0,
                            batch_size=1,
                            optimizer='sgd',
                            weight_regularizer=_lambda)

        clf.fit(np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)),
                y_train)

        X_distances_train = clf.transform(X_train)
        X_distances_test = clf.transform(X_test)

        lr = GridSearchCV(LogisticRegression(), {
            'penalty': ['l1', 'l2'],
            'C': [0.001, 0.01, 0.1, 1.0, 10.0]
        })
        lr.fit(X_distances_train, y_train)

        acc = accuracy_score(y_test, lr.predict(X_distances_test))

        print([K, L, R, _lambda, n_iterations], acc)

        lts_results.append([K, L, R, _lambda, n_iterations, acc])
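The snippet above is truncated: K, X_train/X_test, y_train/y_test and lts_results come from an enclosing randomized-search loop. A rough sketch of that frame (all specifics are assumptions):

lts_results = []
for trial in range(n_trials):                  # n_trials: assumed search budget
    K = np.random.choice([0.05, 0.15, 0.3])    # assumed fraction of series length
    # ... then sample L, R, _lambda, n_iterations and fit/evaluate as above ...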
Example #18
class Shapelets():
    def __init__(self,
                 epochs,
                 length,
                 num_shapelet_lengths,
                 num_shapelets,
                 learning_rate,
                 weight_regularizer,
                 batch_size=256,
                 optimizer=Adam):
        '''
            initialize shapelet hyperparameters

            hyperparameters:
                epochs                : number of training epochs
                length                : base shapelet length, expressed as fraction of length of time series
                num_shapelet_lengths  : number of different shapelet lengths
                num_shapelets         : number of unique shapelets to learn at each shapelet length, 
                                        expressed as fraction of length of time series
                learning rate         : learning rate of Keras optimizer
                weight regularizer    : weight regularization used when fitting model
        '''
        self.epochs = epochs
        self.length = length
        self.num_shapelet_lengths = num_shapelet_lengths
        self.num_shapelets = num_shapelets
        self.weight_regularizer = weight_regularizer
        self.batch_size = batch_size
        self.optimizer = optimizer(lr=learning_rate)
        self.shapelet_sizes = None
        self.shapelet_clf = None
        self.encoder = LabelEncoder()

    def clear_session(self):
        if self.shapelet_clf is None:
            raise ValueError(
                "Cannot clear session that has not been initialized")
        self.shapelet_clf.clear_session()
        return

    def load_model(self, series_length, labels, checkpoint):
        '''
            Load model from checkpoint into Shapelet classifier
        '''
        if self.shapelet_clf is None:
            base_size = int(self.length * series_length)
            self.shapelet_sizes = {}
            for sz_idx in range(self.num_shapelet_lengths):
                shp_sz = base_size * (sz_idx + 1)
                self.shapelet_sizes[shp_sz] = int(self.num_shapelets *
                                                  series_length)
            self.shapelet_clf = ShapeletModel(
                n_shapelets_per_size=self.shapelet_sizes,
                optimizer=self.optimizer,
                weight_regularizer=self.weight_regularizer,
                max_iter=self.epochs,
                batch_size=self.batch_size)

        # first generate new model into which to load the weights
        self.encode(labels)
        self.shapelet_clf.generate_model(series_length,
                                         len(self.get_classes()))

        # load weights
        self.shapelet_clf.model.load_weights(checkpoint)

    def fit_transfer_model(self,
                           X_train,
                           y_train,
                           checkpoint,
                           nclasses_prior=2,
                           source_dir=None,
                           val_data=None):
        # encode training and validation labels
        y_train = self.encode(y_train)
        y_val = self.encode(val_data[1])

        # scale training and validation data to between 0 and 1
        X_train_scaled = self.__ScaleData(X_train)
        X_val_scaled = self.__ScaleData(val_data[0])

        if self.shapelet_clf is None:
            base_size = int(self.length * X_train.shape[1])
            self.shapelet_sizes = {}
            for sz_idx in range(self.num_shapelet_lengths):
                shp_sz = base_size * (sz_idx + 1)
                self.shapelet_sizes[shp_sz] = int(self.num_shapelets *
                                                  X_train.shape[1])
            self.shapelet_clf = ShapeletModel(
                n_shapelets_per_size=self.shapelet_sizes,
                optimizer=self.optimizer,
                weight_regularizer=self.weight_regularizer,
                max_iter=self.epochs,
                batch_size=self.batch_size)

        # fit shapelet classifier
        self.shapelet_clf.fit_transfer_model(X_train_scaled, y_train,
                                             nclasses_prior, checkpoint,
                                             source_dir, (X_val_scaled, y_val))

    def fit(self, X_train, y_train, source_dir=None, val_data=None):
        '''
            fit shapelet classifier on training data

            parameters:
                X_train                : training time series
                y_train                : training labels
        '''
        if self.shapelet_clf is None:
            base_size = int(self.length * X_train.shape[1])
            self.shapelet_sizes = {}
            for sz_idx in range(self.num_shapelet_lengths):
                shp_sz = base_size * (sz_idx + 1)
                self.shapelet_sizes[shp_sz] = int(self.num_shapelets *
                                                  X_train.shape[1])
            self.shapelet_clf = ShapeletModel(
                n_shapelets_per_size=self.shapelet_sizes,
                optimizer=self.optimizer,
                weight_regularizer=self.weight_regularizer,
                max_iter=self.epochs,
                batch_size=self.batch_size)

        # encode training and validation labels
        y_train = self.encode(y_train)
        y_val = self.encode(val_data[1])

        # scale training and validation data to between 0 and 1
        X_train_scaled = self.__ScaleData(X_train)
        X_val_scaled = self.__ScaleData(val_data[0])

        # fit classifier
        self.shapelet_clf.fit(X_train_scaled, y_train, source_dir,
                              (X_val_scaled, y_val))

    def __ScaleData(self, input_data):
        ''' 
            scale input data to range [0,1]

            parameters:
                input_data        : input data to rescale
        '''

        return TimeSeriesScalerMinMax().fit_transform(input_data)

    def predict(self, X_test):
        '''
            classifications for time series in test data set

            parameters:
                X_test:     test time series on which to predict classes

            returns: classifications for test data set
        '''
        X_test_scaled = self.__ScaleData(X_test)
        return self.shapelet_clf.predict(X_test_scaled)

    def predict_proba(self, X_test):
        '''
            class probabilities for time series in test data set

            parameters:
                X_test:     test time series on which to predict classes

            returns: classifications for test data set
        '''
        X_test_scaled = self.__ScaleData(X_test)
        return self.shapelet_clf.predict_proba(X_test_scaled)

    def encode(self, categories):
        '''
            fit label encoder on input categories. returns transformed categories
        '''
        self.encoder.fit(categories)
        return self.encoder.transform(categories)

    def decode(self, y_probs, p_threshold):
        '''
            decode prediction probabilities y_probs into prediction / confidence give p_threshold
        '''
        prob_max = np.amax(y_probs, axis=1)
        prediction_indices = prob_max > p_threshold
        y_pred = np.zeros(y_probs.shape[0])

        # reinterpret confidence in the binary case
        if y_probs.shape[1] == 1:
            y_pred[prediction_indices] = 1
            confidence = (prob_max - p_threshold) / (y_pred - p_threshold)
            confidence = 0.5 + confidence / 2
        else:
            y_pred[prediction_indices] = np.argmax(y_probs,
                                                   axis=1)[prediction_indices]
            confidence = prob_max
        y_pred = y_pred.astype(int)
        y_preds = self.encoder.inverse_transform(y_pred)

        return y_preds, confidence

    def get_classes(self):
        '''
            get original classes from encoder
        '''
        if self.encoder is None:
            raise ValueError("Encoder has not been initialized")
        return self.encoder.classes_

    def VisualizeShapelets(self):
        '''
            visualize all of shapelets learned by shapelet classifier
        '''
        plt.figure()
        for i, sz in enumerate(self.shapelet_sizes.keys()):
            plt.subplot(len(self.shapelet_sizes), 1, i + 1)
            plt.title("%d shapelets of size %d" %
                      (self.shapelet_sizes[sz], sz))
            for shapelet in self.shapelet_clf.shapelets_:
                if ts_size(shapelet) == sz:
                    plt.plot(shapelet.ravel())
            plt.xlim([0, max(self.shapelet_sizes.keys())])
        plt.tight_layout()
        plt.show()

    def VisualizeShapeletLocations(self,
                                   series_values,
                                   series_id,
                                   save_dir='visualizations',
                                   name='shp_1'):
        '''
            visualize shapelets superimposed on one of the test series

            parameters:
                series_values:      raw values on which to visualize shapelets
                series_id:          id of the time series to visualize
                save_dir:           directory in which to save visualizations
                name:               name under which to save the visualization (unique per call)
        '''

        plt.style.use("seaborn-whitegrid")

        # NK brand colors
        COLORS = [
            "#FA5655", "#F79690", "#B9BC2D", "#86B6B2", "#955B99", "#252B7A"
        ]
        # others? "#8D6B2C",
        # "#D0A826",
        # "#FEDB03",
        # "#000000",
        # "#454545",
        # "#FFFFFF",
        # "#F8F6F1"]
        n_rows, n_cols, _ = series_values.shape
        test_series = series_values[series_id].reshape(-1, )

        closest_inds = self.shapelet_clf.locate(test_series.reshape(1, -1,
                                                                    1))[0]
        closest_dists = []
        for ind, shp in zip(closest_inds, self.shapelet_clf.shapelets_):
            closest_dists.append(
                np.linalg.norm(test_series[ind:ind + shp.shape[0]] - shp))
        closest_dists = np.array(closest_dists)

        # convert distance to weight where dist=0 -> wt=1 and dist=inf -> wt=0
        sl_weights = 1 / (1 + closest_dists)
        # plot the signal with matching shapelets color overlayed
        plt.clf()
        plt.plot(range(n_cols), test_series, color="k")
        for ind, sl, wt, color in zip(closest_inds,
                                      self.shapelet_clf.shapelets_, sl_weights,
                                      COLORS):
            # find closest match
            t = range(ind, ind + sl.shape[0])
            match = test_series[ind:ind + sl.shape[0]]
            # plot the shapelet on top of the signal, with line width and alpha set by its weight
            plt.plot(t, match, alpha=7 * wt, linewidth=35 * wt, color=color)
        plt.ylabel('Email Density')
        plt.xlabel('Minute of the Hour')
        plt.show()
        #plt.savefig(save_dir + "/{}_signal_size_{}_id_{}.png".format(name, n_cols, series_id))

        # plot shapelets
        plt.clf()
        # to plot the shapelets, switch to dark background
        plt.style.use("seaborn-darkgrid")
        # ax = plt.axes()  # used below for sharex, sharey (if needed?)

        # arrange shapelets in a grid - find the greatest factor of the shapelet count
        gf = 0
        shp_t = self.shapelet_clf.shapelets_as_time_series_
        shp = self.shapelet_clf.shapelets_
        for i in range(1, shp.shape[0]):
            if shp.shape[0] % i == 0:
                gf = i
        of = int(shp.shape[0] / gf)
        n_cols = 2
        for i in range(shp_t.shape[0]):
            ax_i = plt.subplot(gf, of, i + 1)
            # we could force them to share the same axes
            # ax_i = plt.subplot(n_rows, n_cols, i + 1, sharex=ax, sharey=ax)
            #ax_i.set_xticklabels([])
            ax_i.set_yticklabels([])
            plt.plot(range(shp_t.shape[1]),
                     shp[i].reshape(-1),
                     color=COLORS[i % len(COLORS)],
                     linewidth=3)
            plt.xlabel('Shapelet Length')
        plt.show()
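A usage sketch for the Shapelets wrapper above (all hyperparameter values are illustrative):

shp = Shapelets(epochs=100, length=0.1, num_shapelet_lengths=3,
                num_shapelets=0.2, learning_rate=0.01, weight_regularizer=0.01)
shp.fit(X_train, y_train, val_data=(X_val, y_val))
y_pred = shp.predict(X_test)
shp.VisualizeShapelets()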