def fit_predict(self, y):
        """Merge neighbouring clusters of ``y`` until ``self.n_clusters`` remain.

        Every row of ``y`` starts as its own cluster.  ``dists[i]`` holds the
        cosine distance between the cluster represented by row ``i`` (its
        "centre") and the cluster that follows it in the sequence.  On each
        iteration the pair with the smallest distance is merged: the centre
        row of the first cluster is overwritten with the size-weighted mean of
        both centres, the slot of the second centre is marked as merged, and
        the distances to the previous and the next cluster are recomputed.

        ``y`` is modified in place (centre rows are overwritten).

        :param y: 2-D array, one vector per row, in sequence order.
        :return: 1-D array of cluster indices, one per entry of the distance
            vector, starting at 0 and increasing along the sequence.  The
            dtype is the smallest unsigned integer type that can hold the
            largest label (``uint8`` for up to 256 clusters).
        :raises ValueError: if ``self.n_clusters`` is smaller than 1 or larger
            than the number of rows of ``y``.
        """
        n_samples = y.shape[0]
        # Without this check the merge loop below could never reach its
        # target and would run on marker values instead of real distances.
        if not 1 <= self.n_clusters <= n_samples:
            raise ValueError(
                "n_clusters must be between 1 and the number of rows of y "
                "(%d), got %r" % (n_samples, self.n_clusters))

        # Marker values stored in ``dists``.  The merged marker is also what
        # the label conversion at the end tests for, so it must stay 10.
        # NOTE(review): _get_cluster_size / _get_previous_cluster_center
        # presumably rely on the same value - confirm before changing it.
        merged_marker = 10
        sequence_end_marker = 100

        def center_distance(first, second):
            # Two single-row slices are compared, so the metric is expected to
            # return exactly one value; ``item()`` extracts it as a scalar
            # instead of relying on implicit size-1 array -> scalar conversion
            # during item assignment.
            return np.asarray(
                cosine_distances(y[first:first + 1, :],
                                 y[second:second + 1, :])).item()

        # Append value at the position of last vector in the y selection
        dists = np.append(compute_distance(y, cosine_distances, False), 1000)

        n_clusters_current = n_samples

        # ``>`` instead of ``!=`` so the loop also stops for a non-integral
        # target instead of counting past it.
        while n_clusters_current > self.n_clusters:
            cluster1_center = np.argmin(dists)
            cluster1_size = self._get_cluster_size(dists, cluster1_center)

            # The second cluster is addressed right behind the first one.
            cluster2_center = cluster1_center + cluster1_size
            cluster2_size = self._get_cluster_size(dists, cluster2_center)

            new_cluster_size = cluster1_size + cluster2_size

            # Compute new cluster center (size-weighted mean of both centres)
            y[cluster1_center, :] = (
                y[cluster1_center, :] * cluster1_size
                + y[cluster2_center, :] * cluster2_size) / new_cluster_size

            # Mark the slot of the absorbed centre as invalid
            dists[cluster2_center] = merged_marker

            # Recompute the distance to the previous cluster if the current
            # cluster is not at the beginning of sequence
            if cluster1_center != 0:
                prev_cluster_center = self._get_previous_cluster_center(
                    dists, cluster1_center)
                dists[prev_cluster_center] = center_distance(
                    prev_cluster_center, cluster1_center)

            # Recompute the distance to the next cluster
            # If the current cluster is at the end of sequence set big dummy value
            next_cluster_center = cluster1_center + new_cluster_size
            if next_cluster_center == dists.shape[0]:
                dists[cluster1_center] = sequence_end_marker
            else:
                dists[cluster1_center] = center_distance(
                    cluster1_center, next_cluster_center)

            n_clusters_current -= 1

        # Convert cluster distances to cluster labels: every slot that is not
        # marked as merged opens a new cluster, merged slots inherit the label
        # of the cluster before them.
        labels = np.cumsum(dists != merged_marker) - 1

        # A fixed uint8 would silently wrap around beyond 256 clusters; pick
        # the smallest type that fits the largest label instead.
        return labels.astype(np.min_scalar_type(int(labels[-1])))
Example #2
0
            'Do you want to use the model trained on cosine distances [c] or on raw SVM predictions [r]?',
            'c', 'r'):
        cosine = True
        model = load_model(config.lstm_model_1)
    else:
        cosine = False
        model = load_model(config.lstm_model_577)

    # Second entry of the batch input shape stored in the model config.
    # NOTE(review): not referenced again in the visible part of this script -
    # confirm that split_to_time_steps uses a matching value.
    time_steps = model.get_config()[0]['config']['batch_input_shape'][1]

    # Inputs ('y') and reference labels ('y_true_lm') of the held-out split.
    held_out = config.get_seg_data('held_out')
    X_held, y_held = np.load(held_out['y']), np.load(held_out['y_true_lm'])

    if not cosine:
        # Raw-input model: a 0 is put in front of the label vector.
        y_held = np.append(0, y_held)
    else:
        # Cosine model: the inputs are replaced by the output of compute_distance.
        print("Computing the distances")
        X_held = compute_distance(X_held, cosine_distances)

    # Inputs and labels go through the same time-step splitting.
    X, y_true = split_to_time_steps(X_held), split_to_time_steps(y_held)
    y_pred = model.predict(X)

    # Labels and predictions are handed over flattened.
    plot_thresholds(y_true.flatten(), y_pred.flatten(), False, average_type="binary")
Example #3
0
    # Ask which model variant to evaluate; each variant has its own model file
    # and its own decision threshold T.
    cosine = bool(first_option(
        'Do you want to use the model trained on cosine distances [c] or on raw SVM predictions [r]?',
        'c', 'r'))
    model = load_model(config.lstm_model_1 if cosine else config.lstm_model_577)
    T = 0.41 if cosine else 0.48

    # Second entry of the batch input shape stored in the model config.
    # NOTE(review): not referenced again in the visible part of this script -
    # confirm that split_to_time_steps uses a matching value.
    time_steps = model.get_config()[0]['config']['batch_input_shape'][1]

    # Inputs ('y') and reference labels ('y_true_lm') of the test split.
    test = config.get_seg_data('test')
    X_test, y_test = np.load(test['y']), np.load(test['y_true_lm'])

    if not cosine:
        # Raw-input model: a 0 is put in front of the label vector.
        y_test = np.append(0, y_test)
    else:
        # Cosine model: the inputs are replaced by the output of compute_distance.
        print("Computing the distances")
        X_test = compute_distance(X_test, cosine_distances)

    # Inputs and labels go through the same time-step splitting.
    X, y_true = split_to_time_steps(X_test), split_to_time_steps(y_test)

    # Binarise the network output with the variant-specific threshold.
    y_pred = model.predict(X) > T

    print_measurements(y_true, y_pred)
from sklearn.svm import LinearSVC, SVC

import config
from segmentation.distance_based_methods import compute_distance
from utils import save_pickle, first_option, load_sparse_csr

if __name__ == '__main__':
    # Fit an SVM on the training split and pickle it to the configured path.
    data = config.get_seg_data('train')

    # The answer selects both the estimator and the file it is stored in.
    wants_linear = first_option(
        'Do you want to use linear [l] or RBF [r] kernel?', 'l', 'r')
    if wants_linear:
        path, classifier = config.classifier_linear, LinearSVC(random_state=0)
    else:
        path, classifier = config.classifier_rbf, SVC(random_state=0, kernel='rbf')

    y_true = np.load(data['y_true_lm'])

    print("Loading x")
    x = load_sparse_csr(data['x'])

    # NOTE(review): compute_distance is called with a single argument here;
    # confirm that its defaults produce the intended (cosine) distance and
    # that the result has one row per entry of y_true.
    print("Computing cosine distance")
    x_dists = compute_distance(x)

    print("Classifier training")
    classifier.fit(x_dists, y_true)

    print("Saving th classifier to: " + path)
    save_pickle(path, classifier)
    # Create dir for histories
    create_dir(config.hist_dir)

    print("Loading the data")
    # Lookup objects for the two splits; their 'y' and 'y_true_lm' entries are
    # passed to np.load below.
    train = config.get_seg_data('train')
    held_out = config.get_seg_data('held_out')

    # Training inputs and labels, loaded as stored (no transformation yet).
    X_train_or = np.load(train['y'])
    y_train_or = np.load(train['y_true_lm'])

    # Held-out inputs and labels.
    X_held_out = np.load(held_out['y'])
    y_held_out = np.load(held_out['y_true_lm'])

    # NOTE(review): the message says "test data", but the array transformed
    # here is the held-out split.
    print("Computing the distances on test data")
    X_held_out = compute_distance(X_held_out, cosine_distances)

    # Split the 2D matrix to 3D matrix of dimensions [samples, time_steps, features]
    X_held_out = split_to_time_steps(X_held_out)

    # Split the 1D vector to 2D matrix of dimensions: [samples, time_steps]
    y_held_out = split_to_time_steps(y_held_out)

    # Add a trailing axis of length 1: [samples, time_steps] -> [samples, time_steps, 1]
    y_held_out = np.reshape(y_held_out,
                            (y_held_out.shape[0], y_held_out.shape[1], 1))

    # Append 0 value to the beginning of y so the values represent if there was a boundary between current sample and
    # the previous one, not the current and next
    y_train_or = np.append(0, y_train_or)
    # Number of iterations of the loop that starts right below.
    shuffling_epochs = 2
    for i in range(shuffling_epochs):
Example #6
0
    # SVM part: vectorise the corpus and turn it into class probabilities.
    print('Transforming corpus by vectorizer')
    corpus_tfidf = vectorizer.transform(corpus)

    print('Loading the classifier')
    classifier = load_pickle(config.classifier)

    X = classifier.predict_proba(corpus_tfidf)

    # Only X is used from here on; drop the references to the rest.
    del corpus, corpus_tfidf, classifier, vectorizer

    # LSTM part
    if first_option(
            'Do you want to use the model trained on cosine distances [c] or on raw SVM predictions [r]?',
            'c', 'r'):
        # Set the flag in this branch as well, so that ``cosine`` is defined
        # whichever model is chosen (it used to be assigned in the other
        # branch only).
        cosine = True
        print("Computing the distances")
        X = compute_distance(X, cosine_distances)
        model = load_model(config.lstm_model_1)
        T = 0.41
        # y_true = y_true[1:]
        # Sanity check: inputs and labels must line up one to one.
        assert len(X) == len(y_true), (
            "Dimensions do not match: y.shape = " + str(y_true.shape)
            + " X.shape = " + str(X.shape))
    else:
        cosine = False
        model = load_model(config.lstm_model_577)
        T = 0.48

    # Second entry of the batch input shape stored in the model config.
    # NOTE(review): not referenced again in the visible part of this script -
    # confirm that split_to_time_steps uses a matching value.
    time_steps = model.get_config()[0]['config']['batch_input_shape'][1]

    # Inputs and labels go through the same time-step splitting.
    X = split_to_time_steps(X)
    y_true = split_to_time_steps(y_true)