Example #1
0
def test_embeddings_image(log_dir):
    """The TensorBoard embeddings callback should create an 'embeddings' dir."""
    dataset = datasets.load_iris()
    data, labels = dataset.data, dataset.target

    epochs = 2
    callback = TensorBoardEmbeddingsImage(data, labels, log_dir, epoch_interval=1)
    model = Ivis(epochs=epochs, k=15, batch_size=16, callbacks=[callback])

    model.fit_transform(data)
    assert os.path.exists(os.path.join(log_dir, 'embeddings'))
Example #2
0
def test_embeddings_image(log_dir):
    """The EmbeddingsImage callback should save a PNG for the final epoch."""
    dataset = datasets.load_iris()
    data, labels = dataset.data, dataset.target

    fname_template = 'embeddings_{}.png'
    epochs = 2
    callback = EmbeddingsImage(data, labels, log_dir, fname_template,
                               epoch_interval=1)
    model = Ivis(epochs=epochs, k=15, batch_size=16, callbacks=[callback])

    model.fit_transform(data)
    assert os.path.exists(os.path.join(log_dir, fname_template.format(epochs)))
Example #3
0
def test_model_checkpoint(log_dir):
    """ModelCheckpoint should write a loadable .ivis checkpoint each interval."""
    data = datasets.load_iris().data

    fname_template = 'model-checkpoint_{}.ivis'
    epochs = 2
    checkpoint = ModelCheckpoint(log_dir, fname_template, epoch_interval=1)
    model = Ivis(epochs=epochs, k=15, batch_size=16, callbacks=[checkpoint])

    model.fit_transform(data)
    restored = Ivis()
    restored.load_model(os.path.join(log_dir, fname_template.format(epochs)))

    # Test continuing training from the restored checkpoint
    restored.fit_transform(data)
Example #4
0
def test_custom_loss_ivis_callable(model_filepath):
    """A class-based (callable) custom triplet loss should train, save, reload."""
    data = datasets.load_iris().data

    class EuclideanDistance:
        """Triplet margin loss built on euclidean distance."""

        def __init__(self, margin=1):
            self.margin = margin
            # Keras serializes loss callables by their __name__ attribute
            self.__name__ = self.__class__.__name__

        def _euclidean_distance(self, x, y):
            return K.sqrt(
                K.maximum(K.sum(K.square(x - y), axis=-1, keepdims=True),
                          K.epsilon()))

        def __call__(self, y_true, y_pred):
            anchor, positive, negative = tf.unstack(y_pred)
            pos_dist = self._euclidean_distance(anchor, positive)
            neg_dist = self._euclidean_distance(anchor, negative)
            return K.mean(K.maximum(pos_dist - neg_dist + self.margin, 0))

    model = Ivis(distance=EuclideanDistance(margin=2), k=15, batch_size=16,
                 epochs=5)
    model.fit_transform(data)

    # Test model saving and loading; the same loss must be supplied on load
    model.save_model(model_filepath, overwrite=True)
    restored = Ivis(distance=EuclideanDistance(margin=2))
    restored.load_model(model_filepath)
    restored.fit(data)
class IvisWrapper():
    """Thin adapter exposing Ivis through a fit_transform-only interface."""

    def __init__(self, dims):
        # NOTE(review): stores embedding dims under .k, although Ivis itself
        # uses `k` for the neighbour count — confusing but preserved.
        self.k = dims

    def fit_transform(self, data):
        self.model = Ivis(embedding_dims=self.k, k=8)
        embedded = self.model.fit_transform(data)
        os.remove(self.model.annoy_index_path)  # necessary cleanup
        return embedded
Example #6
0
def test_1d_supervied_iris_embedding():
    """Supervised ivis should produce a 1-D embedding of the iris data."""
    dataset = datasets.load_iris()

    model = Ivis(epochs=2, embedding_dims=1)
    model.k = 15
    model.batch_size = 16

    model.fit_transform(dataset.data, dataset.target)
Example #7
0
def test_iris_embedding():
    """Unsupervised ivis with early-stopping patience should embed iris."""
    dataset = datasets.load_iris()

    model = Ivis(n_epochs_without_progress=5)
    model.k = 15
    model.batch_size = 16

    model.fit_transform(dataset.data)
Example #8
0
def test_score_samples_unsupervised():
    """score_samples must fail on a model trained without labels."""
    data = datasets.load_iris().data

    model = Ivis(k=15, batch_size=16, epochs=2)
    model.fit_transform(data)

    # Unsupervised model cannot classify
    with pytest.raises(Exception):
        model.score_samples(data)
Example #9
0
def test_embeddings_logging(log_dir):
    """EmbeddingsLogging should dump per-epoch embeddings as .npy files."""
    data = datasets.load_iris().data

    fname_template = 'embeddings_{}.npy'
    epochs = 2
    logger_cb = EmbeddingsLogging(data, log_dir, fname_template, epoch_interval=1)
    model = Ivis(epochs=epochs, k=15, batch_size=16, callbacks=[logger_cb])

    model.fit_transform(data)
    # The final epoch's embeddings must be loadable from disk
    np.load(os.path.join(log_dir, fname_template.format(epochs)))
Example #10
0
def test_multidimensional_inputs():
    """Ivis should accept image-shaped inputs via a convolutional base model."""
    images = np.ones(shape=(32, 8, 8, 3))

    conv_base = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(4, 3, input_shape=(8, 8, 3)),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.GlobalAveragePooling2D(),
    ])

    model = Ivis(model=conv_base, epochs=5, k=4, batch_size=4)
    model.fit_transform(images)
Example #11
0
def test_iris_embedding():
    """Semi-supervised ivis should train when half the labels are masked (-1)."""
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target
    # Mark a random half of the points as unlabeled
    unlabeled = np.random.choice(range(len(labels)), size=len(labels) // 2,
                                 replace=False)
    labels[unlabeled] = -1

    model = Ivis(epochs=5)
    model.k = 15
    model.batch_size = 16

    model.fit_transform(features, labels)
Example #12
0
def test_correctly_indexed_classificaton_classes():
    """Consecutive 0-based class labels should train under sparse CE supervision."""
    dataset = datasets.load_iris()

    metric = 'sparse_categorical_crossentropy'
    model = Ivis(k=15, batch_size=16, epochs=2, supervision_metric=metric)

    model.fit_transform(dataset.data, dataset.target)
Example #13
0
def test_custom_ndarray_neighbour_matrix():
    """A user-supplied ndarray neighbour matrix should be accepted by Ivis."""
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target

    # For each point, its neighbours are all the points of the same class
    by_class = {label: np.argwhere(labels == label).ravel()
                for label in np.unique(labels)}
    neighbours = np.array([by_class[label] for label in labels])

    model = Ivis(epochs=5, neighbour_matrix=neighbours)
    model.k = 15
    model.batch_size = 16

    model.fit_transform(features)
Example #14
0
def test_invalid_metric():
    """An unknown supervision metric must raise ValueError at fit time."""
    dataset = datasets.load_iris()

    model = Ivis(k=15, batch_size=16, epochs=2,
                 supervision_metric='invalid_loss_function')

    # Loss function not specified
    with pytest.raises(ValueError):
        model.fit_transform(dataset.data, dataset.target)
Example #15
0
def test_svm_score_samples():
    """categorical_hinge supervision requires +/-1 one-hot labels (SVM head)."""
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target

    metric = 'categorical_hinge'
    model = Ivis(k=15, batch_size=16, epochs=5, supervision_metric=metric)

    # Incorrectly formatted labels from SVM
    with pytest.raises(ValueError):
        model.fit_transform(features, labels)

    # Correctly formatted labels train successfully
    labels = to_categorical(labels) * 2 - 1
    model.fit_transform(features, labels)

    model.score_samples(features)
    # The supervised head should be a linear, regularized SVM-style layer
    output_layer = model.model_.layers[-1]
    assert model.model_.loss['supervised'] == metric
    assert output_layer.activation.__name__ == 'linear'
    assert output_layer.kernel_regularizer is not None
    assert output_layer.output_shape[-1] == labels.shape[-1]
Example #16
0
def test_non_consecutive_indexed_classificaton_classes():
    """Non-consecutive class indices must be rejected for sparse CE supervision."""
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target

    # Make labels non-consecutive indexed
    labels[labels == max(labels)] = max(labels) + 1

    model = Ivis(k=15, batch_size=16, epochs=2,
                 supervision_metric='sparse_categorical_crossentropy')

    with pytest.raises(ValueError):
        model.fit_transform(features, labels)
Example #17
0
def test_correctly_indexed_semi_supervised_classificaton_classes():
    """Semi-supervised sparse CE: -1 marks unlabeled points, rest are 0-based."""
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target

    # Mark points as unlabeled
    unlabeled = np.random.choice(range(len(labels)), size=len(labels) // 2,
                                 replace=False)
    labels[unlabeled] = -1

    model = Ivis(k=15, batch_size=16, epochs=5,
                 supervision_metric='sparse_categorical_crossentropy')

    model.fit_transform(features, labels)
Example #18
0
def test_supervised_model_saving(model_filepath):
    """Saving then loading a supervised model must reproduce it exactly."""
    model = Ivis(k=15, batch_size=16, epochs=5,
                 supervision_metric='sparse_categorical_crossentropy')
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target

    model.fit(features, labels)
    model.save_model(model_filepath, overwrite=True)

    restored = Ivis()
    restored.load_model(model_filepath)

    # Check that model embeddings are same
    assert np.all(model.transform(features) == restored.transform(features))
    # Check that model supervised predictions are same
    assert np.all(model.score_samples(features) ==
                  restored.score_samples(features))
    # Serialized state dicts should match
    assert model.__getstate__() == restored.__getstate__()

    # Check all encoder weights are the same
    for layer_a, layer_b in zip(model.encoder.layers, restored.encoder.layers):
        for w_a, w_b in zip(layer_a.get_weights(), layer_b.get_weights()):
            assert np.all(w_a == w_b)

    # Check optimizer weights are the same
    for w_a, w_b in zip(model.model_.optimizer.get_weights(),
                        restored.model_.optimizer.get_weights()):
        assert np.all(w_a == w_b)

    # Check that trying to save over an existing folder raises an Exception
    with pytest.raises(FileExistsError) as exception_info:
        model.save_model(model_filepath)
        assert isinstance(exception_info.value, FileExistsError)

    # Check that can overwrite existing model if requested
    model.save_model(model_filepath, overwrite=True)

    # Train new model
    restored.fit_transform(features, labels)
Example #19
0
def test_custom_model_saving(model_filepath):
    """A save/load round-trip must preserve a user-supplied base model exactly."""
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target

    # Create a custom model
    inputs = tf.keras.layers.Input(shape=(features.shape[-1], ))
    hidden = tf.keras.layers.Dense(128, activation='relu')(inputs)
    base_model = tf.keras.Model(inputs, hidden)

    model = Ivis(k=15, batch_size=16, epochs=5,
                 supervision_metric='sparse_categorical_crossentropy',
                 model=base_model)

    model.fit(features, labels)
    model.save_model(model_filepath, overwrite=True)

    restored = Ivis()
    restored.load_model(model_filepath)

    # Check that model embeddings are same
    assert np.all(model.transform(features) == restored.transform(features))
    # Check that model supervised predictions are same
    assert np.all(model.score_samples(features) ==
                  restored.score_samples(features))
    # Serialized state dicts should match
    assert model.__getstate__() == restored.__getstate__()

    # Check all encoder weights are the same
    for layer_a, layer_b in zip(model.encoder.layers, restored.encoder.layers):
        for w_a, w_b in zip(layer_a.get_weights(), layer_b.get_weights()):
            assert np.all(w_a == w_b)

    # Check optimizer weights are the same
    for w_a, w_b in zip(model.model_.optimizer.get_weights(),
                        restored.model_.optimizer.get_weights()):
        assert np.all(w_a == w_b)

    # Train new model
    restored.fit_transform(features, labels)
    def _reduce_dims(self, arg):
        """
        Uses ivis to reduce dimensionality to 2.

        :param {Iterable} arg - an array-like object
        :return {np.ndarray} embedded object
        """
        n_rows = arg.shape[0]
        # Scale the neighbourhood size down with the dataset size;
        # for tiny datasets fall back to (almost) all other points.
        if n_rows > 200:
            n_neighbours = int(0.01 * n_rows)
        elif n_rows > 50:
            n_neighbours = int(0.1 * n_rows)
        elif n_rows > 10:
            n_neighbours = int(0.2 * n_rows)
        else:
            n_neighbours = max(int(0.4 * n_rows), n_rows - 3)

        reducer = Ivis(embedding_dims=self.embedding_dims, k=n_neighbours,
                       batch_size=2)
        return reducer.fit_transform(arg)
Example #21
0
def test_h5_file(h5_filepath):
    """Ivis should train directly on an open h5py dataset (no precompute)."""
    rows, dims = 258, 32
    create_random_dataset(h5_filepath, rows, dims)

    # Load data
    with h5py.File(h5_filepath, 'r') as f:
        X_train = f['data']
        y_train = f['labels']

        # Train and transform with ivis
        model = Ivis(epochs=5, k=15, batch_size=16, precompute=False,
                     build_index_on_disk=False)
        embeddings = model.fit_transform(X_train, shuffle_mode='batch')

        assert embeddings.shape[0] == len(X_train)
        assert embeddings.shape[1] == model.embedding_dims
Example #22
0
def test_non_consecutive_indexed_semi_supervised_classificaton_classes():
    """Semi-supervised sparse CE still rejects non-consecutive class indices."""
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target

    # Make labels non-consecutive indexed
    labels[labels == max(labels)] = max(labels) + 1

    # Mark points as unlabeled
    unlabeled = np.random.choice(range(len(labels)), size=len(labels) // 2,
                                 replace=False)
    labels[unlabeled] = -1

    model = Ivis(k=15, batch_size=16, epochs=5,
                 supervision_metric='sparse_categorical_crossentropy')

    with pytest.raises(ValueError):
        model.fit_transform(features, labels)
Example #23
0
def test_h5_file(h5_filepath):
    """Ivis should train on an HDF5Matrix slice and transform a held-out slice."""
    rows, dims = 258, 32
    create_random_dataset(h5_filepath, rows, dims)

    # Load data, holding out the first fifth of the rows as the test split
    split = rows // 5
    X_train = HDF5Matrix(h5_filepath, 'data', start=0, end=split)
    y_train = HDF5Matrix(h5_filepath, 'labels', start=0, end=split)

    X_test = HDF5Matrix(h5_filepath, 'data', start=split, end=rows)
    y_test = HDF5Matrix(h5_filepath, 'labels', start=split, end=rows)

    # Train and transform with ivis
    model = Ivis(epochs=5, k=15, batch_size=16)

    model.fit_transform(X_train, shuffle_mode='batch')
    embeddings = model.transform(X_test)

    assert embeddings.shape[0] == len(X_test)
    assert embeddings.shape[1] == model.embedding_dims
Example #24
0
def test_score_samples():
    """Supervised sparse-CE model should expose softmax class probabilities."""
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target

    metric = 'sparse_categorical_crossentropy'
    model = Ivis(k=15, batch_size=16, epochs=5, supervision_metric=metric)

    model.fit_transform(features, labels)
    probabilities = model.score_samples(features)

    # Softmax probabilities add to one, correct shape
    assert np.sum(probabilities, axis=-1) == pytest.approx(1, 0.01)
    assert probabilities.shape[0] == features.shape[0]
    assert probabilities.shape[1] == len(np.unique(labels))

    # Check that loss function and activation are correct
    assert model.model_.loss['supervised'] == metric
    assert model.model_.layers[-1].activation.__name__ == 'softmax'
Example #25
0
def test_custom_loss_ivis(model_filepath):
    """A function-based custom loss trains; reload requires the same loss."""
    data = datasets.load_iris().data

    def euclidean_loss(y_true, y_pred):
        # Standard triplet margin loss over (anchor, positive, negative)
        margin = 1
        anchor, positive, negative = tf.unstack(y_pred)
        return K.mean(
            K.maximum(
                euclidean_distance(anchor, positive) -
                euclidean_distance(anchor, negative) + margin, 0))

    model = Ivis(distance=euclidean_loss, k=15, batch_size=16, epochs=3)
    model.fit_transform(data)

    # Test model saving and loading
    model.save_model(model_filepath, overwrite=True)
    restored = Ivis(distance=euclidean_loss)
    restored.load_model(model_filepath)

    # Loading without supplying the custom loss must fail
    plain_model = Ivis()
    with pytest.raises(ValueError):
        plain_model.load_model(model_filepath)
Example #26
0
def test_svm_score_samples():
    """categorical_hinge with one-hot labels trains and yields a linear SVM head."""
    dataset = datasets.load_iris()
    features = dataset.data

    metric = 'categorical_hinge'
    model = Ivis(k=15, batch_size=16, epochs=2, supervision_metric=metric)

    # Correctly formatted one-hot labels train successfully
    labels = to_categorical(dataset.target)
    model.fit_transform(features, labels)

    model.score_samples(features)

    # Loss resolves to categorical hinge; head is linear + regularized
    loss_name = model.model_.loss['supervised'].__name__
    assert losses.get(loss_name).__name__ == losses.get(metric).__name__
    output_layer = model.model_.layers[-1]
    assert output_layer.activation.__name__ == 'linear'
    assert output_layer.kernel_regularizer is not None
    assert output_layer.output_shape[-1] == labels.shape[-1]
Example #27
0
def ivis(adata,
         model=None,
         use_rep=None,
         n_pcs=None,
         embedding_dims=2,
         k=150,
         distance='pn',
         batch_size=128,
         epochs=1000,
         n_epochs_without_progress=50,
         margin=1,
         ntrees=50,
         search_k=-1,
         precompute=True,
         copy=False):
    """\
    ivis

    Embed `adata` with the ivis siamese-network dimensionality reduction.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    {doc_n_pcs}
    {use_rep}
    embedding_dims : int, optional (default: 2)
        Number of dimensions in the embedding space.
    k : int, optional (default: 150)
        The number of neighbours to retrieve for each point. Must be less than
        one minus the number of rows in the dataset.
    distance : string, optional (default: "pn")
        The loss function used to train the neural network. One of "pn",
        "euclidean", "softmax_ratio_pn", "softmax_ratio".
    batch_size : int, optional (default: 128)
        The size of mini-batches used during gradient descent while training
        the neural network. Must be less than the number of rows in the dataset.
    epochs : int, optional (default: 1000)
        The maximum number of epochs to train the model for. Each epoch the
        network sees a triplet based on each data-point once.
    n_epochs_without_progress : int, optional (default: 50)
        After this many epochs without an improvement to the loss, terminate
        training early.
    margin : float, optional (default: 1)
        The distance that is enforced between points by the triplet loss
        functions.
    ntrees : int, optional (default: 50)
        The number of random projection trees built by Annoy to approximate
        KNN. More trees use more memory but give more accurate results.
    search_k : int, optional (default: -1)
        The maximum number of nodes inspected during a nearest-neighbour query
        by Annoy. Higher values cost more computation but improve accuracy.
        The default is n_trees * k, where k is the number of neighbours to
        retrieve; if set too low, a variable number of neighbours may be
        retrieved per data-point.
    precompute : boolean, optional (default: True)
        Whether to pre-compute the nearest neighbours. Pre-computing is
        significantly faster but requires more memory; if memory is limited,
        try setting this to False.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    X_ivis : `np.ndarray` (`adata.obs`, dtype `float`)
        IVIS coordinates of data.
    """
    logg.info('computing IVIS', r=True)
    # Work on a copy only when requested; otherwise mutate in place
    adata = adata.copy() if copy else adata

    X = choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs)

    # Imported lazily so scanpy does not hard-depend on ivis
    from ivis import Ivis

    ivis_model = Ivis(model=model,
                      embedding_dims=embedding_dims,
                      k=k,
                      distance=distance,
                      batch_size=batch_size,
                      epochs=epochs,
                      n_epochs_without_progress=n_epochs_without_progress,
                      margin=margin,
                      ntrees=ntrees,
                      search_k=search_k,
                      precompute=precompute)
    adata.obsm['X_ivis'] = ivis_model.fit_transform(X)

    logg.info('    finished',
              time=True,
              end=' ' if settings.verbosity > 2 else '\n')
    logg.hint('added\n' '    \'X_ivis\', IVIS coordinates (adata.obsm)')
    return adata if copy else None
# NOTE(review): this fragment is truncated — the final plt.scatter call is cut
# off mid-argument list, and several names (X_scaled, mnist, hdbscan, plt,
# standard_embedding) are defined elsewhere; do not run as-is.
import pandas as pd
import umap
from ivis import Ivis
import numpy as np

# Fit a 2-D ivis embedding; presumably X_scaled is pre-scaled feature data —
# TODO confirm against the surrounding notebook/script.
model = Ivis(embedding_dims=2, k=15)

embeddings = model.fit_transform(X_scaled)

# dimension reduction
clusterable_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
).fit_transform(mnist.data)
# plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1],
#             c=mnist.target, s=0.1, cmap='Spectral');

# cluster
labels = hdbscan.HDBSCAN(
    min_samples=10,
    min_cluster_size=500,
).fit_predict(clusterable_embedding)

# visualize
# HDBSCAN labels noise points as -1; `clustered` masks the real clusters
clustered = (labels >= 0)
plt.scatter(standard_embedding[~clustered, 0],
            standard_embedding[~clustered, 1],
            c=(0.5, 0.5, 0.5),
            s=0.1,
Example #29
0
Ivis can be easily applied to unstructured datasets, including images.
Here we visualise the MNIST digits dataset using two-dimensional ivis
embeddings.
"""

import os
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from ivis import Ivis

# Downloads MNIST from OpenML on first run (network access required)
mnist = fetch_openml('mnist_784', version=1)

# 'maaten' selects a predefined network architecture; verbose=0 silences logs
ivis = Ivis(model='maaten', verbose=0)
embeddings = ivis.fit_transform(mnist.data)

# OpenML returns string labels; convert to ints for the colour map
color = mnist.target.astype(int)

plt.figure(figsize=(8, 8), dpi=150)
plt.scatter(x=embeddings[:, 0],
            y=embeddings[:, 1],
            c=color,
            cmap="Spectral",
            s=0.1)
plt.xlabel('ivis 1')
plt.ylabel('ivis 2')
plt.show()

# Remove the Annoy index ivis writes to the working directory during training
os.remove('annoy.index')