def test_embeddings_image(log_dir):
    """TensorBoardEmbeddingsImage writes an 'embeddings' directory under log_dir."""
    iris = datasets.load_iris()
    data, labels = iris.data, iris.target
    num_epochs = 2
    callback = TensorBoardEmbeddingsImage(data, labels, log_dir, epoch_interval=1)
    embedder = Ivis(epochs=num_epochs, k=15, batch_size=16, callbacks=[callback])
    embedder.fit_transform(data)
    assert os.path.exists(os.path.join(log_dir, 'embeddings'))
def test_embeddings_image(log_dir):
    """EmbeddingsImage dumps a per-epoch PNG into log_dir."""
    iris = datasets.load_iris()
    data, labels = iris.data, iris.target
    out_template = 'embeddings_{}.png'
    num_epochs = 2
    callback = EmbeddingsImage(data, labels, log_dir, out_template,
                               epoch_interval=1)
    embedder = Ivis(epochs=num_epochs, k=15, batch_size=16, callbacks=[callback])
    embedder.fit_transform(data)
    # The image for the final epoch must have been written
    assert os.path.exists(os.path.join(log_dir, out_template.format(num_epochs)))
def test_model_checkpoint(log_dir):
    """ModelCheckpoint saves a loadable .ivis model each epoch_interval."""
    data = datasets.load_iris().data
    ckpt_template = 'model-checkpoint_{}.ivis'
    num_epochs = 2
    trainer = Ivis(epochs=num_epochs, k=15, batch_size=16,
                   callbacks=[ModelCheckpoint(log_dir, ckpt_template,
                                              epoch_interval=1)])
    trainer.fit_transform(data)

    restored = Ivis()
    restored.load_model(os.path.join(log_dir, ckpt_template.format(num_epochs)))
    # Test continuing training from the restored checkpoint
    restored.fit_transform(data)
def test_custom_loss_ivis_callable(model_filepath):
    """A callable-object triplet loss can train ivis and survive save/load."""
    X = datasets.load_iris().data

    class EuclideanDistance:
        def __init__(self, margin=1):
            self.margin = margin
            # ivis records the loss by name when serializing the model
            self.__name__ = self.__class__.__name__

        def _euclidean_distance(self, a, b):
            squared = K.sum(K.square(a - b), axis=-1, keepdims=True)
            return K.sqrt(K.maximum(squared, K.epsilon()))

        def __call__(self, y_true, y_pred):
            anchor, positive, negative = tf.unstack(y_pred)
            pos_dist = self._euclidean_distance(anchor, positive)
            neg_dist = self._euclidean_distance(anchor, negative)
            return K.mean(K.maximum(pos_dist - neg_dist + self.margin, 0))

    model = Ivis(distance=EuclideanDistance(margin=2), k=15,
                 batch_size=16, epochs=5)
    model.fit_transform(X)

    # Test model saving and loading; loading needs the same loss object
    model.save_model(model_filepath, overwrite=True)
    model_2 = Ivis(distance=EuclideanDistance(margin=2))
    model_2.load_model(model_filepath)
    model_2.fit(X)
class IvisWrapper():
    """Minimal adapter exposing ivis through a fit_transform-only interface."""

    def __init__(self, dims):
        # Target embedding dimensionality, stored on the attribute `k`
        # for compatibility with existing callers.
        self.k = dims

    def fit_transform(self, data):
        self.model = Ivis(embedding_dims=self.k, k=8)
        embedding = self.model.fit_transform(data)
        # necessary cleanup: ivis leaves its Annoy index file on disk
        os.remove(self.model.annoy_index_path)
        return embedding
def test_1d_supervied_iris_embedding():
    """ivis can produce a 1-dimensional supervised embedding of iris."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    embedder = Ivis(epochs=2, embedding_dims=1)
    embedder.k = 15
    embedder.batch_size = 16
    embedder.fit_transform(features, targets)
def test_iris_embedding():
    """Unsupervised ivis fits iris with early stopping enabled."""
    features = datasets.load_iris().data
    embedder = Ivis(n_epochs_without_progress=5)
    embedder.k = 15
    embedder.batch_size = 16
    embedder.fit_transform(features)
def test_score_samples_unsupervised():
    """score_samples must fail on a model trained without labels."""
    features = datasets.load_iris().data
    embedder = Ivis(k=15, batch_size=16, epochs=2)
    embedder.fit_transform(features)
    # Unsupervised model cannot classify — no supervised head exists
    with pytest.raises(Exception):
        embedder.score_samples(features)
def test_embeddings_logging(log_dir):
    """EmbeddingsLogging dumps per-epoch .npy embeddings into log_dir."""
    X = datasets.load_iris().data
    out_template = 'embeddings_{}.npy'
    num_epochs = 2
    logger = EmbeddingsLogging(X, log_dir, out_template, epoch_interval=1)
    model = Ivis(epochs=num_epochs, k=15, batch_size=16, callbacks=[logger])
    model.fit_transform(X)
    # The final epoch's embeddings file must exist and be loadable
    np.load(os.path.join(log_dir, out_template.format(num_epochs)))
def test_multidimensional_inputs():
    """ivis accepts image-shaped inputs when given a convolutional base net."""
    sample_data = np.ones(shape=(32, 8, 8, 3))
    base_model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(4, 3, input_shape=(8, 8, 3)),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.GlobalAveragePooling2D(),
    ])
    Ivis(model=base_model, epochs=5, k=4, batch_size=4).fit_transform(sample_data)
def test_iris_embedding():
    """Semi-supervised fit: half the labels are masked out with -1."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    unlabeled = np.random.choice(range(len(targets)), size=len(targets) // 2,
                                 replace=False)
    targets[unlabeled] = -1  # -1 marks a point as unlabeled
    embedder = Ivis(epochs=5)
    embedder.k = 15
    embedder.batch_size = 16
    embedder.fit_transform(features, targets)
def test_correctly_indexed_classificaton_classes():
    """Consecutive 0-based class labels train without error."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    metric = 'sparse_categorical_crossentropy'
    classifier = Ivis(k=15, batch_size=16, epochs=2, supervision_metric=metric)
    classifier.fit_transform(features, targets)
def test_custom_ndarray_neighbour_matrix():
    """A user-supplied neighbour matrix bypasses the internal KNN build."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    # Every sample's neighbours are all the members of its own class
    rows_per_class = {label: np.argwhere(targets == label).ravel()
                      for label in np.unique(targets)}
    neighbour_matrix = np.array([rows_per_class[label] for label in targets])
    embedder = Ivis(epochs=5, neighbour_matrix=neighbour_matrix)
    embedder.k = 15
    embedder.batch_size = 16
    embedder.fit_transform(features)
def test_invalid_metric():
    """An unknown supervision_metric raises ValueError at fit time."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    classifier = Ivis(k=15, batch_size=16, epochs=2,
                      supervision_metric='invalid_loss_function')
    # Loss function not specified in the backend's registry
    with pytest.raises(ValueError):
        classifier.fit_transform(features, targets)
def test_svm_score_samples():
    """categorical_hinge needs {-1, 1}-encoded one-hot labels and a linear head."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    metric = 'categorical_hinge'
    svm_ivis = Ivis(k=15, batch_size=16, epochs=5, supervision_metric=metric)

    # Incorrectly formatted labels from SVM
    with pytest.raises(ValueError):
        svm_ivis.fit_transform(features, targets)

    # Correctly formatted labels train successfully
    hinge_targets = to_categorical(targets) * 2 - 1
    svm_ivis.fit_transform(features, hinge_targets)
    svm_ivis.score_samples(features)

    output_layer = svm_ivis.model_.layers[-1]
    assert svm_ivis.model_.loss['supervised'] == metric
    assert output_layer.activation.__name__ == 'linear'
    assert output_layer.kernel_regularizer is not None
    assert output_layer.output_shape[-1] == hinge_targets.shape[-1]
def test_non_consecutive_indexed_classificaton_classes():
    """Gapped class indices are rejected for sparse crossentropy."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    # Make labels non-consecutive indexed by shifting the top class up one
    top = max(targets)
    targets[targets == top] = top + 1
    classifier = Ivis(k=15, batch_size=16, epochs=2,
                      supervision_metric='sparse_categorical_crossentropy')
    with pytest.raises(ValueError):
        classifier.fit_transform(features, targets)
def test_correctly_indexed_semi_supervised_classificaton_classes():
    """Semi-supervised sparse crossentropy accepts -1 as 'unlabeled'."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    # Mark half the points as unlabeled
    unlabeled = np.random.choice(range(len(targets)), size=len(targets) // 2,
                                 replace=False)
    targets[unlabeled] = -1
    classifier = Ivis(k=15, batch_size=16, epochs=5,
                      supervision_metric='sparse_categorical_crossentropy')
    classifier.fit_transform(features, targets)
def test_supervised_model_saving(model_filepath):
    """Saving then loading a supervised model preserves outputs and weights."""
    iris = datasets.load_iris()
    X, Y = iris.data, iris.target
    model = Ivis(k=15, batch_size=16, epochs=5,
                 supervision_metric='sparse_categorical_crossentropy')
    model.fit(X, Y)

    model.save_model(model_filepath, overwrite=True)
    model_2 = Ivis()
    model_2.load_model(model_filepath)

    # Embeddings and supervised predictions must round-trip exactly
    assert np.all(model.transform(X) == model_2.transform(X))
    assert np.all(model.score_samples(X) == model_2.score_samples(X))

    # Serializable state dicts agree
    assert model.__getstate__() == model_2.__getstate__()

    # Every encoder layer's weights round-trip exactly
    for layer_a, layer_b in zip(model.encoder.layers, model_2.encoder.layers):
        for w_a, w_b in zip(layer_a.get_weights(), layer_b.get_weights()):
            assert np.all(w_a == w_b)

    # Optimizer state round-trips too, so training can resume seamlessly
    for w1, w2 in zip(model.model_.optimizer.get_weights(),
                      model_2.model_.optimizer.get_weights()):
        assert np.all(w1 == w2)

    # Saving over an existing folder without overwrite raises
    with pytest.raises(FileExistsError) as exception_info:
        model.save_model(model_filepath)
    assert isinstance(exception_info.value, FileExistsError)

    # Overwriting explicitly is allowed
    model.save_model(model_filepath, overwrite=True)

    # The loaded model supports further training
    model_2.fit_transform(X, Y)
def test_custom_model_saving(model_filepath):
    """Save/load round-trip works when ivis wraps a user-supplied network."""
    iris = datasets.load_iris()
    X, Y = iris.data, iris.target

    # Create a custom model
    inputs = tf.keras.layers.Input(shape=(X.shape[-1], ))
    hidden = tf.keras.layers.Dense(128, activation='relu')(inputs)
    custom_model = tf.keras.Model(inputs, hidden)

    model = Ivis(k=15, batch_size=16, epochs=5,
                 supervision_metric='sparse_categorical_crossentropy',
                 model=custom_model)
    model.fit(X, Y)

    model.save_model(model_filepath, overwrite=True)
    model_2 = Ivis()
    model_2.load_model(model_filepath)

    # Embeddings and supervised predictions must round-trip exactly
    assert np.all(model.transform(X) == model_2.transform(X))
    assert np.all(model.score_samples(X) == model_2.score_samples(X))

    # Serializable state dicts agree
    assert model.__getstate__() == model_2.__getstate__()

    # Every encoder layer's weights round-trip exactly
    for layer_a, layer_b in zip(model.encoder.layers, model_2.encoder.layers):
        for w_a, w_b in zip(layer_a.get_weights(), layer_b.get_weights()):
            assert np.all(w_a == w_b)

    # Optimizer state round-trips as well
    for w1, w2 in zip(model.model_.optimizer.get_weights(),
                      model_2.model_.optimizer.get_weights()):
        assert np.all(w1 == w2)

    # The loaded model supports further training
    model_2.fit_transform(X, Y)
def _reduce_dims(self, arg):
    """
    Uses ivis to reduce dimensionality to 2.

    :param {Iterable} arg - an array-like object
    :return {np.ndarray} embedded object
    """
    n_rows = arg.shape[0]
    # k shrinks as a fraction of the dataset as it grows; for very small
    # datasets it is the larger of 0.4*n and n-3 (presumably to keep k
    # within what ivis/Annoy accepts — TODO confirm).
    if n_rows > 200:
        n_neighbours = int(0.01 * n_rows)
    elif n_rows > 50:
        n_neighbours = int(0.1 * n_rows)
    elif n_rows > 10:
        n_neighbours = int(0.2 * n_rows)
    else:
        n_neighbours = max(int(0.4 * n_rows), n_rows - 3)
    reducer = Ivis(embedding_dims=self.embedding_dims, k=n_neighbours,
                   batch_size=2)
    return reducer.fit_transform(arg)
def test_h5_file(h5_filepath):
    """ivis trains directly on an open h5py dataset without precompute."""
    rows, dims = 258, 32
    create_random_dataset(h5_filepath, rows, dims)

    # Load data; the h5py datasets are only valid while the file is open
    with h5py.File(h5_filepath, 'r') as f:
        X_train = f['data']
        y_train = f['labels']

        # Train and transform with ivis
        model = Ivis(epochs=5, k=15, batch_size=16,
                     precompute=False, build_index_on_disk=False)
        y_pred = model.fit_transform(X_train, shuffle_mode='batch')

        assert y_pred.shape[0] == len(X_train)
        assert y_pred.shape[1] == model.embedding_dims
def test_non_consecutive_indexed_semi_supervised_classificaton_classes():
    """Gapped class indices are rejected even in semi-supervised mode."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target

    # Make labels non-consecutive indexed
    top = max(targets)
    targets[targets == top] = top + 1

    # Mark half the points as unlabeled
    unlabeled = np.random.choice(range(len(targets)), size=len(targets) // 2,
                                 replace=False)
    targets[unlabeled] = -1

    classifier = Ivis(k=15, batch_size=16, epochs=5,
                      supervision_metric='sparse_categorical_crossentropy')
    with pytest.raises(ValueError):
        classifier.fit_transform(features, targets)
def test_h5_file(h5_filepath):
    """ivis trains on an HDF5Matrix slice and transforms a held-out slice."""
    rows, dims = 258, 32
    create_random_dataset(h5_filepath, rows, dims)

    # Load data: first fifth of the rows trains, the remainder is held out
    split = rows // 5
    X_train = HDF5Matrix(h5_filepath, 'data', start=0, end=split)
    y_train = HDF5Matrix(h5_filepath, 'labels', start=0, end=split)
    X_test = HDF5Matrix(h5_filepath, 'data', start=split, end=rows)
    y_test = HDF5Matrix(h5_filepath, 'labels', start=split, end=rows)

    # Train and transform with ivis
    embedder = Ivis(epochs=5, k=15, batch_size=16)
    embedder.fit_transform(X_train, shuffle_mode='batch')
    y_pred = embedder.transform(X_test)

    assert y_pred.shape[0] == len(X_test)
    assert y_pred.shape[1] == embedder.embedding_dims
def test_score_samples():
    """score_samples returns valid softmax class probabilities."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    metric = 'sparse_categorical_crossentropy'
    classifier = Ivis(k=15, batch_size=16, epochs=5, supervision_metric=metric)
    classifier.fit_transform(features, targets)
    probabilities = classifier.score_samples(features)

    # Softmax probabilities add to one, correct shape
    assert np.sum(probabilities, axis=-1) == pytest.approx(1, 0.01)
    assert probabilities.shape[0] == features.shape[0]
    assert probabilities.shape[1] == len(np.unique(targets))

    # Check that loss function and activation are correct
    assert classifier.model_.loss['supervised'] == metric
    assert classifier.model_.layers[-1].activation.__name__ == 'softmax'
def test_custom_loss_ivis(model_filepath):
    """A plain-function triplet loss trains and reloads; loading without the
    custom loss registered raises ValueError."""
    X = datasets.load_iris().data

    def euclidean_loss(y_true, y_pred):
        margin = 1
        anchor, positive, negative = tf.unstack(y_pred)
        pos_dist = euclidean_distance(anchor, positive)
        neg_dist = euclidean_distance(anchor, negative)
        return K.mean(K.maximum(pos_dist - neg_dist + margin, 0))

    model = Ivis(distance=euclidean_loss, k=15, batch_size=16, epochs=3)
    model.fit_transform(X)

    # Test model saving and loading
    model.save_model(model_filepath, overwrite=True)
    model_2 = Ivis(distance=euclidean_loss)
    model_2.load_model(model_filepath)

    # A model constructed without the custom loss cannot deserialize it
    model_3 = Ivis()
    with pytest.raises(ValueError):
        model_3.load_model(model_filepath)
def test_svm_score_samples():
    """categorical_hinge trains on one-hot labels with a linear output layer."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    metric = 'categorical_hinge'
    svm_ivis = Ivis(k=15, batch_size=16, epochs=2, supervision_metric=metric)

    # Correctly formatted one-hot labels train successfully
    one_hot = to_categorical(targets)
    svm_ivis.fit_transform(features, one_hot)
    svm_ivis.score_samples(features)

    # The compiled supervised loss resolves to the requested metric
    loss_name = svm_ivis.model_.loss['supervised'].__name__
    assert losses.get(loss_name).__name__ == losses.get(metric).__name__

    output_layer = svm_ivis.model_.layers[-1]
    assert output_layer.activation.__name__ == 'linear'
    assert output_layer.kernel_regularizer is not None
    assert output_layer.output_shape[-1] == one_hot.shape[-1]
def ivis(adata, model=None, use_rep=None, n_pcs=None, embedding_dims=2,
         k=150, distance='pn', batch_size=128, epochs=1000,
         n_epochs_without_progress=50, margin=1, ntrees=50, search_k=-1,
         precompute=True, copy=False):
    """\
    ivis

    Embed an AnnData object with the ivis neural-network method and store
    the coordinates in ``adata.obsm['X_ivis']``.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    {doc_n_pcs}
    {use_rep}
    embedding_dims : int, optional (default: 2)
        Number of dimensions in the embedding space.
    k : int, optional (default: 150)
        The number of neighbours to retrieve for each point. Must be less
        than one minus the number of rows in the dataset.
    distance : string, optional (default: "pn")
        The loss function used to train the neural network. One of "pn",
        "euclidean", "softmax_ratio_pn", "softmax_ratio".
    batch_size : int, optional (default: 128)
        The size of mini-batches used during gradient descent while
        training the neural network. Must be less than the num_rows in
        the dataset.
    epochs : int, optional (default: 1000)
        The maximum number of epochs to train the model for. Each epoch
        the network will see a triplet based on each data-point once.
    n_epochs_without_progress : int, optional (default: 50)
        After n number of epochs without an improvement to the loss,
        terminate training early.
    margin : float, optional (default: 1)
        The distance that is enforced between points by the triplet loss
        functions.
    ntrees : int, optional (default: 50)
        The number of random projections trees built by Annoy to
        approximate KNN. The more trees the higher the memory usage, but
        the better the accuracy of results.
    search_k : int, optional (default: -1)
        The maximum number of nodes inspected during a nearest neighbour
        query by Annoy. The higher, the more computation time required,
        but the higher the accuracy. The default is n_trees * k, where k
        is the number of neighbours to retrieve. If this is set too low,
        a variable number of neighbours may be retrieved per data-point.
    precompute : boolean, optional (default: True)
        Whether to pre-compute the nearest neighbours. Pre-computing is
        significantly faster, but requires more memory. If memory is
        limited, try setting this to False.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following
    fields.

    X_ivis : `np.ndarray` (`adata.obs`, dtype `float`)
        IVIS coordinates of data.
    """
    logg.info('computing IVIS', r=True)
    adata = adata.copy() if copy else adata

    # Pick the matrix (raw / PCA / named rep) that ivis should embed
    X = choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs)

    params_ivis = dict(
        model=model,
        embedding_dims=embedding_dims,
        k=k,
        distance=distance,
        batch_size=batch_size,
        epochs=epochs,
        n_epochs_without_progress=n_epochs_without_progress,
        margin=margin,
        ntrees=ntrees,
        search_k=search_k,
        precompute=precompute,
    )

    # Imported lazily so ivis is only required when this function is used
    from ivis import Ivis
    mapper = Ivis(**params_ivis)
    adata.obsm['X_ivis'] = mapper.fit_transform(X)

    logg.info(' finished',
              time=True,
              end=' ' if settings.verbosity > 2 else '\n')
    logg.hint('added\n \'X_ivis\', IVIS coordinates (adata.obsm)')
    return adata if copy else None
import pandas as pd
import umap
from ivis import Ivis
import numpy as np

# NOTE(review): this fragment references names defined elsewhere
# (X_scaled, mnist, hdbscan, plt, standard_embedding) and its final
# statement is truncated — confirm against the full script.
model = Ivis(embedding_dims=2, k=15)
embeddings = model.fit_transform(X_scaled)

# dimension reduction
clusterable_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
).fit_transform(mnist.data)

# plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1],
#             c=mnist.target, s=0.1, cmap='Spectral');

# cluster
labels = hdbscan.HDBSCAN(
    min_samples=10,
    min_cluster_size=500,
).fit_predict(clusterable_embedding)

# visualize
clustered = (labels >= 0)
plt.scatter(standard_embedding[~clustered, 0],
            standard_embedding[~clustered, 1],
            c=(0.5, 0.5, 0.5),
            s=0.1,
Ivis can be easily applied to unstructured datasets, including images.
Here we visualise the MNIST digits dataset using two-dimensional ivis
embeddings.
"""
import os
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from ivis import Ivis

# Download MNIST (784-pixel digit images) from OpenML
mnist = fetch_openml('mnist_784', version=1)

# 'maaten' selects a named network architecture bundled with ivis
ivis = Ivis(model='maaten', verbose=0)
embeddings = ivis.fit_transform(mnist.data)

# Colour each embedded point by its digit label
color = mnist.target.astype(int)

plt.figure(figsize=(8, 8), dpi=150)
plt.scatter(x=embeddings[:, 0],
            y=embeddings[:, 1],
            c=color,
            cmap="Spectral",
            s=0.1)
plt.xlabel('ivis 1')
plt.ylabel('ivis 2')
plt.show()

# Clean up the Annoy index ivis writes to the working directory
os.remove('annoy.index')