class DFIvis(BaseEstimator, TransformerMixin):
    """DataFrame-friendly wrapper around the Ivis dimensionality-reduction model.

    NOTE: use DFIvis(embedding_dims=df.shape[1]) to retain every dimension.
    """

    def __init__(self, columns=None, prefix='ivis_', **kwargs):
        self.columns = columns          # columns to transform; None -> all columns seen in fit()
        self.prefix = prefix            # prefix for the generated output column names
        self.model = Ivis(**kwargs)     # kwargs are forwarded verbatim to Ivis
        self.transform_cols = None      # resolved column list, set by fit()

    def fit(self, X, y=None):
        """Fit the underlying Ivis model on the selected columns of X.

        Fix: resolve the column list into a local instead of overwriting
        ``self.columns`` — mutating __init__ hyper-parameters inside fit()
        breaks sklearn's clone()/get_params() contract.
        """
        columns = X.columns if self.columns is None else self.columns
        # Preserve X's column order while restricting to the requested subset.
        self.transform_cols = [x for x in X.columns if x in columns]
        self.model.fit(X[self.transform_cols].values,
                       y.values if y is not None else y)
        return self

    def transform(self, X):
        """Project X into the learned embedding, returning a new DataFrame."""
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                f"Call 'fit' with appropriate arguments before using this estimator."
            )
        return pd.DataFrame(
            self.model.transform(X[self.transform_cols].values),
            columns=[f'{self.prefix}{i}' for i in range(self.model.embedding_dims)])

    def fit_transform(self, X, y=None):
        """Convenience: fit on X (and optional y), then transform X."""
        return self.fit(X, y).transform(X)
class Ivis(Transformer):
    """
    This transformer scales all the vectors in an
    [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] by means of the Ivis
    algorithm. We're using the implementation found
    [here](https://github.com/beringresearch/ivis).

    Important:
        This language backend might require you to manually install extra
        dependencies unless you installed via either;

        ```
        pip install whatlies[ivis]
        pip install whatlies[all]
        ```

    Arguments:
        n_components: the number of components to create/add
        kwargs: keyword arguments passed to the [Ivis implementation](https://bering-ivis.readthedocs.io/en/latest/hyperparameters.html)

    Usage:

    ```python
    from whatlies.language import GensimLanguage
    from whatlies.transformers import Ivis

    words = ["prince", "princess", "nurse", "doctor", "banker", "man", "woman",
             "cousin", "neice", "king", "queen", "dude", "guy", "gal", "fire",
             "dog", "cat", "mouse", "red", "bluee", "green", "yellow", "water",
             "person", "family", "brother", "sister"]

    lang = GensimLanguage("wordvectors.kv")
    emb = lang[words]
    emb.transform(Ivis(3)).plot_interactive_matrix('ivis_0', 'ivis_1', 'ivis_2')
    ```
    """

    def __init__(self, n_components=2, **kwargs):
        super().__init__()
        self.n_components = n_components
        self.kwargs = kwargs
        # Force-silence the underlying training output regardless of caller input.
        self.kwargs["verbose"] = 0
        self.tfm = IVIS(embedding_dims=self.n_components, **self.kwargs)

    def fit(self, embset):
        """Train the underlying Ivis model on the embedding vectors."""
        names, X = embset.to_names_X()
        self.tfm.fit(X)
        self.is_fitted = True
        return self

    def transform(self, embset):
        """Return a new EmbeddingSet with Ivis-reduced vectors.

        One unit-vector embedding per component ('ivis_0', 'ivis_1', ...) is
        appended so plots can refer to the new axes by name.
        """
        names, X = embset.to_names_X()
        new_vecs = self.tfm.transform(X)
        names_out = names + [f"ivis_{i}" for i in range(self.n_components)]
        vectors_out = np.concatenate([new_vecs, np.eye(self.n_components)])
        new_dict = new_embedding_dict(names_out, vectors_out, embset)
        return EmbeddingSet(new_dict, name=f"{embset.name}.ivis_{self.n_components}()")
def test_1d_supervied_iris_embedding():
    """Supervised 1-D embedding of iris trains without error."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    model = Ivis(epochs=2, embedding_dims=1)
    model.k = 15
    model.batch_size = 16
    model.fit_transform(features, labels)
def test_iris_embedding():
    """Unsupervised embedding with an early-stopping patience trains cleanly."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    model = Ivis(n_epochs_without_progress=5)
    model.k = 15
    model.batch_size = 16
    model.fit_transform(features)
def test_score_samples_unsupervised():
    """An unsupervised Ivis model has no classifier head: score_samples must fail.

    Fix: dropped the unused ``y``, ``embeddings`` and ``y_pred`` locals.
    """
    iris = datasets.load_iris()
    x = iris.data
    ivis_iris = Ivis(k=15, batch_size=16, epochs=2)
    ivis_iris.fit_transform(x)
    # Unsupervised model cannot classify
    with pytest.raises(Exception):
        ivis_iris.score_samples(x)
def test_embeddings_image(log_dir):
    """The TensorBoard embeddings callback writes an 'embeddings' directory."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    n_epochs = 2
    callback = TensorBoardEmbeddingsImage(features, targets, log_dir,
                                          epoch_interval=1)
    model = Ivis(epochs=n_epochs, k=15, batch_size=16, callbacks=[callback])
    model.fit_transform(features)
    assert os.path.exists(os.path.join(log_dir, 'embeddings'))
def test_embeddings_logging(log_dir):
    """EmbeddingsLogging writes a loadable .npy file for the final epoch."""
    iris = datasets.load_iris()
    features = iris.data
    filename = 'embeddings_{}.npy'
    n_epochs = 2
    callback = EmbeddingsLogging(features, log_dir, filename, epoch_interval=1)
    model = Ivis(epochs=n_epochs, k=15, batch_size=16, callbacks=[callback])
    model.fit_transform(features)
    # Loading succeeds only if the callback actually wrote the file.
    np.load(os.path.join(log_dir, filename.format(n_epochs)))
def test_multidimensional_inputs():
    """Ivis accepts image-shaped (N, H, W, C) inputs given a conv base model."""
    sample_data = np.ones(shape=(32, 8, 8, 3))
    base_model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(4, 3, input_shape=(8, 8, 3)),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.GlobalAveragePooling2D(),
    ])
    Ivis(model=base_model, epochs=5, k=4, batch_size=4).fit_transform(sample_data)
def test_correctly_indexed_classificaton_classes():
    """Consecutively indexed class labels train a supervised model cleanly."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    model = Ivis(k=15, batch_size=16, epochs=2,
                 supervision_metric='sparse_categorical_crossentropy')
    model.fit_transform(features, labels)
def test_iris_embedding():
    """Semi-supervised run: half the labels are masked out as -1 (unlabeled)."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    unlabeled = np.random.choice(range(len(labels)), size=len(labels) // 2,
                                 replace=False)
    labels[unlabeled] = -1
    model = Ivis(epochs=5)
    model.k = 15
    model.batch_size = 16
    model.fit_transform(features, labels)
def test_embeddings_image(log_dir):
    """The EmbeddingsImage callback writes a PNG for the final epoch."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    filename = 'embeddings_{}.png'
    n_epochs = 2
    callback = EmbeddingsImage(features, targets, log_dir, filename,
                               epoch_interval=1)
    model = Ivis(epochs=n_epochs, k=15, batch_size=16, callbacks=[callback])
    model.fit_transform(features)
    assert os.path.exists(os.path.join(log_dir, filename.format(n_epochs)))
def test_custom_ndarray_neighbour_matrix():
    """A user-supplied neighbour matrix (all same-class row indices) is accepted."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    # Map every label to the row indices belonging to that class.
    rows_per_class = {label: np.argwhere(labels == label).ravel()
                      for label in np.unique(labels)}
    neighbour_matrix = np.array([rows_per_class[label] for label in labels])
    model = Ivis(epochs=5, neighbour_matrix=neighbour_matrix)
    model.k = 15
    model.batch_size = 16
    model.fit_transform(features)
def test_invalid_metric():
    """An unknown supervision-metric name must be rejected at fit time."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    model = Ivis(k=15, batch_size=16, epochs=2,
                 supervision_metric='invalid_loss_function')
    # Loss function not specified
    with pytest.raises(ValueError):
        model.fit_transform(features, labels)
def test_save_overwriting(model_filepath):
    """save_model refuses to clobber an existing folder unless overwrite=True."""
    iris = datasets.load_iris()
    model = Ivis(k=15, batch_size=16, epochs=2)
    model.fit(iris.data)
    model.save_model(model_filepath)
    # Check that trying to save over an existing folder raises an Exception
    with pytest.raises(FileExistsError) as exception_info:
        model.save_model(model_filepath)
    assert isinstance(exception_info.value, FileExistsError)
    # Check that can overwrite existing model if requested
    model.save_model(model_filepath, overwrite=True)
def _unsupervised_model_save_test(model_filepath, save_fn, load_fn):
    """Round-trip an unsupervised model through save_fn/load_fn and compare."""
    iris = datasets.load_iris()
    features = iris.data
    model = Ivis(k=15, batch_size=16, epochs=2)
    model.fit(features)
    save_fn(model, model_filepath)
    restored = load_fn(model_filepath)
    # The reloaded model must produce identical embeddings...
    assert np.all(model.transform(features) == restored.transform(features))
    _validate_network_equality(model, restored)
    # ...and must still be trainable afterwards.
    restored.fit_transform(features)
def test_model_checkpoint(log_dir):
    """A mid-training checkpoint can be reloaded and training continued."""
    iris = datasets.load_iris()
    features = iris.data
    filename = 'model-checkpoint_{}.ivis'
    n_epochs = 2
    checkpoint = ModelCheckpoint(log_dir, filename, epoch_interval=1)
    model = Ivis(epochs=n_epochs, k=15, batch_size=16, callbacks=[checkpoint])
    model.fit_transform(features)
    # Test continuing training from the saved checkpoint.
    restored = Ivis()
    restored.load_model(os.path.join(log_dir, filename.format(n_epochs)))
    restored.fit_transform(features)
def test_non_consecutive_indexed_classificaton_classes():
    """Non-consecutive class indices must be rejected for sparse crossentropy."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    # Shift the top class up by one so label indices are non-consecutive.
    labels[labels == max(labels)] = max(labels) + 1
    model = Ivis(k=15, batch_size=16, epochs=2,
                 supervision_metric='sparse_categorical_crossentropy')
    with pytest.raises(ValueError):
        model.fit_transform(features, labels)
def test_correctly_indexed_semi_supervised_classificaton_classes():
    """Semi-supervised training with -1 as the 'unlabeled' marker succeeds."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    # Mark half of the points as unlabeled.
    unlabeled = np.random.choice(range(len(labels)), size=len(labels) // 2,
                                 replace=False)
    labels[unlabeled] = -1
    model = Ivis(k=15, batch_size=16, epochs=5,
                 supervision_metric='sparse_categorical_crossentropy')
    model.fit_transform(features, labels)
class IvisWrapper():
    """Thin adapter exposing Ivis through a minimal fit_transform interface."""

    def __init__(self, dims):
        # NOTE(review): `self.k` stores the target embedding dimensionality,
        # while Ivis's own `k` (hard-coded to 8 below) is the neighbour
        # count — the overloaded name is confusing; confirm with callers.
        self.k = dims

    def fit_transform(self, data):
        self.model = Ivis(embedding_dims=self.k, k=8)
        embedded = self.model.fit_transform(data)
        os.remove(self.model.annoy_index_path)  # necessary cleanup
        return embedded
def get_model(self):
    """Return the layout model selected by ``self.layout``.

    Every branch yields an object exposing ``fit_transform(X)`` so callers
    can treat all layouts uniformly. An unsupported layout name only prints
    a warning and implicitly returns ``None``.
    """
    if self.layout == 'umap':
        return UMAP(
            n_components=self.params.get('n_components'),
            verbose=self.params.get('verbose'),
            n_neighbors=self.params.get('umap_n_neighbors'),
            min_dist=self.params.get('umap_min_dist'),
        )
    elif self.layout == 'tsne':
        # Prefer the multicore implementation when it was importable.
        if multicore_tsne:
            return MulticoreTSNE()
        return TSNE(
            n_components=self.params.get('n_components'),
            verbose=self.params.get('verbose'),
        )
    elif self.layout == 'ivis':
        return Ivis(
            model=self.params.get('ivis_model'),
            embedding_dims=self.params.get('n_components'),
            k=self.params.get('ivis_k'),
            verbose=self.params.get('verbose'),
            n_epochs_without_progress=10,
        )
    elif self.layout == 'grid':
        # monkeypatch fit_transform method into rasterfairy for consistent api
        def fit_transform(X):
            # Only the first two coordinates are snapped onto the grid.
            return rasterfairy.transformPointCloud2D(X[:, :2])[0]
        clf = rasterfairy
        setattr(clf, 'fit_transform', fit_transform)
        return clf
    elif self.layout == 'img':
        class ImgLayout:
            # Lays points out on vertices sampled from an image file.
            def __init__(self, img_path):
                self.img_path = img_path

            def fit_transform(self, X):
                verts = ImgParser(self.img_path).get_n_vertices(X.shape[0])
                # reorder vertices to get row major distribution
                verts = np.array([verts[:, 1], 1 - verts[:, 0]]).T
                return verts
        return ImgLayout(self.params.get('img_file'))
    elif self.layout == 'obj':
        class ObjLayout:
            # Lays points out on vertices sampled from a 3D .obj mesh.
            def __init__(self, obj_path):
                self.obj_path = obj_path

            def fit_transform(self, X):
                return ObjParser(self.obj_path).get_n_vertices(X.shape[0])
        return ObjLayout(self.params.get('obj_file'))
    else:
        print(' ! Received request for unsupported layout model', self.layout)
def test_h5_file(h5_filepath):
    """Ivis can train directly on open h5py datasets without precomputation."""
    rows, dims = 258, 32
    create_random_dataset(h5_filepath, rows, dims)
    # Load data
    with h5py.File(h5_filepath, 'r') as f:
        X_train = f['data']
        y_train = f['labels']
        # Train and transform with ivis
        model = Ivis(epochs=5, k=15, batch_size=16,
                     precompute=False, build_index_on_disk=False)
        y_pred = model.fit_transform(X_train, shuffle_mode='batch')
        assert y_pred.shape[0] == len(X_train)
        assert y_pred.shape[1] == model.embedding_dims
def _reduce_dims(self, arg):
    """
    Uses ivis to reduce dimensionality to ``self.embedding_dims``.

    (Docstring fix: the previous text claimed a fixed target of 2, but the
    code has always used ``self.embedding_dims``.)

    :param {Iterable} arg - an array-like object
    :return {np.ndarray} embedded object
    """
    m = arg.shape[0]
    # Scale the neighbourhood size k with the sample count; the final branch
    # clamps k for tiny inputs so it stays below the number of samples.
    if m > 200:
        k = int(0.01 * m)
    elif m > 50:
        k = int(0.1 * m)
    elif m > 10:
        k = int(0.2 * m)
    else:
        k = max(int(0.4 * m), m - 3)
    ivis = Ivis(embedding_dims=self.embedding_dims, k=k, batch_size=2)
    return ivis.fit_transform(arg)
def ivis_reduce(docvecs, label, ivis_model, use_nn, **kwargs):
    """Reduce document vectors to 1-D decision scores via ivis.

    When ``use_nn`` is false the vectors are passed through untouched.
    A missing ``ivis_model`` is trained first — unsupervised when every
    label is -1 (all unlabeled), otherwise supervised with the labels.
    Returns ``(scores, model)``.
    """
    if not use_nn:
        return docvecs, None
    if not ivis_model:
        print(f"Train ivis...")
        ivis_model = Ivis(embedding_dims=1, k=15, model="maaten",
                          n_epochs_without_progress=15, verbose=0,
                          batch_size=128)
        all_unlabeled = (-1 in label.unique()
                         and label.value_counts()[-1] == label.shape[0])
        if all_unlabeled:
            print("No labeled data found.")
            ivis_model = ivis_model.fit(docvecs)
        else:
            ivis_model = ivis_model.fit(docvecs, Y=label.to_numpy())
    dim_reduced_vecs = ivis_model.transform(docvecs)
    decision_scores = dim_reduced_vecs.astype(float)
    return decision_scores, ivis_model
def test_non_consecutive_indexed_semi_supervised_classificaton_classes():
    """Non-consecutive class indices fail even in the semi-supervised case."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    # Make labels non-consecutive indexed
    labels[labels == max(labels)] = max(labels) + 1
    # Mark points as unlabeled
    unlabeled = np.random.choice(range(len(labels)), size=len(labels) // 2,
                                 replace=False)
    labels[unlabeled] = -1
    model = Ivis(k=15, batch_size=16, epochs=5,
                 supervision_metric='sparse_categorical_crossentropy')
    with pytest.raises(ValueError):
        model.fit_transform(features, labels)
def test_h5_file(h5_filepath):
    """Train on an HDF5Matrix slice and transform a held-out slice."""
    rows, dims = 258, 32
    create_random_dataset(h5_filepath, rows, dims)
    # Split the on-disk dataset 20/80 into train/test slices.
    test_index = rows // 5
    X_train = HDF5Matrix(h5_filepath, 'data', start=0, end=test_index)
    y_train = HDF5Matrix(h5_filepath, 'labels', start=0, end=test_index)
    X_test = HDF5Matrix(h5_filepath, 'data', start=test_index, end=rows)
    y_test = HDF5Matrix(h5_filepath, 'labels', start=test_index, end=rows)
    # Train and transform with ivis
    model = Ivis(epochs=5, k=15, batch_size=16)
    model.fit_transform(X_train, shuffle_mode='batch')
    y_pred = model.transform(X_test)
    assert y_pred.shape[0] == len(X_test)
    assert y_pred.shape[1] == model.embedding_dims
def _custom_model_saving(model_filepath, save_fn, load_fn):
    """Round-trip a supervised Ivis model built on a custom Keras base network."""
    iris = datasets.load_iris()
    features, targets = iris.data, iris.target
    # Create a custom model
    inputs = tf.keras.layers.Input(shape=(features.shape[-1], ))
    hidden = tf.keras.layers.Dense(8, activation='relu')(inputs)
    custom_model = tf.keras.Model(inputs, hidden)
    model = Ivis(k=15, batch_size=16, epochs=2, model=custom_model)
    model.fit(features, targets)
    save_fn(model, model_filepath)
    restored = load_fn(model_filepath)
    # Embeddings and supervised predictions must survive the round trip.
    assert np.all(model.transform(features) == restored.transform(features))
    assert np.all(model.score_samples(features) == restored.score_samples(features))
    _validate_network_equality(model, restored)
    # The reloaded model must still be trainable.
    restored.fit_transform(features, targets)
def test_score_samples():
    """score_samples returns valid softmax class probabilities."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    metric = 'sparse_categorical_crossentropy'
    model = Ivis(k=15, batch_size=16, epochs=5, supervision_metric=metric)
    model.fit_transform(features, labels)
    probabilities = model.score_samples(features)
    # Softmax probabilities add to one, correct shape
    assert np.sum(probabilities, axis=-1) == pytest.approx(1, 0.01)
    assert probabilities.shape[0] == features.shape[0]
    assert probabilities.shape[1] == len(np.unique(labels))
    # Check that loss function and activation are correct
    assert model.model_.loss['supervised'] == metric
    assert model.model_.layers[-1].activation.__name__ == 'softmax'
def test_svm_score_samples():
    """categorical_hinge supervision builds a linear, regularized output head."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    metric = 'categorical_hinge'
    model = Ivis(k=15, batch_size=16, epochs=2, supervision_metric=metric)
    # Correctly formatted one-hot labels train successfully
    labels = to_categorical(labels)
    model.fit_transform(features, labels)
    model.score_samples(features)
    loss_name = model.model_.loss['supervised'].__name__
    assert losses.get(loss_name).__name__ == losses.get(metric).__name__
    assert model.model_.layers[-1].activation.__name__ == 'linear'
    assert model.model_.layers[-1].kernel_regularizer is not None
    assert model.model_.layers[-1].output_shape[-1] == labels.shape[-1]
def test_svm_score_samples():
    """SVM-style supervision rejects sparse labels but accepts {-1, 1} one-hots."""
    iris = datasets.load_iris()
    features, labels = iris.data, iris.target
    metric = 'categorical_hinge'
    model = Ivis(k=15, batch_size=16, epochs=5, supervision_metric=metric)
    # Incorrectly formatted labels from SVM
    with pytest.raises(ValueError):
        model.fit_transform(features, labels)
    # Correctly formatted labels train successfully
    labels = to_categorical(labels) * 2 - 1
    model.fit_transform(features, labels)
    model.score_samples(features)
    assert model.model_.loss['supervised'] == metric
    assert model.model_.layers[-1].activation.__name__ == 'linear'
    assert model.model_.layers[-1].kernel_regularizer is not None
    assert model.model_.layers[-1].output_shape[-1] == labels.shape[-1]
def on_epoch_begin(self, model):
    """Evaluate the in-training embedding model on every test set and log scores.

    NOTE(review): despite the hook name, the banner prints "End of epoch" —
    the name and the message disagree; confirm which epoch boundary this
    actually runs on. Relies on module-level ``test_data``, ``contamination``,
    ``reject_outliers`` and ``get_scores``.
    """
    print(
        f"\n----------------\n\nEnd of epoch {self.epoch}. Getting scores..."
    )
    scores = defaultdict(list)
    scores["epoch"] = self.epoch
    for df, seed in test_data:
        print(f"Vectorize...")
        # Tokenize each document, then infer a vector from the current model.
        docvecs = df["text"].progress_apply(lambda x: simple_preprocess(x))
        docvecs = docvecs.progress_apply(lambda x: model.infer_vector(x))
        print(f"Reduce dimensions...")
        dim_reducer = UMAP(metric="cosine", set_op_mix_ratio=1.0,
                           n_components=256, random_state=42)
        dim_reduced_vecs = dim_reducer.fit_transform(list(docvecs))
        print(f"Run ivis...")
        # Second-stage reduction to a single decision-score dimension.
        dim_reducer = Ivis(embedding_dims=1, k=15, model="maaten",
                           n_epochs_without_progress=10, verbose=0)
        decision_scores = dim_reducer.fit_transform(dim_reduced_vecs)
        decision_scores = decision_scores.astype(float)
        print(f"Get and save scores...")
        # Scores outside the inter-quantile range are flagged as outliers (-1).
        preds = reject_outliers(decision_scores, iq_range=1.0 - contamination)
        preds = [-1 if x else 1 for x in preds]
        scores = get_scores(scores, df["outlier_label"], preds)
        scores["seed"] = seed
        print(
            f"Scores for epoch {self.epoch} | seed - {seed}:\n{pd.DataFrame(scores, index=[0])}"
        )
        # Persist incrementally so a crash mid-run keeps earlier results.
        self.result_df = self.result_df.append(scores, ignore_index=True)
        self.result_df.to_csv(self.log_path, sep="\t")
    self.epoch += 1