Esempio n. 1
0
class DFIvis(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that applies Ivis dimensionality reduction
    to a subset of a DataFrame's columns and returns a new DataFrame of
    embedding columns.

    Parameters
    ----------
    columns : list-like or None
        Columns to reduce; ``None`` (default) means every column seen in
        ``fit``.
    prefix : str
        Prefix for the generated embedding column names.
    **kwargs
        Forwarded verbatim to the underlying ``Ivis`` model.

    NOTE:
    - DFIvis(embedding_dims=df.shape[1]) to remain every dimensions
    """

    def __init__(self, columns=None, prefix='ivis_', **kwargs):
        self.columns = columns
        self.prefix = prefix
        self.model = Ivis(**kwargs)
        self.transform_cols = None

    def fit(self, X, y=None):
        """Fit the Ivis model on the selected columns of X (optionally
        supervised by y). Returns self."""
        # Default to every column of X; keep only requested columns that
        # actually exist, preserving X's column order.
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols].values,
                       y.values if y is not None else y)

        return self

    def transform(self, X):
        """Return a DataFrame of Ivis embeddings aligned with X's index.

        Raises NotFittedError when called before ``fit``.
        """
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        # Bug fix: preserve X's index so the embedding rows align with the
        # input (the previous version silently produced a fresh RangeIndex,
        # breaking joins/concats against the original frame).
        new_X = pd.DataFrame(
            self.model.transform(X[self.transform_cols].values),
            index=X.index,
            columns=[
                f'{self.prefix}{x}' for x in range(self.model.embedding_dims)
            ])

        return new_X

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X, y).transform(X)
Esempio n. 2
0
class Ivis(Transformer):
    """
    This transformer scales all the vectors in an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet]
    by means of the Ivis algorithm, using the implementation found
    [here](https://github.com/beringresearch/ivis).

    Important:
        This language backend might require you to manually install extra dependencies
        unless you installed via either;

        ```
        pip install whatlies[ivis]
        pip install whatlies[all]
        ```

    Arguments:
        n_components: the number of compoments to create/add
        kwargs: keyword arguments passed to the [Ivis implementation](https://bering-ivis.readthedocs.io/en/latest/hyperparameters.html)

    Usage:

    ```python
    from whatlies.language import GensimLanguage
    from whatlies.transformers import Ivis

    words = ["prince", "princess", "nurse", "doctor", "banker", "man", "woman",
             "cousin", "neice", "king", "queen", "dude", "guy", "gal", "fire",
             "dog", "cat", "mouse", "red", "bluee", "green", "yellow", "water",
             "person", "family", "brother", "sister"]

    lang = GensimLanguage("wordvectors.kv")
    emb = lang[words]
    emb.transform(Ivis(3)).plot_interactive_matrix('ivis_0', 'ivis_1', 'ivis_2')
    ```
    """

    def __init__(self, n_components=2, **kwargs):
        super().__init__()
        self.n_components = n_components
        self.kwargs = kwargs
        # Silence the underlying ivis implementation regardless of caller input.
        self.kwargs["verbose"] = 0
        self.tfm = IVIS(embedding_dims=self.n_components, **self.kwargs)

    def fit(self, embset):
        # Only the raw vectors are needed for fitting; names are discarded.
        _, vectors = embset.to_names_X()
        self.tfm.fit(vectors)
        self.is_fitted = True
        return self

    def transform(self, embset):
        names, vectors = embset.to_names_X()
        # One synthetic axis embedding (unit vector) per output dimension.
        axis_names = [f"ivis_{i}" for i in range(self.n_components)]
        reduced = self.tfm.transform(vectors)
        stacked = np.concatenate([reduced, np.eye(self.n_components)])
        embeddings = new_embedding_dict(names + axis_names, stacked, embset)
        return EmbeddingSet(embeddings,
                            name=f"{embset.name}.ivis_{self.n_components}()")
Esempio n. 3
0
def test_1d_supervied_iris_embedding():
    """Supervised ivis should fit a 1-d embedding on iris without error."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    model = Ivis(epochs=2, embedding_dims=1)
    model.k = 15
    model.batch_size = 16

    model.fit_transform(features, labels)
Esempio n. 4
0
def test_iris_embedding():
    """Unsupervised ivis should train on iris without error."""
    data = datasets.load_iris()
    features = data.data
    labels = data.target

    model = Ivis(n_epochs_without_progress=5)
    model.k = 15
    model.batch_size = 16

    model.fit_transform(features)
Esempio n. 5
0
def test_score_samples_unsupervised():
    """score_samples must fail on a model trained without labels."""
    data = datasets.load_iris()
    features = data.data
    labels = data.target

    model = Ivis(k=15, batch_size=16, epochs=2)
    model.fit_transform(features)

    # An unsupervised model has no classification head to score with.
    with pytest.raises(Exception):
        model.score_samples(features)
Esempio n. 6
0
def test_embeddings_image(log_dir):
    """TensorBoardEmbeddingsImage should create an 'embeddings' folder."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    callback = TensorBoardEmbeddingsImage(features, labels, log_dir,
                                          epoch_interval=1)
    model = Ivis(epochs=2, k=15, batch_size=16, callbacks=[callback])

    model.fit_transform(features)
    assert os.path.exists(os.path.join(log_dir, 'embeddings'))
Esempio n. 7
0
def test_embeddings_logging(log_dir):
    """EmbeddingsLogging should dump a loadable embeddings array per epoch."""
    features = datasets.load_iris().data

    template = 'embeddings_{}.npy'
    epochs = 2
    callback = EmbeddingsLogging(features, log_dir, template, epoch_interval=1)
    model = Ivis(epochs=epochs, k=15, batch_size=16, callbacks=[callback])

    model.fit_transform(features)
    # Loading proves the final epoch's file exists and is a valid .npy dump.
    np.load(os.path.join(log_dir, template.format(epochs)))
Esempio n. 8
0
def test_multidimensional_inputs():
    """Ivis should accept image-shaped inputs via a custom conv base model."""
    inputs = np.ones(shape=(32, 8, 8, 3))

    # Tiny convolutional network that flattens (8, 8, 3) inputs to a vector.
    conv_base = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(4, 3, input_shape=(8, 8, 3)),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.GlobalAveragePooling2D(),
    ])

    Ivis(model=conv_base, epochs=5, k=4, batch_size=4).fit_transform(inputs)
Esempio n. 9
0
def test_correctly_indexed_classificaton_classes():
    """Consecutively-indexed sparse labels should train without error."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    model = Ivis(k=15,
                 batch_size=16,
                 epochs=2,
                 supervision_metric='sparse_categorical_crossentropy')

    model.fit_transform(features, labels)
Esempio n. 10
0
def test_iris_embedding():
    """Training should succeed when half the labels are masked out as -1."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    # Hide half of the labels to simulate semi-supervised input.
    hidden = np.random.choice(range(len(labels)), size=len(labels) // 2,
                              replace=False)
    labels[hidden] = -1

    model = Ivis(epochs=5)
    model.k = 15
    model.batch_size = 16

    model.fit_transform(features, labels)
Esempio n. 11
0
def test_embeddings_image(log_dir):
    """EmbeddingsImage should save an image file per logged epoch."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    template = 'embeddings_{}.png'
    epochs = 2
    callback = EmbeddingsImage(features, labels, log_dir, template,
                               epoch_interval=1)
    model = Ivis(epochs=epochs, k=15, batch_size=16, callbacks=[callback])

    model.fit_transform(features)
    assert os.path.exists(os.path.join(log_dir, template.format(epochs)))
Esempio n. 12
0
def test_custom_ndarray_neighbour_matrix():
    """A user-supplied ndarray neighbour matrix should be accepted by Ivis."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    # For each sample, use every sample of the same class as its neighbours.
    by_class = {c: np.argwhere(labels == c).ravel() for c in np.unique(labels)}
    neighbours = np.array([by_class[c] for c in labels])

    model = Ivis(epochs=5, neighbour_matrix=neighbours)
    model.k = 15
    model.batch_size = 16

    model.fit_transform(features)
Esempio n. 13
0
def test_invalid_metric():
    """An unknown supervision metric should raise when training starts."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    model = Ivis(k=15,
                 batch_size=16,
                 epochs=2,
                 supervision_metric='invalid_loss_function')

    # The bogus loss name is only resolved at fit time, not at construction.
    with pytest.raises(ValueError):
        model.fit_transform(features, labels)
Esempio n. 14
0
def test_save_overwriting(model_filepath):
    """Saving over an existing model folder fails unless overwrite=True.

    Bug fix: the original test asserted on ``exception_info`` *inside* the
    ``with pytest.raises(...)`` block, after the statement that raises —
    that assert was unreachable dead code (and redundant, since
    ``pytest.raises`` already checks the exception type). It is removed.
    """
    model = Ivis(k=15, batch_size=16, epochs=2)
    iris = datasets.load_iris()
    X = iris.data

    model.fit(X)
    model.save_model(model_filepath)

    # Check that trying to save over an existing folder raises an Exception
    with pytest.raises(FileExistsError):
        model.save_model(model_filepath)

    # Check that can overwrite existing model if requested
    model.save_model(model_filepath, overwrite=True)
Esempio n. 15
0
def _unsupervised_model_save_test(model_filepath, save_fn, load_fn):
    """Round-trip an unsupervised model through save_fn/load_fn and verify
    the restored model is equivalent to the original."""
    model = Ivis(k=15, batch_size=16, epochs=2)
    X = datasets.load_iris().data

    model.fit(X)
    save_fn(model, model_filepath)
    restored = load_fn(model_filepath)

    # Identical weights must produce identical embeddings.
    assert np.all(model.transform(X) == restored.transform(X))
    _validate_network_equality(model, restored)

    # The reloaded model must also support continued training.
    restored.fit_transform(X)
Esempio n. 16
0
def test_model_checkpoint(log_dir):
    """ModelCheckpoint should write a checkpoint that loads and keeps training."""
    X = datasets.load_iris().data

    template = 'model-checkpoint_{}.ivis'
    epochs = 2
    checkpoint = ModelCheckpoint(log_dir, template, epoch_interval=1)
    model = Ivis(epochs=epochs, k=15, batch_size=16, callbacks=[checkpoint])

    model.fit_transform(X)

    restored = Ivis()
    restored.load_model(os.path.join(log_dir, template.format(epochs)))

    # A restored checkpoint must support continued training.
    restored.fit_transform(X)
Esempio n. 17
0
def test_non_consecutive_indexed_classificaton_classes():
    """Sparse labels with a gap in the class indices must be rejected."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    # Shift the top class up by one to break consecutive indexing.
    labels[labels == max(labels)] = max(labels) + 1

    model = Ivis(k=15,
                 batch_size=16,
                 epochs=2,
                 supervision_metric='sparse_categorical_crossentropy')

    with pytest.raises(ValueError):
        model.fit_transform(features, labels)
Esempio n. 18
0
def test_correctly_indexed_semi_supervised_classificaton_classes():
    """Semi-supervised training with -1 'unlabeled' markers should succeed."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    # Hide half of the labels to simulate semi-supervision.
    hidden = np.random.choice(range(len(labels)), size=len(labels) // 2,
                              replace=False)
    labels[hidden] = -1

    model = Ivis(k=15,
                 batch_size=16,
                 epochs=5,
                 supervision_metric='sparse_categorical_crossentropy')

    model.fit_transform(features, labels)
class IvisWrapper():
    """Minimal facade over Ivis that exposes only fit_transform."""

    def __init__(self, dims):
        # Stored under the name `k`, but holds the target embedding
        # dimensionality (the underlying model's neighbour count is fixed at 8).
        self.k = dims

    def fit_transform(self, data):
        """Fit a fresh Ivis model on `data` and return the embedding."""
        self.model = Ivis(embedding_dims=self.k, k=8)
        embedded = self.model.fit_transform(data)
        os.remove(self.model.annoy_index_path)  # necessary cleanup
        return embedded
Esempio n. 20
0
    def get_model(self):
        """Return a layout model exposing ``fit_transform(X)`` for ``self.layout``.

        The concrete model is selected by the ``self.layout`` string and
        configured from ``self.params``.  For an unsupported layout name a
        warning is printed and the method implicitly returns ``None`` —
        callers must be prepared for that.
        """
        if self.layout == 'umap':
            return UMAP(
                n_components=self.params.get('n_components'),
                verbose=self.params.get('verbose'),
                n_neighbors=self.params.get('umap_n_neighbors'),
                min_dist=self.params.get('umap_min_dist'),
            )
        elif self.layout == 'tsne':
            # Prefer the multicore t-SNE implementation when it imported
            # successfully; note it is used with default parameters only.
            if multicore_tsne:
                return MulticoreTSNE()
            return TSNE(
                n_components=self.params.get('n_components'),
                verbose=self.params.get('verbose'),
            )
        elif self.layout == 'ivis':
            return Ivis(
                model=self.params.get('ivis_model'),
                embedding_dims=self.params.get('n_components'),
                k=self.params.get('ivis_k'),
                verbose=self.params.get('verbose'),
                n_epochs_without_progress=10,
            )
        elif self.layout == 'grid':
            # monkeypatch fit_transform method into rasterfairy for consistent api
            # NOTE(review): this mutates the imported rasterfairy module
            # globally; subsequent callers see the patched attribute.
            def fit_transform(X):
                # Only the first two coordinates are used for the 2-D grid.
                return rasterfairy.transformPointCloud2D(X[:, :2])[0]

            clf = rasterfairy
            setattr(clf, 'fit_transform', fit_transform)
            return clf
        elif self.layout == 'img':

            class ImgLayout:
                # Lays points out on vertices sampled from an image file.
                def __init__(self, img_path):
                    self.img_path = img_path

                def fit_transform(self, X):
                    # One vertex per input row; X's values are otherwise unused.
                    verts = ImgParser(self.img_path).get_n_vertices(X.shape[0])
                    # reorder vertices to get row major distribution
                    verts = np.array([verts[:, 1], 1 - verts[:, 0]]).T
                    return verts

            return ImgLayout(self.params.get('img_file'))
        elif self.layout == 'obj':

            class ObjLayout:
                # Lays points out on vertices sampled from a 3-D .obj file.
                def __init__(self, obj_path):
                    self.obj_path = obj_path

                def fit_transform(self, X):
                    # One vertex per input row; X's values are otherwise unused.
                    return ObjParser(self.obj_path).get_n_vertices(X.shape[0])

            return ObjLayout(self.params.get('obj_file'))
        else:
            print(' ! Received request for unsupported layout model',
                  self.layout)
Esempio n. 21
0
def test_h5_file(h5_filepath):
    """Ivis should train directly on an open h5py dataset handle."""
    rows, dims = 258, 32
    create_random_dataset(h5_filepath, rows, dims)

    with h5py.File(h5_filepath, 'r') as f:
        X_train = f['data']
        y_train = f['labels']

        # Precomputation and on-disk index building are disabled so the
        # HDF5 dataset can be consumed lazily.
        model = Ivis(epochs=5,
                     k=15,
                     batch_size=16,
                     precompute=False,
                     build_index_on_disk=False)
        embedding = model.fit_transform(X_train, shuffle_mode='batch')

        assert embedding.shape[0] == len(X_train)
        assert embedding.shape[1] == model.embedding_dims
    def _reduce_dims(self, arg):
        """
        Uses ivis to reduce dimensionality to ``self.embedding_dims``.

        The neighbour count ``k`` is scaled down with the sample count so
        that small inputs still satisfy ivis' requirement that k be smaller
        than the number of samples.

        Fixes vs. the original: the docstring wrongly claimed a fixed target
        of 2 dimensions, and for very small inputs (m <= 2) the formula
        produced k == 0, which ivis rejects — k is now floored at 1.

        :param {Iterable} arg - an array-like object of shape (n_samples, n_features)
        :return {np.ndarray} embedded object
        """
        m = arg.shape[0]
        if m > 200:
            k = int(0.01 * m)
        elif m > 50:
            k = int(0.1 * m)
        elif m > 10:
            k = int(0.2 * m)
        else:
            k = max(int(0.4 * m), m - 3)
        # Guard against degenerate inputs where the scaling yields k == 0.
        k = max(k, 1)

        ivis = Ivis(embedding_dims=self.embedding_dims, k=k, batch_size=2)
        return ivis.fit_transform(arg)
Esempio n. 23
0
def ivis_reduce(docvecs, label, ivis_model, use_nn, **kwargs):
    """Reduce document vectors to 1-d decision scores with ivis.

    Trains a new model when none is supplied, falling back to unsupervised
    training when every label is the 'unlabeled' marker (-1).  Returns
    (scores, model) when use_nn is truthy, otherwise (docvecs, None).
    """
    if not use_nn:
        return docvecs, None

    if not ivis_model:
        print(f"Train ivis...")
        ivis_model = Ivis(embedding_dims=1, k=15, model="maaten",
                          n_epochs_without_progress=15, verbose=0,
                          batch_size=128)
        # Fully unlabeled data: every entry carries the -1 marker.
        all_unlabeled = (-1 in label.unique()
                         and label.value_counts()[-1] == label.shape[0])
        if all_unlabeled:
            print("No labeled data found.")
            ivis_model = ivis_model.fit(docvecs)
        else:
            ivis_model = ivis_model.fit(docvecs, Y=label.to_numpy())

    decision_scores = ivis_model.transform(docvecs).astype(float)
    return decision_scores, ivis_model
Esempio n. 24
0
def test_non_consecutive_indexed_semi_supervised_classificaton_classes():
    """Gapped class indices must be rejected even in semi-supervised mode."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    # Break consecutive indexing by shifting the top class up by one.
    labels[labels == max(labels)] = max(labels) + 1

    # Hide half of the labels to simulate semi-supervision.
    hidden = np.random.choice(range(len(labels)), size=len(labels) // 2,
                              replace=False)
    labels[hidden] = -1

    model = Ivis(k=15,
                 batch_size=16,
                 epochs=5,
                 supervision_metric='sparse_categorical_crossentropy')

    with pytest.raises(ValueError):
        model.fit_transform(features, labels)
Esempio n. 25
0
def test_h5_file(h5_filepath):
    """Train on an HDF5-backed train split and transform a held-out split."""
    rows, dims = 258, 32
    create_random_dataset(h5_filepath, rows, dims)

    # First fifth of the rows for training, the remainder for transform.
    split = rows // 5
    X_train = HDF5Matrix(h5_filepath, 'data', start=0, end=split)
    y_train = HDF5Matrix(h5_filepath, 'labels', start=0, end=split)
    X_test = HDF5Matrix(h5_filepath, 'data', start=split, end=rows)
    y_test = HDF5Matrix(h5_filepath, 'labels', start=split, end=rows)

    model = Ivis(epochs=5, k=15, batch_size=16)
    model.fit_transform(X_train, shuffle_mode='batch')

    embedding = model.transform(X_test)
    assert embedding.shape[0] == len(X_test)
    assert embedding.shape[1] == model.embedding_dims
Esempio n. 26
0
def _custom_model_saving(model_filepath, save_fn, load_fn):
    """Round-trip a supervised Ivis model built on a custom keras base
    network through save_fn/load_fn and verify the restored model."""
    data = datasets.load_iris()
    X, Y = data.data, data.target

    # Tiny single-hidden-layer network used as the custom base model.
    inputs = tf.keras.layers.Input(shape=(X.shape[-1], ))
    hidden = tf.keras.layers.Dense(8, activation='relu')(inputs)
    base_network = tf.keras.Model(inputs, hidden)

    model = Ivis(k=15, batch_size=16, epochs=2, model=base_network)
    model.fit(X, Y)

    save_fn(model, model_filepath)
    restored = load_fn(model_filepath)

    # Embeddings and classifier outputs must survive the round trip.
    assert np.all(model.transform(X) == restored.transform(X))
    assert np.all(model.score_samples(X) == restored.score_samples(X))

    _validate_network_equality(model, restored)

    # The restored model must also support continued training.
    restored.fit_transform(X, Y)
Esempio n. 27
0
def test_score_samples():
    """score_samples on a softmax-supervised model yields class probabilities."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    metric = 'sparse_categorical_crossentropy'
    model = Ivis(k=15,
                 batch_size=16,
                 epochs=5,
                 supervision_metric=metric)

    model.fit_transform(features, labels)
    probs = model.score_samples(features)

    # Each row is a softmax distribution: sums to 1, one column per class.
    assert np.sum(probs, axis=-1) == pytest.approx(1, 0.01)
    assert probs.shape[0] == features.shape[0]
    assert probs.shape[1] == len(np.unique(labels))

    # The supervised head uses the requested loss and a softmax activation.
    assert model.model_.loss['supervised'] == metric
    assert model.model_.layers[-1].activation.__name__ == 'softmax'
Esempio n. 28
0
def test_svm_score_samples():
    """categorical_hinge training on one-hot labels sets up an SVM-style head."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    metric = 'categorical_hinge'
    model = Ivis(k=15,
                 batch_size=16,
                 epochs=2,
                 supervision_metric=metric)

    # One-hot labels are the format categorical_hinge trains on successfully.
    one_hot = to_categorical(labels)
    model.fit_transform(features, one_hot)

    model.score_samples(features)

    # The configured loss survives compilation under its canonical name.
    loss_name = model.model_.loss['supervised'].__name__
    assert losses.get(loss_name).__name__ == losses.get(metric).__name__
    # SVM-style output layer: linear activation with kernel regularisation.
    assert model.model_.layers[-1].activation.__name__ == 'linear'
    assert model.model_.layers[-1].kernel_regularizer is not None
    assert model.model_.layers[-1].output_shape[-1] == one_hot.shape[-1]
Esempio n. 29
0
def test_svm_score_samples():
    """categorical_hinge rejects sparse labels but accepts {-1, 1} one-hot."""
    data = datasets.load_iris()
    features, labels = data.data, data.target

    metric = 'categorical_hinge'
    model = Ivis(k=15,
                 batch_size=16,
                 epochs=5,
                 supervision_metric=metric)

    # Integer class labels are the wrong format for a hinge loss.
    with pytest.raises(ValueError):
        model.fit_transform(features, labels)

    # Map one-hot {0, 1} labels onto {-1, 1} as the SVM formulation expects.
    svm_labels = to_categorical(labels) * 2 - 1
    model.fit_transform(features, svm_labels)

    model.score_samples(features)
    assert model.model_.loss['supervised'] == metric
    assert model.model_.layers[-1].activation.__name__ == 'linear'
    assert model.model_.layers[-1].kernel_regularizer is not None
    assert model.model_.layers[-1].output_shape[-1] == svm_labels.shape[-1]
Esempio n. 30
0
    def on_epoch_begin(self, model):
        """Evaluate outlier-detection quality on every test set and log it.

        For each (df, seed) pair in the global ``test_data``: infer document
        vectors with the supplied model, reduce them with UMAP and then ivis
        to one decision score per document, threshold the scores into
        outlier predictions, and append the resulting metrics to
        ``self.result_df`` (persisted to ``self.log_path`` as TSV).

        NOTE(review): despite the hook name, the banner says "End of epoch"
        and ``self.epoch`` is incremented at the bottom — confirm this is
        attached to the intended callback point.

        :param model: object with an ``infer_vector`` method (presumably a
            gensim Doc2Vec model — verify against the caller).
        """
        print(
            f"\n----------------\n\nEnd of epoch {self.epoch}. Getting scores..."
        )
        # `scores` starts as a defaultdict but is replaced wholesale by
        # get_scores() inside the loop; only "epoch"/"seed" are set directly.
        scores = defaultdict(list)
        scores["epoch"] = self.epoch
        for df, seed in test_data:
            print(f"Vectorize...")

            # Tokenize each document, then infer its embedding vector.
            docvecs = df["text"].progress_apply(lambda x: simple_preprocess(x))
            docvecs = docvecs.progress_apply(lambda x: model.infer_vector(x))

            print(f"Reduce dimensions...")
            # Stage 1: UMAP down to 256 dimensions (fixed seed for
            # reproducibility).
            dim_reducer = UMAP(metric="cosine",
                               set_op_mix_ratio=1.0,
                               n_components=256,
                               random_state=42)

            dim_reduced_vecs = dim_reducer.fit_transform(list(docvecs))

            print(f"Run ivis...")
            # Stage 2: ivis down to a single decision score per document.
            dim_reducer = Ivis(embedding_dims=1,
                               k=15,
                               model="maaten",
                               n_epochs_without_progress=10,
                               verbose=0)
            decision_scores = dim_reducer.fit_transform(dim_reduced_vecs)
            decision_scores = decision_scores.astype(float)

            print(f"Get and save scores...")
            # Scores flagged by reject_outliers become outliers (-1),
            # everything else is an inlier (1).
            preds = reject_outliers(decision_scores,
                                    iq_range=1.0 - contamination)
            preds = [-1 if x else 1 for x in preds]

            scores = get_scores(scores, df["outlier_label"], preds)
            scores["seed"] = seed
            print(
                f"Scores for epoch {self.epoch} | seed - {seed}:\n{pd.DataFrame(scores, index=[0])}"
            )

            # Persist after every test set so partial results survive crashes.
            # NOTE(review): DataFrame.append is deprecated in recent pandas;
            # consider pd.concat when this file is next touched.
            self.result_df = self.result_df.append(scores, ignore_index=True)
            self.result_df.to_csv(self.log_path, sep="\t")
        self.epoch += 1