Example #1
from umap.umap_ import make_epochs_per_sample


def get_graph_elements(graph_, n_epochs):
    # TODO: should we remove redundant edges here?
    # graph_ = remove_redundant_edges(graph_)

    graph = graph_.tocoo()
    # eliminate duplicate entries by summing them together
    graph.sum_duplicates()
    # number of vertices in dataset
    n_vertices = graph.shape[1]
    # get the number of epochs based on the size of the dataset
    if n_epochs is None:
        # For smaller datasets we can use more epochs
        if graph.shape[0] <= 10000:
            n_epochs = 500
        else:
            n_epochs = 200
    # remove elements with very low probability
    graph.data[graph.data < (graph.data.max() / float(n_epochs))] = 0.0
    graph.eliminate_zeros()
    # get epochs per sample based upon edge probability
    epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs)

    head = graph.row
    tail = graph.col
    weight = graph.data

    return graph, epochs_per_sample, head, tail, weight, n_vertices
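
A minimal usage sketch, assuming a fitted umap.UMAP model whose graph_ attribute holds the fuzzy simplicial set; the driver code below is illustrative, not part of the original snippet:

import umap
from sklearn.datasets import load_digits

# hypothetical driver: fit UMAP on a small dataset, then unpack its graph
reducer = umap.UMAP(random_state=42).fit(load_digits().data)
graph, epochs_per_sample, head, tail, weight, n_vertices = get_graph_elements(
    reducer.graph_, n_epochs=None)  # None selects 500 epochs for small data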
Example #2
from umap.umap_ import make_epochs_per_sample


def get_graph_elements(graph_, n_epochs):
    """
    Get the graph elements, edge weights, and number of epochs per edge.

    Parameters
    ----------
    graph_ : scipy.sparse.csr.csr_matrix
        umap graph of probabilities
    n_epochs : int
        maximum number of epochs per edge

    Returns
    -------
    graph : scipy.sparse.coo.coo_matrix
        umap graph converted to COO format
    epochs_per_sample : np.array
        number of epochs to train each sample for
    head : np.array
        edge head
    tail : np.array
        edge tail
    weight : np.array
        edge weight
    n_vertices : int
        number of vertices in graph
    """
    # TODO: should we remove redundant edges here?
    # graph_ = remove_redundant_edges(graph_)

    graph = graph_.tocoo()
    # eliminate duplicate entries by summing them together
    graph.sum_duplicates()
    # number of vertices in dataset
    n_vertices = graph.shape[1]
    # get the number of epochs based on the size of the dataset
    if n_epochs is None:
        # For smaller datasets we can use more epochs
        if graph.shape[0] <= 10000:
            n_epochs = 500
        else:
            n_epochs = 200
    # remove elements with very low probability
    graph.data[graph.data < (graph.data.max() / float(n_epochs))] = 0.0
    graph.eliminate_zeros()
    # get epochs per sample based upon edge probability
    epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs)

    head = graph.row
    tail = graph.col
    weight = graph.data

    return graph, epochs_per_sample, head, tail, weight, n_vertices
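
For context, make_epochs_per_sample spaces edge updates so that the strongest edge is sampled every epoch and weaker edges proportionally less often. A sketch of that computation (my paraphrase of the umap.umap_ helper, assuming weights is a 1-D float array):

import numpy as np

def epochs_per_sample_sketch(weights, n_epochs):
    # -1.0 marks edges that are never sampled; every other edge gets the
    # number of epochs to wait between samples, i.e. n_epochs / n_samples
    result = -1.0 * np.ones(weights.shape[0], dtype=np.float64)
    n_samples = n_epochs * (weights / weights.max())
    result[n_samples > 0] = float(n_epochs) / n_samples[n_samples > 0]
    return result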
Example #3
def simplicial_set_embedding(g, embedding, n_epochs, a, b, random_seed, gamma,
                             initial_alpha, negative_sample_rate, parallel,
                             nthreads):
    import numba
    from threadpoolctl import threadpool_limits
    from umap.umap_ import make_epochs_per_sample
    from umap.layouts import optimize_layout_euclidean
    from sklearn.utils import check_random_state
    import numpy as np

    # prune edges too weak to be sampled even once in n_epochs
    g.data[g.data < (g.data.max() / float(n_epochs))] = 0.0
    g.eliminate_zeros()
    epochs_per_sample = make_epochs_per_sample(g.data, n_epochs)
    head = g.row
    tail = g.col
    rng_state = check_random_state(random_seed).randint(
        np.iinfo(np.int32).min + 1,
        np.iinfo(np.int32).max - 1, 3).astype(np.int64)
    # threadpool_limits doesn't play well with numba, so use numba's
    # set_num_threads to limit the thread count instead
    if numba.config.NUMBA_NUM_THREADS > nthreads:
        numba.set_num_threads(nthreads)
    with threadpool_limits(limits=nthreads):
        embedding = optimize_layout_euclidean(embedding,
                                              embedding,
                                              head,
                                              tail,
                                              n_epochs,
                                              g.shape[1],
                                              epochs_per_sample,
                                              a,
                                              b,
                                              rng_state,
                                              gamma,
                                              initial_alpha,
                                              negative_sample_rate,
                                              parallel=parallel,
                                              verbose=True)
    return embedding
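
A hedged call sketch; find_ab_params is the real umap helper that derives a and b from spread and min_dist, while reducer (a fitted umap.UMAP) and init (a C-contiguous float32 initial embedding) are assumed to exist:

from umap.umap_ import find_ab_params

a, b = find_ab_params(spread=1.0, min_dist=0.1)
emb = simplicial_set_embedding(reducer.graph_.tocoo(), init, n_epochs=200,
                               a=a, b=b, random_seed=42, gamma=1.0,
                               initial_alpha=1.0, negative_sample_rate=5,
                               parallel=False, nthreads=4)

Passing graph_.tocoo() matters here: the function reads g.row and g.col, which only COO matrices provide.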
Example #4
    def transform(self):
        # the input data is taken to be a precomputed affinity graph
        self.graph_ = self.data

        epochs_per_sample = umaplib.make_epochs_per_sample(
            self.graph_.data, self.n_iter)

        self.graph_ = self.graph_.tocoo()
        n_vertices = self.graph_.shape[1]
        head = self.graph_.row
        tail = self.graph_.col

        saver = TSNESaveEmbedding(self.outdir)
        rng_state = self.random_state.randint(-(2**31) + 1, 2**31 - 1,
                                              3).astype(np.int64)

        self.data_ = optimize_layout_euclidean(
            self.init,
            self.init,
            head,
            tail,
            self.n_iter,
            n_vertices,
            epochs_per_sample,
            self.a,
            self.b,
            rng_state,
            gamma=float(self.gamma),
            initial_alpha=self.learning_rate,
            eps=self.eps,
            negative_sample_rate=self.nu,
            parallel=self.parallel,
            verbose=False,
            saver=saver,
            save_freq=self.save_freq,
        )
        return self.data_
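
The three-element int64 rng_state above follows umap's internal xorshift RNG convention; a standalone sketch of producing one from a fixed seed (plain NumPy, nothing project-specific):

import numpy as np

rng_state = np.random.RandomState(42).randint(-(2**31) + 1, 2**31 - 1,
                                              3).astype(np.int64)

Note that the eps, saver, and save_freq keywords are not accepted by stock umap.layouts.optimize_layout_euclidean, so this snippet appears to target a patched variant that checkpoints the embedding during training.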
Example #5
    def update(self, X, y=None, **fit_params):
        if "relations" not in fit_params:
            raise ValueError(
                "Aligned UMAP requires relations between data to be "
                "specified")

        new_dict_relations = fit_params["relations"]
        X = check_array(X)

        self.__dict__ = set_aligned_params(fit_params, self.__dict__,
                                           self.n_models_)
        self.n_models_ += 1

        new_mapper = UMAP(
            n_neighbors=get_nth_item_or_val(self.n_neighbors, self.n_models_),
            min_dist=get_nth_item_or_val(self.min_dist, self.n_models_),
            n_epochs=get_nth_item_or_val(self.n_epochs, self.n_models_),
            repulsion_strength=get_nth_item_or_val(self.repulsion_strength,
                                                   self.n_models_),
            learning_rate=get_nth_item_or_val(self.learning_rate,
                                              self.n_models_),
            spread=get_nth_item_or_val(self.spread, self.n_models_),
            negative_sample_rate=get_nth_item_or_val(self.negative_sample_rate,
                                                     self.n_models_),
            local_connectivity=get_nth_item_or_val(self.local_connectivity,
                                                   self.n_models_),
            set_op_mix_ratio=get_nth_item_or_val(self.set_op_mix_ratio,
                                                 self.n_models_),
            unique=get_nth_item_or_val(self.unique, self.n_models_),
        ).fit(X)

        self.mappers_ += [new_mapper]

        # TODO: We can likely make this more efficient and not recompute each time
        self.dict_relations_ += [invert_dict(new_dict_relations)]

        if self.n_epochs is None:
            n_epochs = 200
        else:
            n_epochs = self.n_epochs

        indptr_list = numba.typed.List.empty_list(numba.types.int32[::1])
        indices_list = numba.typed.List.empty_list(numba.types.int32[::1])
        heads = numba.typed.List.empty_list(numba.types.int32[::1])
        tails = numba.typed.List.empty_list(numba.types.int32[::1])
        epochs_per_samples = numba.typed.List.empty_list(
            numba.types.float64[::1])

        for i, mapper in enumerate(self.mappers_):
            coo = mapper.graph_.tocoo()  # convert once rather than per field
            indptr_list.append(mapper.graph_.indptr)
            indices_list.append(mapper.graph_.indices)
            heads.append(coo.row)
            tails.append(coo.col)
            if i == len(self.mappers_) - 1:
                # only the newest model is trained this round
                epochs_per_samples.append(
                    make_epochs_per_sample(coo.data, n_epochs))
            else:
                # an interval beyond n_epochs means these edges are never
                # sampled during optimization
                epochs_per_samples.append(
                    np.full(mapper.embedding_.shape[0],
                            n_epochs + 1,
                            dtype=np.float64))

        new_relations = expand_relations(self.dict_relations_)
        new_regularisation_weights = build_neighborhood_similarities(
            indptr_list,
            indices_list,
            new_relations,
        )

        new_embedding = init_from_existing(self.embeddings_[-1],
                                           new_mapper.graph_,
                                           new_dict_relations)

        random_state = check_random_state(self.random_state)
        rng_state = random_state.randint(INT32_MIN, INT32_MAX,
                                         3).astype(np.int64)

        self.embeddings_.append(new_embedding)

        self.embeddings_ = optimize_layout_aligned_euclidean(
            self.embeddings_,
            self.embeddings_,
            heads,
            tails,
            n_epochs,
            epochs_per_samples,
            new_regularisation_weights,
            new_relations,
            rng_state,
            lambda_=self.alignment_regularisation,
        )
Example #6
    def fit(self, X, y=None, **fit_params):
        if "relations" not in fit_params:
            raise ValueError(
                "Aligned UMAP requires relations between data to be "
                "specified")

        self.dict_relations_ = fit_params["relations"]
        assert type(self.dict_relations_) in (list, tuple)
        assert type(X) in (list, tuple, np.ndarray)
        assert (len(X) - 1) == (len(self.dict_relations_))

        # We need n_components to be constant or this won't work
        if type(self.n_components) in (list, tuple, np.ndarray):
            raise ValueError(
                "n_components must be a single integer, and cannot vary")

        self.n_models_ = len(X)

        self.mappers_ = [
            UMAP(
                n_neighbors=get_nth_item_or_val(self.n_neighbors, n),
                min_dist=get_nth_item_or_val(self.min_dist, n),
                n_epochs=get_nth_item_or_val(self.n_epochs, n),
                repulsion_strength=get_nth_item_or_val(self.repulsion_strength,
                                                       n),
                learning_rate=get_nth_item_or_val(self.learning_rate, n),
                spread=get_nth_item_or_val(self.spread, n),
                negative_sample_rate=get_nth_item_or_val(
                    self.negative_sample_rate, n),
                local_connectivity=get_nth_item_or_val(self.local_connectivity,
                                                       n),
                set_op_mix_ratio=get_nth_item_or_val(self.set_op_mix_ratio, n),
                unique=get_nth_item_or_val(self.unique, n),
                n_components=self.n_components,
            ).fit(X[n]) for n in range(self.n_models_)
        ]

        if self.n_epochs is None:
            n_epochs = 200
        else:
            n_epochs = self.n_epochs

        window_size = fit_params.get("window_size", self.alignment_window_size)
        relations = expand_relations(self.dict_relations_, window_size)

        indptr_list = numba.typed.List.empty_list(numba.types.int32[::1])
        indices_list = numba.typed.List.empty_list(numba.types.int32[::1])
        heads = numba.typed.List.empty_list(numba.types.int32[::1])
        tails = numba.typed.List.empty_list(numba.types.int32[::1])
        epochs_per_samples = numba.typed.List.empty_list(
            numba.types.float64[::1])

        for mapper in self.mappers_:
            coo = mapper.graph_.tocoo()  # convert once rather than per field
            indptr_list.append(mapper.graph_.indptr)
            indices_list.append(mapper.graph_.indices)
            heads.append(coo.row)
            tails.append(coo.col)
            epochs_per_samples.append(
                make_epochs_per_sample(coo.data, n_epochs))

        regularisation_weights = build_neighborhood_similarities(
            indptr_list,
            indices_list,
            relations,
        )
        first_init = spectral_layout(
            self.mappers_[0]._raw_data,
            self.mappers_[0].graph_,
            self.n_components,
            np.random,
        )
        expansion = 10.0 / np.abs(first_init).max()
        first_embedding = (first_init * expansion).astype(
            np.float32,
            order="C",
        )

        embeddings = numba.typed.List.empty_list(numba.types.float32[:, ::1])
        embeddings.append(first_embedding)
        for i in range(1, self.n_models_):
            next_init = spectral_layout(
                self.mappers_[i]._raw_data,
                self.mappers_[i].graph_,
                self.n_components,
                np.random,
            )
            expansion = 10.0 / np.abs(next_init).max()
            next_embedding = (next_init * expansion).astype(
                np.float32,
                order="C",
            )
            anchor_data = relations[i][window_size - 1]
            left_anchors = anchor_data[anchor_data >= 0]
            right_anchors = np.where(anchor_data >= 0)[0]
            embeddings.append(
                procrustes_align(
                    embeddings[-1],
                    next_embedding,
                    np.vstack([left_anchors, right_anchors]),
                ))

        random_state = check_random_state(self.random_state)
        rng_state = random_state.randint(INT32_MIN, INT32_MAX,
                                         3).astype(np.int64)

        self.embeddings_ = optimize_layout_aligned_euclidean(
            embeddings,
            embeddings,
            heads,
            tails,
            n_epochs,
            epochs_per_samples,
            regularisation_weights,
            relations,
            rng_state,
            lambda_=self.alignment_regularisation,
        )

        return self
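
A hedged end-to-end sketch using the public umap.AlignedUMAP entry point on random toy slices; note that fit expects len(X) - 1 relation dicts, matching the assertion above:

import numpy as np
from umap import AlignedUMAP

slices = [np.random.rand(200, 10) for _ in range(3)]
relations = [{i: i for i in range(200)} for _ in range(2)]  # one per adjacent pair
mapper = AlignedUMAP(n_neighbors=15).fit(slices, relations=relations)
embeddings = mapper.embeddings_  # one aligned embedding per slice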
Example #7
def simplicial_set_embedding(
    g,
    embedding,
    n_epochs,
    a,
    b,
    random_seed,
    gamma,
    initial_alpha,
    negative_sample_rate,
    densmap_kwds,
    parallel,
    nthreads,
    verbose,
):
    import numba
    from threadpoolctl import threadpool_limits
    from umap.umap_ import make_epochs_per_sample
    from umap.layouts import optimize_layout_euclidean
    from sklearn.utils import check_random_state
    import numpy as np
    from .utils import tqdm_params

    # g.data[g.data < (g.data.max() / float(n_epochs))] = 0.0
    # g.eliminate_zeros()
    epochs_per_sample = make_epochs_per_sample(g.data, n_epochs)
    logger.trace("calculated epochs_per_sample")
    rng_state = (check_random_state(random_seed).randint(
        np.iinfo(np.int32).min + 1,
        np.iinfo(np.int32).max - 1, 3).astype(np.int64))
    # threadpool_limits doesn't play well with numba, so use numba's
    # set_num_threads to limit the thread count instead
    if numba.config.NUMBA_NUM_THREADS > nthreads:
        numba.set_num_threads(nthreads)

    if densmap_kwds != {}:
        with threadpool_limits(limits=nthreads):
            mu_sum, R = calc_dens_map_params(g, densmap_kwds["knn_dists"])
        densmap_kwds["mu_sum"] = mu_sum
        densmap_kwds["R"] = R
        densmap_kwds["mu"] = g.data
        densmap = True
        logger.trace("calculated densmap params")
    else:
        densmap = False

    # tqdm will be activated once https://github.com/lmcinnes/umap/pull/739
    # is merged and released
    tqdm_params = dict(tqdm_params)
    tqdm_params["desc"] = "Training UMAP"

    with threadpool_limits(limits=nthreads):
        embedding = optimize_layout_euclidean(
            head_embedding=embedding,
            tail_embedding=embedding,
            head=g.row,
            tail=g.col,
            n_epochs=n_epochs,
            n_vertices=g.shape[1],
            epochs_per_sample=epochs_per_sample,
            a=a,
            b=b,
            rng_state=rng_state,
            gamma=gamma,
            initial_alpha=initial_alpha,
            negative_sample_rate=negative_sample_rate,
            parallel=parallel,
            verbose=verbose,
            densmap=densmap,
            densmap_kwds=densmap_kwds,
            tqdm_kwds=tqdm_params,
            move_other=True,
        )
    return embedding
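
A hedged call sketch with densMAP disabled (an empty densmap_kwds takes the else branch above); g must be in COO format since the call reads g.row and g.col, and reducer, init, and the module-level logger are assumed to exist:

from umap.umap_ import find_ab_params

a, b = find_ab_params(spread=1.0, min_dist=0.1)
emb = simplicial_set_embedding(g=reducer.graph_.tocoo(), embedding=init,
                               n_epochs=200, a=a, b=b, random_seed=0,
                               gamma=1.0, initial_alpha=1.0,
                               negative_sample_rate=5, densmap_kwds={},
                               parallel=False, nthreads=4, verbose=True)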