Code Example #1
def test_hellinger(spatial_data):
    hellinger_data = np.abs(spatial_data[:-2].copy())
    hellinger_data = hellinger_data / hellinger_data.sum(axis=1)[:, np.newaxis]
    hellinger_data = np.sqrt(hellinger_data)
    dist_matrix = hellinger_data @ hellinger_data.T
    dist_matrix = 1.0 - dist_matrix
    dist_matrix = np.sqrt(dist_matrix)
    # Correct for nan handling
    dist_matrix[np.isnan(dist_matrix)] = 0.0

    test_matrix = dist.pairwise_special_metric(np.abs(spatial_data[:-2]))

    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match "
        "for metric hellinger",
    )

    # Ensure ll_dirichlet runs
    test_matrix = dist.pairwise_special_metric(np.abs(spatial_data[:-2]),
                                               metric="ll_dirichlet")
    assert (
        test_matrix
        is not None), "Pairwise Special Metric with LL Dirichlet metric failed"
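For reference, the vectorised computation in this test reproduces the per-pair Hellinger distance. A minimal standalone sketch of that formula (the function name here is illustrative, not part of umap's API):

import numpy as np

def hellinger_reference(p, q):
    # Hellinger distance between two non-negative vectors treated as
    # discrete distributions: sqrt(1 - sum_i sqrt(p_i * q_i)) after
    # L1-normalising p and q, matching the row normalisation above
    p = p / p.sum()
    q = q / q.sum()
    # clip guards against tiny negative values from floating-point error
    return np.sqrt(np.clip(1.0 - np.sum(np.sqrt(p * q)), 0.0, None))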
Code Example #2
def test_sparse_hellinger(sparse_spatial_data):
    dist_matrix = dist.pairwise_special_metric(
        np.abs(sparse_spatial_data[:-2].toarray()))
    test_matrix = np.array([[
        spdist.sparse_hellinger(
            np.abs(sparse_spatial_data[i]).indices,
            np.abs(sparse_spatial_data[i]).data,
            np.abs(sparse_spatial_data[j]).indices,
            np.abs(sparse_spatial_data[j]).data,
        ) for j in range(sparse_spatial_data.shape[0] - 2)
    ] for i in range(sparse_spatial_data.shape[0] - 2)])

    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Sparse distances don't match "
        "for metric hellinger",
        decimal=4,
    )

    # Ensure ll_dirichlet runs
    test_matrix = np.array([[
        spdist.sparse_ll_dirichlet(
            sparse_spatial_data[i].indices,
            sparse_spatial_data[i].data,
            sparse_spatial_data[j].indices,
            sparse_spatial_data[j].data,
        ) for j in range(sparse_spatial_data.shape[0])
    ] for i in range(sparse_spatial_data.shape[0])])
    assert (
        test_matrix
        is not None), "Pairwise Special Metric with LL Dirichlet metric failed"
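For reference, a hedged sketch of calling sparse_hellinger directly on two CSR rows, mirroring the calls in the test above (assumes umap-learn's umap.sparse module under the same spdist alias):

import numpy as np
import scipy.sparse
import umap.sparse as spdist

rng = np.random.RandomState(0)
mat = scipy.sparse.random(4, 10, density=0.4, random_state=rng, format="csr")
# each CSR row is passed as its (indices, data) pair, as in the test
d = spdist.sparse_hellinger(
    mat[0].indices, np.abs(mat[0].data),
    mat[1].indices, np.abs(mat[1].data),
)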
Code Example #3
def test_disconnected_data_precomputed(num_isolates, sparse):
    disconnected_data = np.random.choice(a=[False, True],
                                         size=(10, 20),
                                         p=[0.66, 1 - 0.66])
    # Add some disconnected data for the corner case test
    disconnected_data = np.vstack(
        [disconnected_data,
         np.zeros((num_isolates, 20), dtype="bool")])
    new_columns = np.zeros((num_isolates + 10, num_isolates), dtype="bool")
    for i in range(num_isolates):
        new_columns[10 + i, i] = True
    disconnected_data = np.hstack([disconnected_data, new_columns])
    dmat = pairwise_special_metric(disconnected_data)
    if sparse:
        dmat = csr_matrix(dmat)
    model = UMAP(n_neighbors=3, metric="precomputed",
                 disconnection_distance=1).fit(dmat)

    # Check that the first isolate has no edges in our umap.graph_
    isolated_vertices = disconnected_vertices(model)
    assert isolated_vertices[10]
    number_of_nan = np.sum(np.isnan(model.embedding_[isolated_vertices]))
    assert number_of_nan >= num_isolates * model.n_components
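num_isolates and sparse arrive as pytest parameters; a hedged sketch of the kind of parametrisation that would drive this test (the decorator values are an assumption, not taken from the source):

import pytest

@pytest.mark.parametrize("num_isolates", [1, 5])
@pytest.mark.parametrize("sparse", [True, False])
def test_disconnected_data_precomputed(num_isolates, sparse):
    ...  # body as above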
Code Example #4
File: spectral.py Project: jmconroy/umap
def component_layout(
    data,
    n_components,
    component_labels,
    dim,
    random_state,
    metric="euclidean",
    metric_kwds={},
):
    """Provide a layout relating the separate connected components. This is done
    by taking the centroid of each component and then performing a spectral embedding
    of the centroids.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data -- required so we can generate centroids for each
        connected component of the graph.

    n_components: int
        The number of distinct components to be laid out.

    component_labels: array of shape (n_samples)
        For each vertex in the graph the label of the component to
        which the vertex belongs.

    dim: int
        The chosen embedding dimension.

    metric: string or callable (optional, default 'euclidean')
        The metric used to measure distances among the source data points.

    metric_kwds: dict (optional, default {})
        Keyword arguments to be passed to the metric function.
        If metric is 'precomputed', the 'linkage' keyword can be used to specify
        'average', 'complete', or 'single' linkage. Default is 'average'.

    Returns
    -------
    component_embedding: array of shape (n_components, dim)
        The ``dim``-dimensional embedding of the ``n_components``-many
        connected components.
    """

    component_centroids = np.empty((n_components, data.shape[1]),
                                   dtype=np.float64)

    if metric == "precomputed":
        # cannot compute centroids from precomputed distances
        # instead, compute centroid distances using linkage
        distance_matrix = np.zeros((n_components, n_components),
                                   dtype=np.float64)
        linkage = metric_kwds.get("linkage", "average")
        if linkage == "average":
            linkage = np.mean
        elif linkage == "complete":
            linkage = np.max
        elif linkage == "single":
            linkage = np.min
        else:
            raise ValueError("Unrecognized linkage '%s'. Please choose from "
                             "'average', 'complete', or 'single'" % linkage)
        for c_i in range(n_components):
            dm_i = data[component_labels == c_i]
            for c_j in range(c_i + 1, n_components):
                dist = linkage(dm_i[:, component_labels == c_j])
                distance_matrix[c_i, c_j] = dist
                distance_matrix[c_j, c_i] = dist
    else:
        for label in range(n_components):
            component_centroids[label] = data[component_labels == label].mean(
                axis=0)

        if scipy.sparse.isspmatrix(component_centroids):
            warn(
                "Forcing component centroids to dense; if you are running out of "
                "memory then consider increasing n_neighbors.")
            component_centroids = component_centroids.toarray()

        if metric in SPECIAL_METRICS:
            distance_matrix = pairwise_special_metric(component_centroids,
                                                      metric=metric)
        elif metric in SPARSE_SPECIAL_METRICS:
            distance_matrix = pairwise_special_metric(
                component_centroids, metric=SPARSE_SPECIAL_METRICS[metric])
        else:
            if callable(metric) and scipy.sparse.isspmatrix(data):
                function_to_name_mapping = {
                    v: k
                    for k, v in sparse_named_distances.items()
                }
                try:
                    metric_name = function_to_name_mapping[metric]
                except KeyError:
                    raise NotImplementedError(
                        "Multicomponent layout for custom "
                        "sparse metrics is not implemented at "
                        "this time.")
                distance_matrix = pairwise_distances(component_centroids,
                                                     metric=metric_name,
                                                     **metric_kwds)
            else:
                distance_matrix = pairwise_distances(component_centroids,
                                                     metric=metric,
                                                     **metric_kwds)

    affinity_matrix = np.exp(-(distance_matrix**2))

    component_embedding = SpectralEmbedding(
        n_components=dim, affinity="precomputed",
        random_state=random_state).fit_transform(affinity_matrix)
    component_embedding /= component_embedding.max()

    return component_embedding
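A hedged toy invocation of the 'precomputed' branch above, with three singleton components so the linkage reduction is trivial (all values are illustrative):

import numpy as np

# symmetric precomputed distances for three single-point components
dmat = np.array([[0.0, 2.0, 4.0],
                 [2.0, 0.0, 6.0],
                 [4.0, 6.0, 0.0]])
labels = np.array([0, 1, 2])
layout = component_layout(
    dmat,
    3,
    labels,
    dim=2,
    random_state=42,
    metric="precomputed",
    metric_kwds={"linkage": "average"},
)
# layout has shape (3, 2): one 2-D position per connected component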
Code Example #5
File: test_umap.py Project: yz24/umap
def test_grad_metrics_match_metrics():
    for metric in dist.named_distances_with_gradients:
        if metric in spatial_distances:
            dist_matrix = pairwise_distances(spatial_data, metric=metric)
            # scipy is bad sometimes
            if metric == "braycurtis":
                dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
            if metric in ("cosine", "correlation"):
                dist_matrix[np.where(~np.isfinite(dist_matrix))] = 1.0
                # And because distance between all zero vectors should be zero
                dist_matrix[10, 11] = 0.0
                dist_matrix[11, 10] = 0.0

            dist_function = dist.named_distances_with_gradients[metric]
            test_matrix = np.array(
                [
                    [
                        dist_function(spatial_data[i], spatial_data[j])[0]
                        for j in range(spatial_data.shape[0])
                    ]
                    for i in range(spatial_data.shape[0])
                ]
            )
            assert_array_almost_equal(
                test_matrix,
                dist_matrix,
                err_msg="Distances with grad don't match "
                "for metric {}".format(metric),
            )

    # Handle the few special distances separately
    # SEuclidean
    v = np.abs(np.random.randn(spatial_data.shape[1]))
    dist_matrix = pairwise_distances(spatial_data, metric="seuclidean", V=v)
    test_matrix = np.array(
        [
            [
                dist.standardised_euclidean_grad(spatial_data[i], spatial_data[j], v)[0]
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match " "for metric seuclidean",
    )

    # Weighted minkowski
    dist_matrix = pairwise_distances(spatial_data, metric="wminkowski", w=v, p=3)
    test_matrix = np.array(
        [
            [
                dist.weighted_minkowski_grad(spatial_data[i], spatial_data[j], v, p=3)[
                    0
                ]
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match " "for metric weighted_minkowski",
    )
    # Mahalanobis
    v = np.abs(np.random.randn(spatial_data.shape[1], spatial_data.shape[1]))
    dist_matrix = pairwise_distances(spatial_data, metric="mahalanobis", VI=v)
    test_matrix = np.array(
        [
            [
                dist.mahalanobis_grad(spatial_data[i], spatial_data[j], v)[0]
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match " "for metric mahalanobis",
    )

    # Hellinger
    dist_matrix = dist.pairwise_special_metric(
        np.abs(spatial_data[:-2]), np.abs(spatial_data[:-2])
    )
    test_matrix = np.array(
        [
            [
                dist.hellinger_grad(np.abs(spatial_data[i]), np.abs(spatial_data[j]))[0]
                for j in range(spatial_data.shape[0] - 2)
            ]
            for i in range(spatial_data.shape[0] - 2)
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match " "for metric hellinger",
    )
Code Example #6
def component_layout(
    data,
    n_components,
    component_labels,
    dim,
    random_state,
    metric="euclidean",
    metric_kwds={},
):
    """Provide a layout relating the separate connected components. This is done
    by taking the centroid of each component and then performing a spectral embedding
    of the centroids.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data -- required so we can generate centroids for each
        connected component of the graph.

    n_components: int
        The number of distinct components to be laid out.

    component_labels: array of shape (n_samples)
        For each vertex in the graph the label of the component to
        which the vertex belongs.

    dim: int
        The chosen embedding dimension.

    metric: string or callable (optional, default 'euclidean')
        The metric used to measure distances among the source data points.

    metric_kwds: dict (optional, default {})
        Keyword arguments to be passed to the metric function.

    Returns
    -------
    component_embedding: array of shape (n_components, dim)
        The ``dim``-dimensional embedding of the ``n_components``-many
        connected components.
    """

    component_centroids = np.empty((n_components, data.shape[1]),
                                   dtype=np.float64)

    for label in range(n_components):
        component_centroids[label] = data[component_labels == label].mean(
            axis=0)

    if metric in ("hellinger", "ll_dirichlet"):
        distance_matrix = pairwise_special_metric(component_centroids,
                                                  metric=metric)
    else:
        distance_matrix = pairwise_distances(component_centroids,
                                             metric=metric,
                                             **metric_kwds)

    affinity_matrix = np.exp(-(distance_matrix**2))

    component_embedding = SpectralEmbedding(
        n_components=dim, affinity="precomputed",
        random_state=random_state).fit_transform(affinity_matrix)
    component_embedding /= component_embedding.max()

    return component_embedding
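A hedged toy call of this variant, using non-negative data so the 'hellinger' branch is exercised (shapes and values are illustrative):

import numpy as np

data = np.abs(np.random.RandomState(0).randn(12, 4))
labels = np.repeat(np.arange(3), 4)  # three components of four points each
layout = component_layout(
    data, 3, labels, dim=2,
    random_state=np.random.RandomState(0),
    metric="hellinger",
)
# layout has shape (3, 2)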
Code Example #7
def test_grad_metrics_match_metrics(spatial_data, spatial_distances):
    for metric in dist.named_distances_with_gradients:
        if metric in spatial_distances:
            spatial_check(metric,
                          spatial_data,
                          spatial_distances,
                          with_grad=True)

    # Handle the few special distances separately
    # SEuclidean
    v = np.abs(np.random.randn(spatial_data.shape[1]))
    dist_matrix = pairwise_distances(spatial_data, metric="seuclidean", V=v)
    test_matrix = np.array([[
        dist.standardised_euclidean_grad(spatial_data[i], spatial_data[j],
                                         v)[0]
        for j in range(spatial_data.shape[0])
    ] for i in range(spatial_data.shape[0])])
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match "
        "for metric seuclidean",
    )

    # Weighted minkowski
    dist_matrix = pairwise_distances(spatial_data,
                                     metric="wminkowski",
                                     w=v,
                                     p=3)
    test_matrix = np.array([[
        dist.weighted_minkowski_grad(spatial_data[i], spatial_data[j], v,
                                     p=3)[0]
        for j in range(spatial_data.shape[0])
    ] for i in range(spatial_data.shape[0])])
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match "
        "for metric weighted_minkowski",
    )

    # Mahalanobis
    v = np.abs(np.random.randn(spatial_data.shape[1], spatial_data.shape[1]))
    dist_matrix = pairwise_distances(spatial_data, metric="mahalanobis", VI=v)
    test_matrix = np.array([[
        dist.mahalanobis_grad(spatial_data[i], spatial_data[j], v)[0]
        for j in range(spatial_data.shape[0])
    ] for i in range(spatial_data.shape[0])])
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match "
        "for metric mahalanobis",
    )

    # Hellinger
    dist_matrix = dist.pairwise_special_metric(np.abs(spatial_data[:-2]),
                                               np.abs(spatial_data[:-2]))
    test_matrix = np.array([[
        dist.hellinger_grad(np.abs(spatial_data[i]),
                            np.abs(spatial_data[j]))[0]
        for j in range(spatial_data.shape[0] - 2)
    ] for i in range(spatial_data.shape[0] - 2)])
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match "
        "for metric hellinger",
    )
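This variant receives spatial_data as a pytest fixture. The [:-2] slices and the zero-vector special cases at indices 10 and 11 in the sibling tests suggest ten random rows plus two all-zero rows; a hedged sketch of such a fixture:

import numpy as np
import pytest

@pytest.fixture
def spatial_data():
    # ten random rows plus two all-zero rows (indices 10 and 11), matching
    # the [:-2] slices and zero-vector fix-ups used in these tests
    rng = np.random.RandomState(42)
    return np.vstack([rng.randn(10, 20), np.zeros((2, 20))])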
Code Example #8
    def fit(self, X, y=None):
        """Generate graph to fit X into an embedded space.
        Optionally use y for supervised dimension reduction.
        Parameters
        ----------
        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
            If the metric is 'precomputed' X must be a square distance
            matrix. Otherwise it contains a sample per row. If the method
            is 'exact', X may be a sparse matrix of type 'csr', 'csc'
            or 'coo'.
        y : array, shape (n_samples)
            A target array for supervised dimension reduction. How this is
            handled is determined by parameters UMAP was instantiated with.
            The relevant attributes are ``target_metric`` and
            ``target_metric_kwds``.
        """

        X = check_array(X, dtype=np.float32, accept_sparse="csr", order="C")
        self._raw_data = X

        # Handle all the optional arguments, setting default
        if self.a is None or self.b is None:
            self._a, self._b = find_ab_params(self.spread, self.min_dist)
        else:
            self._a = self.a
            self._b = self.b

        if isinstance(self.init, np.ndarray):
            init = check_array(self.init,
                               dtype=np.float32,
                               accept_sparse=False)
        else:
            init = self.init

        self._initial_alpha = self.learning_rate

        self._validate_parameters()

        if self.verbose:
            print(str(self))

        self._original_n_threads = numba.get_num_threads()
        if self.n_jobs > 0 and self.n_jobs is not None:
            numba.set_num_threads(self.n_jobs)

        # Check if we should unique the data
        # We've already ensured that we aren't in the precomputed case
        if self.unique:
            # check if the matrix is dense
            if self._sparse_data:
                # Call a sparse unique function
                index, inverse, counts = csr_unique(X)
            else:
                index, inverse, counts = np.unique(
                    X,
                    return_index=True,
                    return_inverse=True,
                    return_counts=True,
                    axis=0,
                )[1:4]
            if self.verbose:
                print(
                    "Unique=True -> Number of data points reduced from ",
                    X.shape[0],
                    " to ",
                    X[index].shape[0],
                )
                most_common = np.argmax(counts)
                print(
                    "Most common duplicate is",
                    index[most_common],
                    " with a count of ",
                    counts[most_common],
                )
        # If we aren't asking for unique use the full index.
        # This will save special cases later.
        else:
            index = list(range(X.shape[0]))
            inverse = list(range(X.shape[0]))

        # Error check n_neighbors based on data size
        if X[index].shape[0] <= self.n_neighbors:
            if X[index].shape[0] == 1:
                self.embedding_ = np.zeros(
                    (1, self.n_components))  # needed for sklearn compatibility
                return self

            warn("n_neighbors is larger than the dataset size; truncating to "
                 "X.shape[0] - 1")
            self._n_neighbors = X[index].shape[0] - 1
            if self.densmap:
                self._densmap_kwds["n_neighbors"] = self._n_neighbors
        else:
            self._n_neighbors = self.n_neighbors

        # Note: unless it causes issues for setting 'index', could move this to
        # initial sparsity check above
        if self._sparse_data and not X.has_sorted_indices:
            X.sort_indices()

        random_state = check_random_state(self.random_state)

        if self.verbose:
            print("Construct fuzzy simplicial set")

        if self.metric == "precomputed" and self._sparse_data:
            # For sparse precomputed distance matrices, we just argsort the rows to find
            # nearest neighbors. To make this easier, we expect matrices that are
            # symmetrical (so we can find neighbors by looking at rows in isolation,
            # rather than also having to consider that sample's column too).
            print("Computing KNNs for sparse precomputed distances...")
            if sparse_tril(X).getnnz() != sparse_triu(X).getnnz():
                raise ValueError(
                    "Sparse precomputed distance matrices should be symmetrical!"
                )
            if not np.all(X.diagonal() == 0):
                raise ValueError(
                    "Non-zero distances from samples to themselves!")
            self._knn_indices = np.zeros((X.shape[0], self.n_neighbors),
                                         dtype=int)
            self._knn_dists = np.zeros(self._knn_indices.shape, dtype=float)
            for row_id in range(X.shape[0]):
                # Find KNNs row-by-row
                row_data = X[row_id].data
                row_indices = X[row_id].indices
                if len(row_data) < self._n_neighbors:
                    raise ValueError(
                        "Some rows contain fewer than n_neighbors distances!")
                row_nn_data_indices = np.argsort(row_data)[:self._n_neighbors]
                self._knn_indices[row_id] = row_indices[row_nn_data_indices]
                self._knn_dists[row_id] = row_data[row_nn_data_indices]
            (
                self.graph_,
                self._sigmas,
                self._rhos,
                self.graph_dists_,
            ) = fuzzy_simplicial_set(
                X[index],
                self.n_neighbors,
                random_state,
                "precomputed",
                self._metric_kwds,
                self._knn_indices,
                self._knn_dists,
                self.angular_rp_forest,
                self.set_op_mix_ratio,
                self.local_connectivity,
                True,
                self.verbose,
                self.densmap or self.output_dens,
            )
        # Handle small cases efficiently by computing all distances
        elif X[index].shape[0] < 4096 and not self.force_approximation_algorithm:
            self._small_data = True
            try:
                # sklearn pairwise_distances fails for callable metric on sparse data
                _m = self.metric if self._sparse_data else self._input_distance_func
                dmat = pairwise_distances(X[index],
                                          metric=_m,
                                          **self._metric_kwds)
            except (ValueError, TypeError) as e:
                # metric is numba.jit'd or not supported by sklearn,
                # fallback to pairwise special

                if self._sparse_data:
                    # Get a fresh metric since we are casting to dense
                    if not callable(self.metric):
                        _m = dist.named_distances[self.metric]
                        dmat = dist.pairwise_special_metric(
                            X[index].toarray(),
                            metric=_m,
                            kwds=self._metric_kwds,
                        )
                    else:
                        dmat = dist.pairwise_special_metric(
                            X[index],
                            metric=self._input_distance_func,
                            kwds=self._metric_kwds,
                        )
                else:
                    dmat = dist.pairwise_special_metric(
                        X[index],
                        metric=self._input_distance_func,
                        kwds=self._metric_kwds,
                    )
            (
                self.graph_,
                self._sigmas,
                self._rhos,
                self.graph_dists_,
            ) = fuzzy_simplicial_set(
                dmat,
                self._n_neighbors,
                random_state,
                "precomputed",
                self._metric_kwds,
                None,
                None,
                self.angular_rp_forest,
                self.set_op_mix_ratio,
                self.local_connectivity,
                True,
                self.verbose,
                self.densmap or self.output_dens,
            )
        else:
            # Standard case
            self._small_data = False
            if self._sparse_data and self.metric in pynn_sparse_named_distances:
                nn_metric = self.metric
            elif not self._sparse_data and self.metric in pynn_named_distances:
                nn_metric = self.metric
            else:
                nn_metric = self._input_distance_func

            (
                self._knn_indices,
                self._knn_dists,
                self._knn_search_index,
            ) = nearest_neighbors(
                X[index],
                self._n_neighbors,
                nn_metric,
                self._metric_kwds,
                self.angular_rp_forest,
                random_state,
                self.low_memory,
                use_pynndescent=True,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
            )

            (
                self.graph_,
                self._sigmas,
                self._rhos,
                self.graph_dists_,
            ) = fuzzy_simplicial_set(
                X[index],
                self.n_neighbors,
                random_state,
                nn_metric,
                self._metric_kwds,
                self._knn_indices,
                self._knn_dists,
                self.angular_rp_forest,
                self.set_op_mix_ratio,
                self.local_connectivity,
                True,
                self.verbose,
                self.densmap or self.output_dens,
            )

        # Currently not checking if any duplicate points have differing labels
        # Might be worth throwing a warning...
        if y is not None:
            if self.densmap:
                raise NotImplementedError(
                    "Supervised embedding is not supported with densMAP.")

            len_X = len(X) if not self._sparse_data else X.shape[0]
            if len_X != len(y):
                raise ValueError(
                    "Length of x = {len_x}, length of y = {len_y}; they must be equal."
                    .format(len_x=len_X, len_y=len(y)))
            y_ = check_array(y, ensure_2d=False)[index]
            if self.target_metric == "categorical":
                if self.target_weight < 1.0:
                    far_dist = 2.5 * (1.0 / (1.0 - self.target_weight))
                else:
                    far_dist = 1.0e12
                self.graph_ = discrete_metric_simplicial_set_intersection(
                    self.graph_, y_, far_dist=far_dist)
            elif self.target_metric in dist.DISCRETE_METRICS:
                if self.target_weight < 1.0:
                    scale = 2.5 * (1.0 / (1.0 - self.target_weight))
                else:
                    scale = 1.0e12
                # self.graph_ = discrete_metric_simplicial_set_intersection(
                #     self.graph_,
                #     y_,
                #     metric=self.target_metric,
                #     metric_kws=self.target_metric_kwds,
                #     metric_scale=scale
                # )

                metric_kws = dist.get_discrete_params(y_, self.target_metric)

                self.graph_ = discrete_metric_simplicial_set_intersection(
                    self.graph_,
                    y_,
                    metric=self.target_metric,
                    metric_kws=metric_kws,
                    metric_scale=scale,
                )
            else:
                if len(y_.shape) == 1:
                    y_ = y_.reshape(-1, 1)
                if self.target_n_neighbors == -1:
                    target_n_neighbors = self._n_neighbors
                else:
                    target_n_neighbors = self.target_n_neighbors

                # Handle the small case as precomputed as before
                if y.shape[0] < 4096:
                    try:
                        ydmat = pairwise_distances(y_,
                                                   metric=self.target_metric,
                                                   **self._target_metric_kwds)
                    except (TypeError, ValueError):
                        ydmat = dist.pairwise_special_metric(
                            y_,
                            metric=self.target_metric,
                            kwds=self._target_metric_kwds,
                        )

                    target_graph, target_sigmas, target_rhos = fuzzy_simplicial_set(
                        ydmat,
                        target_n_neighbors,
                        random_state,
                        "precomputed",
                        self._target_metric_kwds,
                        None,
                        None,
                        False,
                        1.0,
                        1.0,
                        False,
                    )
                else:
                    # Standard case
                    target_graph, target_sigmas, target_rhos = fuzzy_simplicial_set(
                        y_,
                        target_n_neighbors,
                        random_state,
                        self.target_metric,
                        self._target_metric_kwds,
                        None,
                        None,
                        False,
                        1.0,
                        1.0,
                        False,
                    )
                # product = self.graph_.multiply(target_graph)
                # # self.graph_ = 0.99 * product + 0.01 * (self.graph_ +
                # #                                        target_graph -
                # #                                        product)
                # self.graph_ = product
                self.graph_ = general_simplicial_set_intersection(
                    self.graph_, target_graph, self.target_weight)
                self.graph_ = reset_local_connectivity(self.graph_)
                self._supervised = True
        else:
            self._supervised = False

        # embed graph
        self.fit_embed_data(X, y, index, inverse)
        return self
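An end-to-end hedged sketch exercising the sparse precomputed branch of this fit method (toy data; assumes the standard umap-learn package):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances
import umap

X = np.random.RandomState(0).randn(50, 8)
dmat = pairwise_distances(X)  # symmetric with a zero diagonal
# a dense distance matrix stored as CSR still satisfies the symmetry and
# per-row neighbour-count checks in the branch above
model = umap.UMAP(metric="precomputed", n_neighbors=5).fit(csr_matrix(dmat))
embedding = model.embedding_  # shape (50, n_components)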
Code Example #9
def component_layout(
    data,
    n_components,
    component_labels,
    dim,
    random_state,
    metric="euclidean",
    metric_kwds={},
):
    """Provide a layout relating the separate connected components. This is done
    by taking the centroid of each component and then performing a spectral embedding
    of the centroids.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data -- required so we can generate centroids for each
        connected component of the graph.

    n_components: int
        The number of distinct components to be laid out.

    component_labels: array of shape (n_samples)
        For each vertex in the graph the label of the component to
        which the vertex belongs.

    dim: int
        The chosen embedding dimension.

    metric: string or callable (optional, default 'euclidean')
        The metric used to measure distances among the source data points.

    metric_kwds: dict (optional, default {})
        Keyword arguments to be passed to the metric function.
        If metric is 'precomputed', the 'linkage' keyword can be used to specify
        'average', 'complete', or 'single' linkage. Default is 'average'.

    Returns
    -------
    component_embedding: array of shape (n_components, dim)
        The ``dim``-dimensional embedding of the ``n_components``-many
        connected components.
    """

    component_centroids = np.empty((n_components, data.shape[1]),
                                   dtype=np.float64)

    if metric == "precomputed":
        # cannot compute centroids from precomputed distances
        # instead, compute centroid distances using linkage
        distance_matrix = np.zeros((n_components, n_components),
                                   dtype=np.float64)
        linkage = metric_kwds.get("linkage", "average")
        if linkage == "average":
            linkage = np.mean
        elif linkage == "complete":
            linkage = np.max
        elif linkage == "single":
            linkage = np.min
        else:
            raise ValueError("Unrecognized linkage '%s'. Please choose from "
                             "'average', 'complete', or 'single'" % linkage)
        for c_i in range(n_components):
            dm_i = data[component_labels == c_i]
            for c_j in range(c_i + 1, n_components):
                dist = linkage(dm_i[:, component_labels == c_j])
                distance_matrix[c_i, c_j] = dist
                distance_matrix[c_j, c_i] = dist
    else:
        for label in range(n_components):
            component_centroids[label] = data[component_labels == label].mean(
                axis=0)
        if metric in ("hellinger", "ll_dirichlet"):
            distance_matrix = pairwise_special_metric(component_centroids,
                                                      metric=metric)
        else:
            distance_matrix = pairwise_distances(component_centroids,
                                                 metric=metric,
                                                 **metric_kwds)

    affinity_matrix = np.exp(-(distance_matrix**2))

    component_embedding = SpectralEmbedding(
        n_components=dim, affinity="precomputed",
        random_state=random_state).fit_transform(affinity_matrix)
    component_embedding /= component_embedding.max()

    return component_embedding
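One style note that applies to all three component_layout variants above: metric_kwds={} is a mutable default argument. It is harmless here because the dict is never mutated, but the idiomatic guard uses a None default; a hedged sketch:

def component_layout(
    data,
    n_components,
    component_labels,
    dim,
    random_state,
    metric="euclidean",
    metric_kwds=None,
):
    # avoid sharing one dict instance across calls
    if metric_kwds is None:
        metric_kwds = {}
    ...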