Example #1
0
def test_fit_spectral(device):
    # TODO deflake this test
    pytest.skip("This test is flaky on macOS.")
    np.random.seed(0)
    torch.random.manual_seed(0)
    n = 200
    m = 3
    max_iter = 1000

    edges = util.all_edges(n)
    weights = torch.ones(edges.shape[0])
    f = penalties.Quadratic(weights)

    mde = problem.MDE(
        n,
        m,
        edges=edges,
        distortion_function=f,
        constraint=Standardized(),
        device=device,
    )
    X = mde.embed(max_iter=max_iter, eps=1e-10, memory_size=10)

    assert id(X) == id(mde.X)
    X_spectral = quadratic.spectral(n,
                                    m,
                                    edges=edges,
                                    weights=weights,
                                    device=device)

    testing.assert_allclose(
        mde.average_distortion(X).detach().cpu().numpy(),
        mde.average_distortion(X_spectral).detach().cpu().numpy(),
        atol=1e-4,
    )
Example #2
0
    def draw(self,
             embedding_dim=2,
             standardized=False,
             device="cpu",
             verbose=False):
        """Draw a graph in the Cartesian plane.

        This method does some basic preprocessing, constructs an MDE problem
        that is often suitable for drawing graphs, and computes/returns an
        embedding by approximately solving the MDE problem.

        Arguments
        ---------
        embedding_dim: int
            The number of dimemsions, 1, 2, or 3.
        standardized: bool
            Whether to impose a standardization constraint.
        device: str
            Device on which to compute/store embedding, 'cpu' or 'cuda'.
        verbose: bool
            Whether to print verbose output.

        Returns
        -------
        torch.Tensor
            The embedding, of shape ``(n_items, embedding_dim)``
        """
        if (self.distances < 0).any():
            raise ValueError(
                "Graphs with negative edge weights cannot be drawn.")

        if self.n_edges < 1e7 and self.n_all_edges > 1e7:
            retain_fraction = 1e7 / self.n_all_edges
            distance_graph = shortest_paths(self,
                                            retain_fraction=retain_fraction,
                                            verbose=verbose)
        else:
            distance_graph = self

        if not standardized:
            constraint = constraints.Centered()
            f = losses.WeightedQuadratic(distance_graph.distances)
        else:
            constraint = constraints.Standardized()
            # TODO(akshayka) better weights
            f = penalties.Cubic(1 / distance_graph.distances)
        mde = problem.MDE(
            n_items=self.n_items,
            embedding_dim=embedding_dim,
            edges=distance_graph.edges,
            distortion_function=f,
            constraint=constraint,
            device=device,
        )
        X = mde.embed(verbose=verbose)
        mde.plot(edges=self.edges)
        return X
Example #3
0
def test_self_edges_raises_error(device):
    torch.random.manual_seed(0)
    edges = np.array([(0, 1), (0, 0), (0, 2), (1, 2), (1, 1)])
    with pytest.raises(ValueError,
                       match=r"The edge list must not contain self edges.*"):
        problem.MDE(
            3,
            3,
            edges,
            penalties.Quadratic(torch.ones(edges.shape[0])),
            constraint=Standardized(),
            device=device,
        )
Example #4
0
def test_differences(device):
    torch.random.manual_seed(0)
    edges = np.array([(0, 1), (0, 2), (1, 2)])
    X = torch.randn((3, 3), dtype=torch.float32, device=device)
    mde = problem.MDE(
        3,
        3,
        edges,
        penalties.Quadratic(torch.ones(3)),
        constraint=Standardized(),
        device=device,
    )
    diff = mde.differences(X)
    testing.assert_allclose(X[edges[:, 0]] - X[edges[:, 1]], diff)
Example #5
0
def test_norm_grad_zero(device):
    torch.random.manual_seed(0)
    edges = np.array([(0, 1)])
    mde = problem.MDE(
        3,
        3,
        edges,
        penalties.Quadratic(torch.ones(3)),
        constraint=Standardized(),
        device=device,
    )
    X = torch.ones((3, 3), requires_grad=True, device=device)
    norms = mde.distances(X)
    norms.backward()
    testing.assert_allclose(X.grad, 0.0)
Example #6
0
def _spectral(
    L,
    m,
    cg=False,
    max_iter=40,
    edges=None,
    weights=None,
    warm_start=False,
    device=None,
):
    n = L.shape[0]
    if not cg:
        k = m + 1
        num_lanczos_vectors = max(2 * k + 1, int(np.sqrt(L.shape[0])))
        eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
            L,
            k,
            which="SM",
            ncv=num_lanczos_vectors,
            tol=1e-4,
            v0=np.ones(L.shape[0]),
            maxiter=L.shape[0] * 5,
        )
        order = np.argsort(eigenvalues)[1:k]
    else:
        k = m
        if warm_start:
            mde = problem.MDE(
                n, m, edges, f=penalties.Quadratic(weights), device=device
            )
            X_init = mde.fit(max_iter=40, use_line_search=False)
        else:
            X_init = util.proj_standardized(
                torch.tensor(np.random.randn(n, m), device=device), demean=True
            )
        eigenvalues, eigenvectors = scipy.sparse.linalg.lobpcg(
            A=L,
            X=X_init.cpu().numpy(),
            # Y: search in the orthogonal complement of the ones vector
            Y=np.ones((L.shape[0], 1)),
            tol=None,
            # largest: find the smallest eigenvalues
            largest=False,
            maxiter=max_iter,
        )
        order = np.argsort(eigenvalues)[0:k]
    return eigenvectors[:, order]
Example #7
0
def test_average_distortion(device):
    torch.random.manual_seed(0)
    edges = np.array([(0, 1), (0, 2), (1, 2)])
    mde = problem.MDE(
        3,
        2,
        edges,
        penalties.Quadratic(torch.tensor([1.0, 2.0, 3.0])),
        constraint=Standardized(),
        device=device,
    )
    X = torch.tensor(
        [[0.0, 0.0], [1.0, 1.0], [3.0, 3.0]],
        dtype=torch.float32,
        device=device,
    )
    average_distortion = mde.average_distortion(X)
    # (1*2 + 2*18 + 3*8)/3 = (2 + 36 + 24)/3 = 62/3
    testing.assert_allclose(average_distortion.detach().cpu().numpy(),
                            62.0 / 3)
Example #8
0
def test_average_distortion_grad(device):
    torch.random.manual_seed(0)
    edges = np.array([(0, 1), (0, 2), (1, 2)])
    f = penalties.Quadratic(torch.tensor([1.0, 2.0, 3.0], device=device))
    mde = problem.MDE(3, 2, edges, f, Standardized(), device=device)
    X = torch.randn(
        (3, 2),
        requires_grad=True,
        dtype=torch.float32,
        device=device,
    )
    average_distortion = mde.average_distortion(X)
    average_distortion.backward()
    A = torch.tensor(
        [[1, 1, 0], [-1, 0, 1], [0, -1, -1]],
        device=device,
    ).float()
    auto_grad = X.grad
    X.grad = None
    util._distortion(X, f, A, mde._lhs, mde._rhs).backward()
    manual_grad = X.grad
    testing.assert_allclose(auto_grad, manual_grad)
Example #9
0
def preserve_distances(
    data,
    embedding_dim=2,
    loss=losses.Absolute,
    constraint=None,
    max_distances=5e7,
    device="cpu",
    verbose=False,
) -> problem.MDE:
    """Construct an MDE problem based on original distances.

    This function constructs an MDE problem for preserving pairwise
    distances between items. This can be useful for preserving the global
    structure of the data.

    The data can be specified with either a data matrix (a NumPy array, torch
    Tensor, or sparse matrix), or a ``pymde.Graph`` instance encoding the
    distances:

        A NumPy array, torch tensor, or sparse matrix is interpreted as a
        collection of feature vectors: each row gives the feature vector for an
        item. The original distances are the Euclidean distances between the
        feature vectors.

        A ``pymde.Graph`` instance is interpreted as encoding all (n_items
        choose 2) distances: the distance between i and j is taken to be the
        length of the shortest path connecting i and j.

    When the number of items n_items is large, the total number of pairs will
    be very large. When this happens, instead of computing all pairs of
    distances, this function will sample a subset uniformly at random. The
    maximum number of distances to compute is specified by the parameter
    ``max_distances``. Depending on how many items you have (and how much
    memory your machine has), you may need to adjust this parameter.

    To obtain an embedding, call the ``embed`` method on the returned object.
    To plot it, use ``pymde.plot``.

    For example:

    .. code:: python3

        embedding = pymde.preserve_distances(data).embed()
        pymde.plot(embedding)

    Arguments
    ---------
    data: {np.ndarray, torch.Tensor, scipy.sparse matrix}(
            shape=(n_items, n_features)), or pymde.Graph
        A data matrix or a ``pymde.Graph`` instance.
    embedding_dim: int
        The embedding dimension.
    loss: pymde.Function class (or factory)
        Callable that constructs a distortion function, given
        original distances. Typically one of the classes defined in
        ``pymde.losses``, such as ``pymde.losses.Absolute``, or
        ``pymde.losses.WeightedQuadratic``.
    constraint: pymde.constraints.Constraint (optional)
        Embedding constraint, such as ``pymde.Standardized()`` or
        ``pymde.Anchored(anchors, values)`` (or ``None``). Defaults to no
        constraint. Note: when the constraint is ``pymde.Standardized()``,
        the original distances will be scaled by a constant (because the
        standardization constraint puts a limit on how large any one
        distance can be).
    max_distances: int
        Maximum number of distances to compute.
    device: str (optional)
        Device for the embedding (eg, 'cpu', 'cuda').
    verbose: bool
        If ``True``, print verbose output.

    Returns
    -------
    pymde.MDE
        A ``pymde.MDE`` instance, based on preserving the original distances.
    """
    if not isinstance(
        data, (np.ndarray, torch.Tensor, preprocess.graph.Graph)
    ) and not scipy.sparse.issparse(data):
        raise ValueError(
            "`data` must be a np.ndarray/torch.Tensor/scipy.sparse matrix"
            ", or a pymde.Graph."
        )

    if isinstance(data, preprocess.graph.Graph):
        n_items = data.n_items
    else:
        n_items = data.shape[0]
    n_all_edges = (n_items) * (n_items - 1) / 2
    retain_fraction = max_distances / n_all_edges

    graph = preprocess.generic.distances(
        data, retain_fraction=retain_fraction, verbose=verbose
    )
    edges = graph.edges.to(device)
    deviations = graph.distances.to(device)

    if constraint is None:
        constraint = constraints.Centered()
    elif isinstance(constraint, constraints._Standardized):
        deviations = preprocess.scale(
            deviations, constraint.natural_length(n_items, embedding_dim)
        )

    return problem.MDE(
        n_items=n_items,
        embedding_dim=embedding_dim,
        edges=edges,
        distortion_function=loss(deviations),
        constraint=constraint,
        device=device,
    )
Example #10
0
def preserve_neighbors(
    data,
    embedding_dim=2,
    attractive_penalty=penalties.Log1p,
    repulsive_penalty=penalties.Log,
    constraint=None,
    n_neighbors=None,
    repulsive_fraction=None,
    max_distance=None,
    init="quadratic",
    device="cpu",
    verbose=False,
) -> problem.MDE:
    """Construct an MDE problem designed to preserve local structure.

    This function constructs an MDE problem for preserving the
    local structure of original data. This MDE problem is well-suited for
    visualization (using ``embedding_dim`` 2 or 3), but can also be used to
    generate features for machine learning tasks (with ``embedding_dim`` = 10,
    50, or 100, for example). It yields embeddings in which similar items
    are near each other, and dissimilar items are not near each other.

    The original data can either be a data matrix, or a graph.
    Data matrices should be torch Tensors, NumPy arrays, or scipy sparse
    matrices; graphs should be instances of ``pymde.Graph``.

    The MDE problem uses distortion functions derived from weights (i.e.,
    penalties).

    To obtain an embedding, call the ``embed`` method on the returned ``MDE``
    object. To plot it, use ``pymde.plot``.

    .. code:: python3

        embedding = pymde.preserve_neighbors(data).embed()
        pymde.plot(embedding)

    Arguments
    ---------
    data: {torch.Tensor, numpy.ndarray, scipy.sparse matrix}(
            shape=(n_items, n_features)) or pymde.Graph
        The original data, a data matrix or a graph. Neighbors are
        computed using Euclidean distance if the data is a matrix,
        or the shortest-path metric if the data is a graph.
    embedding_dim: int
        The embedding dimension. Use 2 or 3 for visualization.
    attractive_penalty: pymde.Function class (or factory)
        Callable that constructs a distortion function, given positive
        weights. Typically one of the classes from ``pymde.penalties``,
        such as ``pymde.penalties.log1p``, ``pymde.penalties.Huber``, or
        ``pymde.penalties.Quadratic``.
    repulsive_penalty: pymde.Function class (or factory)
        Callable that constructs a distortion function, given negative
        weights. (If ``None``, only positive weights are used.) For example,
        ``pymde.penalties.Log`` or ``pymde.penalties.InversePower``.
    constraint: pymde.constraints.Constraint (optional)
        Embedding constraint, like ``pymde.Standardized()`` or
        ``pymde.Anchored(anchors, values)`` (or ``None``). Defaults to no
        constraint when a repulsive penalty is provided, otherwise defaults to
        ``pymde.Standardized()``.
    n_neighbors: int (optional)
        The number of nearest neighbors to compute for each row (item) of
        ``data``. A sensible value is chosen by default, depending on the
        number of items.
    repulsive_fraction: float (optional)
        How many repulsive edges to include, relative to the number
        of attractive edges. ``1`` means as many repulsive edges as attractive
        edges. The higher this number, the more uniformly spread out the
        embedding will be. Defaults to ``0.5`` for standardized embeddings, and
        ``1`` otherwise. (If ``repulsive_penalty`` is ``None``, this argument
        is ignored.)
    max_distance: float (optional)
        If not None, neighborhoods are restricted to have a radius
        no greater than ``max_distance``.
    init: str
        Initialization strategy; 'quadratic' or 'random'.
    device: str (optional)
        Device for the embedding (eg, 'cpu', 'cuda').
    verbose: bool
        If ``True``, print verbose output.

    Returns
    -------
    pymde.MDE
        A ``pymde.MDE`` object, based on the original data.
    """
    if isinstance(data, preprocess.graph.Graph):
        n = data.n_items
    elif data.shape[0] <= 1:
        raise ValueError("The data matrix must have at least two rows.")
    else:
        n = data.shape[0]

    if n_neighbors is None:
        # target included edges to be ~1% of total number of edges
        n_choose_2 = n * (n - 1) / 2
        n_neighbors = int(max(min(15, n_choose_2 * 0.01 / n), 5))
    if n_neighbors > n:
        problem.LOGGER.warning(
            (
                "Requested n_neighbors {0} > number of items {1}."
                " Setting n_neighbors to {2}"
            ).format(n_neighbors, n, n - 1)
        )
        n_neighbors = n - 1

    if constraint is None and repulsive_penalty is not None:
        constraint = constraints.Centered()
    elif constraint is None and repulsive_penalty is None:
        constraint = constraints.Standardized()

    if isinstance(data, preprocess.graph.Graph):
        # enforce a max distance, otherwise may very well run out of memory
        # when n_items is large
        if max_distance is None:
            max_distance = (3 * torch.quantile(data.distances, 0.75)).item()

    if verbose:
        problem.LOGGER.info(
            f"Computing {n_neighbors}-nearest neighbors, with "
            f"max_distance={max_distance}"
        )

    knn_graph = preprocess.generic.k_nearest_neighbors(
        data,
        k=n_neighbors,
        max_distance=max_distance,
        verbose=verbose,
    )
    edges = knn_graph.edges.to(device)
    weights = knn_graph.weights.to(device)

    if init == "quadratic":
        if verbose:
            problem.LOGGER.info("Computing quadratic initialization.")
        X_init = quadratic.spectral(
            n, embedding_dim, edges, weights, device=device
        )
    elif init == "random":
        X_init = constraint.initialization(n, embedding_dim, device)
    else:
        raise ValueError(
            f"Unsupported value '{init}' for keyword argument `init`; "
            "the supported values are 'quadratic' and 'random'."
        )

    if repulsive_penalty is not None:
        if repulsive_fraction is None:
            if isinstance(constraint, constraints._Standardized):
                repulsive_fraction = 0.5
            else:
                repulsive_fraction = 1

        n_repulsive = int(repulsive_fraction * edges.shape[0])
        negative_edges = preprocess.sample_edges(
            n, n_repulsive, exclude=edges
        ).to(device)
        edges = torch.cat([edges, negative_edges])

        negative_weights = -torch.ones(
            negative_edges.shape[0], dtype=X_init.dtype, device=device
        )
        weights = torch.cat([weights, negative_weights])

        f = penalties.PushAndPull(
            weights,
            attractive_penalty=attractive_penalty,
            repulsive_penalty=repulsive_penalty,
        )
    else:
        f = attractive_penalty(weights)

    mde = problem.MDE(
        n_items=n,
        embedding_dim=embedding_dim,
        edges=edges,
        distortion_function=f,
        constraint=constraint,
        device=device,
    )
    mde._X_init = X_init

    # TODO cache the graph for subsequent calls / constructor for MDE from graph

    distances = mde.distances(mde._X_init)
    if (distances == 0).any():
        # pathological scenario in which at least two points overlap can yield
        # non-differentiable average distortion. perturb the initialization to
        # mitigate.
        mde._X_init += 1e-4 * torch.randn(
            mde._X_init.shape,
            device=mde._X_init.device,
            dtype=mde._X_init.dtype,
        )
    return mde