def draw(
    self, embedding_dim=2, standardized=False, device="cpu", verbose=False
):
    """Embed this graph, plot the embedding, and return it.

    An MDE problem is built from the graph's distances, approximately
    solved via ``embed``, and the result is plotted together with this
    graph's edges.

    Arguments
    ---------
    embedding_dim: int
        The number of dimensions, 1, 2, or 3.
    standardized: bool
        If true, the problem uses ``constraints.Standardized()`` with a
        ``penalties.Cubic`` distortion built from inverse distances;
        otherwise it uses ``constraints.Centered()`` with a
        ``losses.WeightedQuadratic`` distortion built from the distances.
    device: str
        Device on which to compute/store embedding, 'cpu' or 'cuda'.
    verbose: bool
        Whether to print verbose output.

    Returns
    -------
    torch.Tensor
        The embedding, of shape ``(n_items, embedding_dim)``

    Raises
    ------
    ValueError
        If any entry of ``self.distances`` is negative.
    """
    has_negative_weight = (self.distances < 0).any()
    if has_negative_weight:
        raise ValueError("Graphs with negative edge weights cannot be drawn.")

    # By default the graph itself supplies the edges and distances. When it
    # has fewer than 1e7 edges while more than 1e7 pairs of items exist, a
    # shortest-path graph is used instead, with the retained fraction of
    # pairs set to 1e7 / n_all_edges.
    distance_graph = self
    if self.n_edges < 1e7 and self.n_all_edges > 1e7:
        distance_graph = shortest_paths(
            self,
            retain_fraction=1e7 / self.n_all_edges,
            verbose=verbose,
        )

    if standardized:
        constraint = constraints.Standardized()
        # TODO(akshayka) better weights
        f = penalties.Cubic(1 / distance_graph.distances)
    else:
        constraint = constraints.Centered()
        f = losses.WeightedQuadratic(distance_graph.distances)

    mde = problem.MDE(
        n_items=self.n_items,
        embedding_dim=embedding_dim,
        edges=distance_graph.edges,
        distortion_function=f,
        constraint=constraint,
        device=device,
    )
    X = mde.embed(verbose=verbose)
    # The plot always shows the original edges, even when the embedding was
    # computed from the shortest-path graph.
    mde.plot(edges=self.edges)
    return X
def preserve_distances(
    data,
    embedding_dim=2,
    loss=losses.Absolute,
    constraint=None,
    max_distances=5e7,
    device="cpu",
    verbose=False,
) -> problem.MDE:
    """Construct an MDE problem based on original distances.

    This function constructs an MDE problem for preserving pairwise
    distances between items. This can be useful for preserving the global
    structure of the data.

    The data can be specified with either a data matrix (a NumPy array, torch
    Tensor, or sparse matrix), or a ``pymde.Graph`` instance encoding the
    distances:

        A NumPy array, torch tensor, or sparse matrix is interpreted as a
        collection of feature vectors: each row gives the feature vector for
        an item. The original distances are the Euclidean distances between
        the feature vectors.

        A ``pymde.Graph`` instance is interpreted as encoding all
        (n_items choose 2) distances: the distance between i and j is taken
        to be the length of the shortest path connecting i and j.

    When the number of items n_items is large, the total number of pairs
    will be very large. When this happens, instead of computing all pairs
    of distances, this function will sample a subset uniformly at random.
    The maximum number of distances to compute is specified by the parameter
    ``max_distances``. Depending on how many items you have (and how much
    memory your machine has), you may need to adjust this parameter.

    To obtain an embedding, call the ``embed`` method on the returned object.
    To plot it, use ``pymde.plot``.

    For example:

    .. code:: python3

        embedding = pymde.preserve_distances(data).embed()
        pymde.plot(embedding)

    Arguments
    ---------
    data: {np.ndarray, torch.Tensor, scipy.sparse matrix}(
            shape=(n_items, n_features)), or pymde.Graph
        A data matrix or a ``pymde.Graph`` instance. It must describe at
        least two items.
    embedding_dim: int
        The embedding dimension.
    loss: pymde.Function class (or factory)
        Callable that constructs a distortion function, given
        original distances. Typically one of the classes defined in
        ``pymde.losses``, such as ``pymde.losses.Absolute``, or
        ``pymde.losses.WeightedQuadratic``.
    constraint: pymde.constraints.Constraint (optional)
        Embedding constraint, such as ``pymde.Standardized()`` or
        ``pymde.Anchored(anchors, values)`` (or ``None``). When ``None``,
        a centering constraint (``constraints.Centered()``) is used. Note:
        when the constraint is ``pymde.Standardized()``, the original
        distances will be scaled by a constant (because the standardization
        constraint puts a limit on how large any one distance can be).
    max_distances: int
        Maximum number of distances to compute.
    device: str (optional)
        Device for the embedding (eg, 'cpu', 'cuda').
    verbose: bool
        If ``True``, print verbose output.

    Returns
    -------
    pymde.MDE
        A ``pymde.MDE`` instance, based on preserving the original distances.

    Raises
    ------
    ValueError
        If ``data`` is not a supported type, or if it describes fewer than
        two items (there is then no pair of items to take a distance of).
    """
    if not isinstance(
        data, (np.ndarray, torch.Tensor, preprocess.graph.Graph)
    ) and not scipy.sparse.issparse(data):
        raise ValueError(
            "`data` must be a np.ndarray/torch.Tensor/scipy.sparse matrix"
            ", or a pymde.Graph."
        )

    if isinstance(data, preprocess.graph.Graph):
        n_items = data.n_items
    else:
        n_items = data.shape[0]

    # With fewer than two items there are zero pairs, and the fraction
    # computed below would divide by zero; fail with a clear message instead.
    if n_items < 2:
        raise ValueError("The data must contain at least two items.")

    # Share of all (n_items choose 2) pairs that fits in the distance budget.
    n_all_edges = (n_items) * (n_items - 1) / 2
    retain_fraction = max_distances / n_all_edges

    graph = preprocess.generic.distances(
        data, retain_fraction=retain_fraction, verbose=verbose
    )
    edges = graph.edges.to(device)
    deviations = graph.distances.to(device)

    if constraint is None:
        constraint = constraints.Centered()
    elif isinstance(constraint, constraints._Standardized):
        # Rescale the original distances relative to the constraint's
        # natural length for this problem size.
        deviations = preprocess.scale(
            deviations, constraint.natural_length(n_items, embedding_dim)
        )

    return problem.MDE(
        n_items=n_items,
        embedding_dim=embedding_dim,
        edges=edges,
        distortion_function=loss(deviations),
        constraint=constraint,
        device=device,
    )
def preserve_neighbors(
    data,
    embedding_dim=2,
    attractive_penalty=penalties.Log1p,
    repulsive_penalty=penalties.Log,
    constraint=None,
    n_neighbors=None,
    repulsive_fraction=None,
    max_distance=None,
    init="quadratic",
    device="cpu",
    verbose=False,
) -> problem.MDE:
    """Construct an MDE problem designed to preserve local structure.

    This function constructs an MDE problem for preserving the
    local structure of original data. This MDE problem is well-suited for
    visualization (using ``embedding_dim`` 2 or 3), but can also be used to
    generate features for machine learning tasks (with ``embedding_dim`` = 10,
    50, or 100, for example). It yields embeddings in which similar items
    are near each other, and dissimilar items are not near each other.

    The original data can either be a data matrix, or a graph.
    Data matrices should be torch Tensors, NumPy arrays, or scipy sparse
    matrices; graphs should be instances of ``pymde.Graph``.

    The MDE problem uses distortion functions derived from weights (i.e.,
    penalties).

    To obtain an embedding, call the ``embed`` method on the returned ``MDE``
    object. To plot it, use ``pymde.plot``.

    .. code:: python3

        embedding = pymde.preserve_neighbors(data).embed()
        pymde.plot(embedding)

    Arguments
    ---------
    data: {torch.Tensor, numpy.ndarray, scipy.sparse matrix}(
            shape=(n_items, n_features)) or pymde.Graph
        The original data, a data matrix or a graph. Neighbors are
        computed using Euclidean distance if the data is a matrix,
        or the shortest-path metric if the data is a graph.
    embedding_dim: int
        The embedding dimension. Use 2 or 3 for visualization.
    attractive_penalty: pymde.Function class (or factory)
        Callable that constructs a distortion function, given positive
        weights. Typically one of the classes from ``pymde.penalties``,
        such as ``pymde.penalties.Log1p``, ``pymde.penalties.Huber``, or
        ``pymde.penalties.Quadratic``.
    repulsive_penalty: pymde.Function class (or factory)
        Callable that constructs a distortion function, given negative
        weights. (If ``None``, only positive weights are used.) For example,
        ``pymde.penalties.Log`` or ``pymde.penalties.InversePower``.
    constraint: pymde.constraints.Constraint (optional)
        Embedding constraint, like ``pymde.Standardized()`` or
        ``pymde.Anchored(anchors, values)`` (or ``None``). Defaults to a
        centering constraint (``constraints.Centered()``) when a repulsive
        penalty is provided, otherwise defaults to ``pymde.Standardized()``.
    n_neighbors: int (optional)
        The number of nearest neighbors to compute for each row (item) of
        ``data``. By default a value between 5 and 15 is chosen, depending
        on the number of items. Values of ``n_items`` or more are reduced
        to ``n_items - 1`` (with a warning), since an item cannot have more
        neighbors than there are other items.
    repulsive_fraction: float (optional)
        How many repulsive edges to include, relative to the number
        of attractive edges. ``1`` means as many repulsive edges as attractive
        edges. The higher this number, the more uniformly spread out the
        embedding will be. Defaults to ``0.5`` for standardized embeddings, and
        ``1`` otherwise. (If ``repulsive_penalty`` is ``None``, this argument
        is ignored.)
    max_distance: float (optional)
        If not None, neighborhoods are restricted to have a radius
        no greater than ``max_distance``. For graph input, defaults to three
        times the 0.75 quantile of the graph's distances.
    init: str
        Initialization strategy; 'quadratic' or 'random'.
    device: str (optional)
        Device for the embedding (eg, 'cpu', 'cuda').
    verbose: bool
        If ``True``, print verbose output.

    Returns
    -------
    pymde.MDE
        A ``pymde.MDE`` object, based on the original data.

    Raises
    ------
    ValueError
        If ``data`` is a matrix with fewer than two rows, or if ``init`` is
        neither 'quadratic' nor 'random'.
    """
    if isinstance(data, preprocess.graph.Graph):
        n = data.n_items
    elif data.shape[0] <= 1:
        raise ValueError("The data matrix must have at least two rows.")
    else:
        n = data.shape[0]

    # Reject an unsupported `init` before the (potentially expensive)
    # nearest-neighbor computation rather than after it.
    if init not in ("quadratic", "random"):
        raise ValueError(
            f"Unsupported value '{init}' for keyword argument `init`; "
            "the supported values are 'quadratic' and 'random'."
        )

    if n_neighbors is None:
        # target included edges to be ~1% of total number of edges
        n_choose_2 = n * (n - 1) / 2
        n_neighbors = int(max(min(15, n_choose_2 * 0.01 / n), 5))

    # An item has at most n - 1 neighbors, so n_neighbors == n must be
    # clamped as well (this happens by default when n == 5).
    if n_neighbors >= n:
        problem.LOGGER.warning(
            (
                "Requested n_neighbors {0} >= number of items {1}."
                " Setting n_neighbors to {2}"
            ).format(n_neighbors, n, n - 1)
        )
        n_neighbors = n - 1

    if constraint is None and repulsive_penalty is not None:
        constraint = constraints.Centered()
    elif constraint is None and repulsive_penalty is None:
        constraint = constraints.Standardized()

    if isinstance(data, preprocess.graph.Graph):
        # enforce a max distance, otherwise may very well run out of memory
        # when n_items is large
        if max_distance is None:
            max_distance = (3 * torch.quantile(data.distances, 0.75)).item()

    if verbose:
        problem.LOGGER.info(
            f"Computing {n_neighbors}-nearest neighbors, with "
            f"max_distance={max_distance}"
        )

    knn_graph = preprocess.generic.k_nearest_neighbors(
        data,
        k=n_neighbors,
        max_distance=max_distance,
        verbose=verbose,
    )
    edges = knn_graph.edges.to(device)
    weights = knn_graph.weights.to(device)

    if init == "quadratic":
        if verbose:
            problem.LOGGER.info("Computing quadratic initialization.")
        X_init = quadratic.spectral(
            n, embedding_dim, edges, weights, device=device
        )
    else:
        # init == "random"; any other value was rejected above.
        X_init = constraint.initialization(n, embedding_dim, device)

    if repulsive_penalty is not None:
        if repulsive_fraction is None:
            if isinstance(constraint, constraints._Standardized):
                repulsive_fraction = 0.5
            else:
                repulsive_fraction = 1

        # Sample repulsive pairs among the pairs that are not neighbors and
        # give each of them weight -1.
        n_repulsive = int(repulsive_fraction * edges.shape[0])
        negative_edges = preprocess.sample_edges(
            n, n_repulsive, exclude=edges
        ).to(device)
        edges = torch.cat([edges, negative_edges])

        negative_weights = -torch.ones(
            negative_edges.shape[0], dtype=X_init.dtype, device=device
        )
        weights = torch.cat([weights, negative_weights])

        f = penalties.PushAndPull(
            weights,
            attractive_penalty=attractive_penalty,
            repulsive_penalty=repulsive_penalty,
        )
    else:
        f = attractive_penalty(weights)

    mde = problem.MDE(
        n_items=n,
        embedding_dim=embedding_dim,
        edges=edges,
        distortion_function=f,
        constraint=constraint,
        device=device,
    )
    mde._X_init = X_init

    # TODO cache the graph for subsequent calls / constructor for MDE from graph
    distances = mde.distances(mde._X_init)
    if (distances == 0).any():
        # pathological scenario in which at least two points overlap can yield
        # non-differentiable average distortion. perturb the initialization to
        # mitigate.
        mde._X_init += 1e-4 * torch.randn(
            mde._X_init.shape,
            device=mde._X_init.device,
            dtype=mde._X_init.dtype,
        )
    return mde
def __init__(
    self,
    n_items: int,
    embedding_dim: int,
    edges: torch.Tensor,
    distortion_function: tp.Union[tp.Callable, StochasticFunction],
    constraint: tp.Optional[constraints.Constraint] = None,
    device: tp.Optional[str] = None,
):
    """Set up an MDE problem.

    Arguments
    ---------
    n_items: int
        Number of things being embedded.
    embedding_dim: int
        Embedding dimension.
    edges: torch.Tensor(shape=(num_edges, 2), dtype=torch.int)
        Tensor, where each row is an edge (i, j) between two items; each
        edge should satisfy 0 <= i < j < n_items. In particular
        self-edges are not allowed. Non-tensor input is converted to an
        int64 tensor. May be ``None`` only when ``distortion_function``
        is a ``StochasticFunction``.
    distortion_function: Callable or pymde.functions.StochasticFunction
        The vectorized distortion function, typically an instance of
        a class from ``pymde.penalties`` or ``pymde.losses``; however,
        this can be any Python callable that maps a torch.Tensor of
        embedding distances to a torch.Tensor of distortions. A
        ``torch.nn.Module`` living on a different device is deep-copied
        and the copy is moved to the problem's device.
    constraint: pymde.constraints.Constraint, optional
        A Constraint object, such as ``pymde.Standardized()``.
        Defaults to ``constraints.Centered()``.
    device: str, optional
        Name of device on which to store tensors/compute embedding,
        such as 'cpu' or 'cuda' for GPU. When omitted, the device of
        ``edges`` is used if ``edges`` is a tensor and
        ``distortion_function`` is a module on that same device;
        otherwise 'cpu' is used.

    Raises
    ------
    ValueError
        If ``edges`` is ``None`` without a stochastic distortion function,
        if ``edges`` contains self edges, or if there are more edges than
        (n_items choose 2).
    """
    super(MDE, self).__init__()

    if device is None:
        # Adopt the device shared by `edges` and the distortion module when
        # both are available and agree; everything else falls back to CPU.
        device = "cpu"
        if isinstance(edges, torch.Tensor) and isinstance(
            distortion_function, torch.nn.Module
        ):
            module_device = _module_device(distortion_function)
            if str(edges.device) == str(module_device):
                device = edges.device
    self.device = _canonical_device(device)

    def _on_problem_device(value):
        # Wrap plain values in a tensor on the problem's device; move
        # tensors that live elsewhere; leave matching tensors untouched.
        if not isinstance(value, torch.Tensor):
            return torch.tensor(value, device=self.device)
        if str(value.device) != str(self.device):
            return value.to(self.device)
        return value

    n_items = _on_problem_device(n_items)
    self.register_buffer("n_items", n_items)

    embedding_dim = _on_problem_device(embedding_dim)
    self.register_buffer("embedding_dim", embedding_dim)

    if edges is not None:
        if not isinstance(edges, torch.Tensor):
            edges = torch.tensor(edges, dtype=torch.int64, device=self.device)

        is_self_edge = edges[:, 0] == edges[:, 1]
        if is_self_edge.any():
            offending = torch.where(is_self_edge)[0]
            raise ValueError(
                "The edge list must not contain self edges; the "
                "following rows were found to be self edges: ",
                offending.cpu().numpy(),
            )

        if str(edges.device) != str(self.device):
            # The message reports `device` as given (or inferred above),
            # not the canonicalized `self.device`.
            LOGGER.warning(
                "edges.device (%s) "
                "does not match requested device (%s); copying edges to "
                "requested device." % (edges.device, device)
            )
            edges = edges.to(self.device)

        p = torch.tensor(edges.shape[0], device=self.device)
    elif isinstance(distortion_function, StochasticFunction):
        # Without an explicit edge list, the edge count comes from the
        # stochastic function.
        p = distortion_function.p
    else:
        raise ValueError(
            "edges can only be None when using a stochastic function."
        )

    complete_graph_edges = n_items * (n_items - 1) // 2
    if p is not None and p > complete_graph_edges:
        raise ValueError(
            "Your graph has more than (n_items choose 2) edges."
            "(p: {0}, n_items choose 2: {1})".format(p, complete_graph_edges)
        )

    self.register_buffer("edges", edges)
    self.register_buffer("p", p)
    self.register_buffer("_complete_graph_edges", complete_graph_edges)
    if edges is not None:
        # One index buffer per edge endpoint column, derived by
        # `_gather_indices` from the endpoints and the embedding dimension.
        for buffer_name, column in (("_lhs", 0), ("_rhs", 1)):
            self.register_buffer(
                buffer_name,
                _gather_indices(edges[:, column], self.embedding_dim),
            )

    if isinstance(distortion_function, torch.nn.Module):
        f_device = _module_device(distortion_function)
        if f_device is None or str(f_device) != str(self.device):
            LOGGER.warning(
                "distortion_function device (%s) "
                "does not match requested device (%s); making a copy of "
                "distortion_function" % (str(f_device), device)
            )
            # Copy first so the caller's module is not moved.
            distortion_function = copy.deepcopy(distortion_function)
            distortion_function.to(self.device)
    self.distortion_function = distortion_function

    self.constraint = (
        constraints.Centered() if constraint is None else constraint
    )

    # Placeholders for the embedding and its initialization; both start
    # out unset.
    self.register_buffer("X", None)
    self.register_buffer("_X_init", None)

    self.solve_stats = None
    self.value = None
    self.residual_norm = None