def torgerson(distances, n_components=2, eigen_solver="auto"): """ Perform classical mds (Torgerson-Gower scaling). ..note :: If the distances are euclidean then this is equivalent to projecting the original data points to the first `n` principal components. Parameters ---------- distances : (N, N) ndarray Input distance (dissimilarity) matrix. n_components : int Number of components to return eigen_solver : str One of `lapack`, `arpack` or `'auto'`. The later chooses between the former based on the input. See Also -------- `cmdscale` in R """ distances = np.asarray(distances) assert distances.shape[0] == distances.shape[1] N = distances.shape[0] # O ^ 2 D_sq = distances**2 # double center the D_sq rsum = np.sum(D_sq, axis=1, keepdims=True) csum = np.sum(D_sq, axis=0, keepdims=True) total = np.sum(csum) D_sq -= rsum / N D_sq -= csum / N D_sq += total / (N**2) B = np.multiply(D_sq, -0.5, out=D_sq) if eigen_solver == 'auto': if N > 200 and n_components < 10: # arbitrary - follow skl KernelPCA eigen_solver = 'arpack' else: eigen_solver = 'lapack' if eigen_solver == "arpack": v0 = np.random.RandomState(0xD06).uniform(-1, 1, B.shape[0]) w, v = arpack_eigh(B, k=n_components, v0=v0) assert np.all(np.diff(w) >= 0), "w was not in ascending order" U, L = v[:, ::-1], w[::-1] elif eigen_solver == "lapack": # lapack (d|s)syevr w, v = lapack_eigh(B, overwrite_a=True, eigvals=(max(N - n_components, 0), N - 1)) assert np.all(np.diff(w) >= 0), "w was not in ascending order" U, L = v[:, ::-1], w[::-1] else: raise ValueError(eigen_solver) assert L.shape == (min(n_components, N), ) assert U.shape == (N, min(n_components, N)) # Warn for (sufficiently) negative eig values ... neg = L < -5 * np.finfo(L.dtype).eps if np.any(neg): warnings.warn( ("{} of the {} eigenvalues were negative.".format( np.sum(neg), L.size)), UserWarning, stacklevel=2, ) # ... and clamp them all to 0 L[L < 0] = 0 if n_components > N: U = np.hstack((U, np.zeros((N, n_components - N)))) L = np.hstack((L, np.zeros((n_components - N)))) U = U[:, :n_components] L = L[:n_components] return U * np.sqrt(L.reshape((1, n_components)))
def torgerson(distances, n_components=2, eigen_solver="auto"): """ Perform classical mds (Torgerson-Gower scaling). ..note :: If the distances are euclidean then this is equivalent to projecting the original data points to the first `n` principal components. Parameters ---------- distances : (N, N) ndarray Input distance (dissimilarity) matrix. n_components : int Number of components to return eigen_solver : str One of `lapack`, `arpack` or `'auto'`. The later chooses between the former based on the input. See Also -------- `cmdscale` in R """ distances = np.asarray(distances) assert distances.shape[0] == distances.shape[1] N = distances.shape[0] # O ^ 2 D_sq = distances ** 2 # double center the D_sq rsum = np.sum(D_sq, axis=1, keepdims=True) csum = np.sum(D_sq, axis=0, keepdims=True) total = np.sum(csum) D_sq -= rsum / N D_sq -= csum / N D_sq += total / (N ** 2) B = np.multiply(D_sq, -0.5, out=D_sq) if eigen_solver == 'auto': if N > 200 and n_components < 10: # arbitrary - follow skl KernelPCA eigen_solver = 'arpack' else: eigen_solver = 'lapack' if eigen_solver == "arpack": v0 = np.random.RandomState(0xD06).uniform(-1, 1, B.shape[0]) w, v = arpack_eigh(B, k=n_components, v0=v0) assert np.all(np.diff(w) >= 0), "w was not in ascending order" U, L = v[:, ::-1], w[::-1] elif eigen_solver == "lapack": # lapack (d|s)syevr w, v = lapack_eigh(B, overwrite_a=True, eigvals=(max(N - n_components, 0), N - 1)) assert np.all(np.diff(w) >= 0), "w was not in ascending order" U, L = v[:, ::-1], w[::-1] else: raise ValueError(eigen_solver) assert L.shape == (min(n_components, N),) assert U.shape == (N, min(n_components, N)) # Warn for (sufficiently) negative eig values ... neg = L < -5 * np.finfo(L.dtype).eps if np.any(neg): warnings.warn( ("{} of the {} eigenvalues were negative." .format(np.sum(neg), L.size)), UserWarning, stacklevel=2, ) # ... and clamp them all to 0 L[L < 0] = 0 if n_components > N: U = np.hstack((U, np.zeros((N, n_components - N)))) L = np.hstack((L, np.zeros((n_components - N)))) U = U[:, :n_components] L = L[:n_components] return U * np.sqrt(L.reshape((1, n_components)))