def ripser(X, maxdim=1, thresh=np.inf, coeff=2, distance_matrix=False,
           do_cocycles=False, metric='euclidean'):
    """Compute persistence diagrams for X data array.

    If X is not a distance matrix, it is converted to one with
    ``pairwise_distances`` using the chosen metric.

    Parameters
    ----------
    X: ndarray (n_samples, n_features)
        A numpy array of either data or distance matrix.
        Can also be a sparse distance matrix of type scipy.sparse
    maxdim : int, optional, default 1
        Maximum homology dimension computed. Will compute all dimensions
        lower than and equal to this value.
        For 1, H_0 and H_1 will be computed.
    thresh : float, default infinity
        Maximum distances considered when constructing filtration.
        If infinity, compute the entire filtration.
    coeff : int prime, default 2
        Compute homology with coefficients in the prime field Z/pZ for
        p=coeff.
    distance_matrix: bool
        Indicator that X is a distance matrix, if not we compute a
        distance matrix from X using the chosen metric.
    do_cocycles: bool
        Indicator of whether to compute cocycles. The flag is forwarded to
        the backend as an int; the unwrapped result is returned under the
        ``'cocycles'`` key.
    metric: string or callable
        The metric to use when calculating distance between instances in a
        feature array. It is forwarded unchanged to ``pairwise_distances``
        (e.g. "euclidean", "manhattan", or "cosine"). Alternatively, if
        metric is a callable function, it is called on each pair of
        instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value
        indicating the distance between them.

    Returns
    -------
    dict
        A dictionary holding all of the results of the computation:

        'dgms': list of ndarray (n_pairs, 2)
            A list of persistence diagrams, one per entry returned by the
            backend (one per computed homology dimension, see ``maxdim``).
            Each diagram is an ndarray of size (n_pairs, 2) with the first
            column representing the birth time and the second column
            representing the death time of each pair.
        'cocycles': list of list of ndarray
            A list of representative cocycles in each dimension. The list
            in each dimension is parallel to the diagram in that
            dimension. The cocycles at list position ``dim`` are int64
            arrays with ``dim + 2`` columns; the last column is reduced
            modulo ``coeff``.
        'num_edges': int
            The number of edges added during the computation
        'dm' : ndarray (n_samples, n_samples)
            The distance matrix used in the computation

    Raises
    ------
    ValueError
        If the (given or computed) distance matrix is not square.

    Examples
    --------
    .. code:: python

        from ripser import ripser, plot_dgms
        from sklearn import datasets

        data = datasets.make_circles(n_samples=110)[0]
        dgms = ripser(data)['dgms']
        plot_dgms(dgms)
    """
    if not distance_matrix:
        n_rows, n_cols = X.shape[0], X.shape[1]
        if n_rows == n_cols:
            warnings.warn(
                "The input matrix is square, but the distance_matrix flag is off. Did you mean to indicate that this was a distance matrix?"
            )
        elif n_rows < n_cols:
            warnings.warn(
                "The input point cloud has more columns than rows; did you mean to transpose?"
            )
        X = pairwise_distances(X, metric=metric)

    if X.shape[0] != X.shape[1]:
        # ValueError is a subclass of Exception, so callers that catch the
        # previously raised bare Exception keep working.
        raise ValueError('Distance matrix is not square')
    dm = X
    n_points = dm.shape[0]

    if sparse.issparse(dm):
        coo = dm.tocoo().astype(np.float32)
        # Hand the (row, col, value) triplets over in lexicographic
        # (row, then column) order instead of relying on whatever order the
        # input matrix happened to store them in. lexsort is stable, so an
        # already sorted matrix is passed through unchanged.
        order = np.lexsort((coo.col, coo.row))
        res = DRFDMSparse(coo.row[order], coo.col[order], coo.data[order],
                          n_points, maxdim, thresh, coeff, int(do_cocycles))
    else:
        # With the default 'xy' indexing I holds column indices and J row
        # indices, so I > J selects the strict upper triangle, flattened in
        # row-major order.
        I, J = np.meshgrid(np.arange(n_points), np.arange(n_points))
        DParam = np.array(dm[I > J], dtype=np.float32)
        res = DRFDM(DParam, maxdim, thresh, coeff, int(do_cocycles))

    # Unwrap persistence diagrams: each one arrives as a flat sequence of
    # values that is reshaped into rows of two.
    dgms = res['births_and_deaths_by_dim']
    for dim, flat_dgm in enumerate(dgms):
        dgms[dim] = np.reshape(np.array(flat_dgm), [len(flat_dgm) // 2, 2])

    # Unwrap cocycles: a flat cocycle at list position `dim` is reshaped into
    # rows of dim + 2 entries; the last entry of each row is reduced mod coeff.
    cocycles = []
    for dim, flat_cocycles in enumerate(res['cocycles_by_dim']):
        width = dim + 2
        unwrapped = []
        for flat in flat_cocycles:
            ccl = np.reshape(np.array(flat, dtype=np.int64),
                             [len(flat) // width, width])
            ccl[:, -1] = np.mod(ccl[:, -1], coeff)
            unwrapped.append(ccl)
        cocycles.append(unwrapped)

    return {
        'dgms': dgms,
        'cocycles': cocycles,
        'num_edges': res['num_edges'],
        'dm': dm,
    }
def ripser(
    X,
    maxdim=1,
    thresh=np.inf,
    coeff=2,
    distance_matrix=False,
    do_cocycles=False,
    do_cycles=False,
    metric="euclidean",
    n_perm=None,
):
    """Compute persistence diagrams for X.

    X can be a data set of points or a distance matrix. When using a data
    set as X it will be converted to a distance matrix using the metric
    specified.

    Parameters
    ----------
    X : ndarray (n_samples, n_features)
        A numpy array of either data or distance matrix (also pass
        `distance_matrix=True`). Can also be a sparse distance matrix of
        type scipy.sparse
    maxdim: int, optional, default 1
        Maximum homology dimension computed. Will compute all dimensions
        lower than and equal to this value.
        For 1, H_0 and H_1 will be computed.
    thresh: float, default infinity
        Maximum distances considered when constructing filtration.
        If infinity, compute the entire filtration.
    coeff: int prime, default 2
        Compute homology with coefficients in the prime field Z/pZ for
        p=coeff.
    distance_matrix: bool, optional, default False
        When True the input matrix X will be considered a distance matrix.
    do_cocycles: bool, optional, default False
        Computed cocycles will be available in the `cocycles` value of the
        return dictionary. Ignored when `do_cycles` is True.
    do_cycles: bool, optional, default False
        Computed cycles will be available in the `cycles` value of the
        return dictionary. When True, the cycle backends are used and the
        result carries `cycles` and `dim_0_pairs` instead of `cocycles`.
    metric: string or callable, optional, default "euclidean"
        Use this metric to compute distances between rows of X.
        "euclidean", "manhattan" and "cosine" are already provided metrics
        to choose from by using their name.
        You can provide a callable function and it will be used with two
        rows as arguments, it will be called once for each pair of rows in
        X. The computed distance will be available in the result dictionary
        under the key `dperm2all`.
    n_perm: int, optional, default None
        The number of points to subsample in a "greedy permutation," or a
        furthest point sampling of the points. These points will be used in
        lieu of the full point cloud for a faster computation, at the
        expense of some accuracy, which can be bounded as a maximum
        bottleneck distance to all diagrams on the original point set.
        ``None``, ``0`` and a value equal to the number of rows all mean
        "use every point".

    Returns
    -------
    dict
        The result of the computation.

        .. note::
            Each list in `dgms` has a relative list in `cocycles`.

            >>> r = ripser(...)

            For each dimension ``d`` and index ``k`` then
            ``r['dgms'][d][k]`` is the barcode associated to the
            representative cocycle ``r['cocycles'][d][k]``.

        The keys available in the dictionary are the:

        * ``dgms``: list of ndarray (n_pairs, 2)
            One persistence diagram per entry returned by the backend (one
            per computed homology dimension, see ``maxdim``). Each row is
            a pair (birth time, death time).
        * ``cycles``: list of list of ndarray (only if ``do_cycles``)
            For each dimension a list of cycles. Each cycle is an int64
            array with two columns, built from the flat sequence returned
            by the backend, with its second row moved to the end.
        * ``dim_0_pairs``: ndarray (n, 2) (only if ``do_cycles``)
            The backend's flat ``dim_0_pairs`` sequence padded with
            ``[0, nan]`` (even length) or ``[nan]`` (odd length) and
            reshaped into rows of two.
        * ``cocycles``: list of list of ndarray (only if not ``do_cycles``)
            For each dimension a list of representative cocycles. A cocycle
            at list position ``d`` is an int64 array with ``d + 2``
            columns: the leading columns are the indices of the simplex
            and the last column is the value of the cocycle on the simplex
            (reduced modulo ``coeff``). The indices of the simplex
            reference the original point cloud, even if a greedy
            permutation was used.
        * ``num_edges``: int
            The number of edges added during the computation
        * ``dperm2all``: ndarray(n_samples, n_samples) or
          ndarray (n_perm, n_samples) if n_perm
            The distance matrix used during the computation, or, when a
            greedy permutation is performed, the distance array returned
            by ``get_greedy_perm``.
        * ``idx_perm``: ndarray(n_perm) if ``n_perm`` > 0
            Index into the original point cloud of the points used as a
            subsample in the greedy permutation; ``arange(n_samples)`` when
            no permutation is performed.

            >>> r = ripser(X, n_perm=k)
            >>> subsampling = X[r['idx_perm']]

        * ``r_cover``: float
            Covering radius of the subsampled points.
            If no permutation was performed, the full point cloud was used
            and this is 0

    Examples
    --------
    .. code:: python

        from ripser import ripser
        from sklearn import datasets
        from persim import plot_diagrams

        data = datasets.make_circles(n_samples=110)[0]
        dgms = ripser(data, do_cycles=True)['dgms']
        plot_diagrams(dgms, show=True)

    Raises
    ------
    ValueError
        If the distance matrix is not square.
    ValueError
        When using both a greedy permutation and a sparse distance matrix.
    ValueError
        When `n_perm` value is bigger than the number of rows in the
        matrix.
    ValueError
        When `n_perm` is negative.

    Warns
    -----
    When using a square matrix without toggling `distance_matrix` to True.
    When there are more columns than rows (as each row is a different
    data point).
    """
    n_rows = X.shape[0]
    if distance_matrix:
        if n_rows != X.shape[1]:
            raise ValueError("Distance matrix is not square")
    else:
        if n_rows == X.shape[1]:
            warnings.warn(
                "The input matrix is square, but the distance_matrix "
                "flag is off. Did you mean to indicate that "
                "this was a distance matrix?")
        elif n_rows < X.shape[1]:
            warnings.warn(
                "The input point cloud has more columns than rows; "
                "did you mean to transpose?")

    # n_perm of None or 0 is falsy and therefore skips every check below.
    if n_perm and distance_matrix and sparse.issparse(X):
        raise ValueError(
            "Greedy permutation is not supported for sparse distance matrices")
    if n_perm and n_perm > n_rows:
        raise ValueError("Number of points in greedy permutation is greater"
                         " than number of points in the point cloud")
    if n_perm and n_perm < 0:
        raise ValueError(
            "Should be a strictly positive number of points in the greedy permutation"
        )

    idx_perm = np.arange(n_rows)
    r_cover = 0.0
    doing_permutation = False
    if n_perm and n_perm < n_rows:
        doing_permutation = True
        idx_perm, lambdas, dperm2all = get_greedy_perm(
            X, n_perm=n_perm, distance_matrix=distance_matrix, metric=metric)
        r_cover = lambdas[-1]
        # Keep only the columns of the sampled points, giving a square matrix
        # among the subsample.
        dm = dperm2all[:, idx_perm]
    else:
        dm = X if distance_matrix else pairwise_distances(X, metric=metric)
        dperm2all = dm

    n_points = dm.shape[0]
    if not sparse.issparse(dm) and np.any(np.abs(dm.diagonal()) > 0):
        # If any of the diagonal elements are nonzero,
        # convert to sparse format, because currently
        # that's the only format that handles nonzero
        # births
        dm = sparse.coo_matrix(dm)

    if sparse.issparse(dm):
        # The triplets must be ordered lexicographically by (row, column) to
        # avoid errors (see issue #103). Sort unconditionally rather than
        # trusting the storage order of the input or of the conversion;
        # lexsort is stable, so already ordered data is left untouched.
        coo = dm.tocoo()
        order = np.lexsort((coo.col, coo.row))
        rows = coo.row[order].astype(dtype=np.int32, order="C")
        cols = coo.col[order].astype(dtype=np.int32, order="C")
        vals = np.array(coo.data[order], dtype=np.float32, order="C")
        if do_cycles:
            res = DRFDMSparseCycles(
                rows, cols, vals, n_points, maxdim, thresh, coeff)
        else:
            # NOTE(review): unlike the dense DRFDM call below, do_cocycles is
            # not forwarded here -- confirm against the DRFDMSparse
            # signature whether that is intended.
            res = DRFDMSparse(
                rows, cols, vals, n_points, maxdim, thresh, coeff)
    else:
        # With the default 'xy' indexing I holds column indices and J row
        # indices, so I > J selects the strict upper triangle, flattened in
        # row-major order.
        I, J = np.meshgrid(np.arange(n_points), np.arange(n_points))
        DParam = np.array(dm[I > J], dtype=np.float32)
        if do_cycles:
            res = DRFDMCycles(DParam, maxdim, thresh, coeff)
        else:
            res = DRFDM(DParam, maxdim, thresh, coeff, do_cocycles)

    # Unwrap persistence diagrams: each one arrives as a flat sequence of
    # values that is reshaped into rows of two.
    dgms = res["births_and_deaths_by_dim"]
    for dim, flat_dgm in enumerate(dgms):
        dgms[dim] = np.reshape(np.array(flat_dgm), [len(flat_dgm) // 2, 2])

    if do_cycles:
        # Unwrap cycles: every flat cycle becomes an (n, 2) int64 array.
        # NOTE(review): with a greedy permutation these indices are left as
        # returned by the backend (not mapped through idx_perm, unlike the
        # cocycles below) -- confirm that this is intended.
        cycles = []
        for flat_cycles in res["cycles_by_dim"]:
            unwrapped = []
            for flat in flat_cycles:
                n = len(flat) // 2
                ccl = np.reshape(np.array(flat, dtype=np.int64), [n, 2])
                if n > 1:
                    # Move the second row to the end (rows 0, 2, 3, ..., 1).
                    # Guarded so a cycle with fewer than two rows no longer
                    # raises IndexError on ccl[1].
                    ccl = np.concatenate(
                        (ccl[:1], ccl[2:], ccl[1].reshape(1, -1)), axis=0)
                unwrapped.append(ccl)
            cycles.append(unwrapped)

        # Pad the flat pair list so it can be reshaped into rows of two: an
        # even-length list gets a trailing [0, nan] row, an odd-length one is
        # completed with a single nan.
        pairs = np.array(res["dim_0_pairs"])
        if len(pairs) % 2 == 0:
            pairs = np.append(pairs, np.array([0, np.nan]))
        else:
            pairs = np.append(pairs, np.array([np.nan]))
        dim_0_pairs = np.reshape(pairs, (len(pairs) // 2, 2))

        return {
            "dgms": dgms,
            "dim_0_pairs": dim_0_pairs,
            "cycles": cycles,
            "num_edges": res["num_edges"],
            "dperm2all": dperm2all,
            "idx_perm": idx_perm,
            "r_cover": r_cover,
        }

    # Unwrap cocycles: a flat cocycle at list position `dim` is reshaped into
    # rows of dim + 2 entries; the last entry of each row is reduced mod coeff.
    cocycles = []
    for dim, flat_cocycles in enumerate(res["cocycles_by_dim"]):
        width = dim + 2
        unwrapped = []
        for flat in flat_cocycles:
            ccl = np.reshape(np.array(flat, dtype=np.int64),
                             [len(flat) // width, width])
            ccl[:, -1] = np.mod(ccl[:, -1], coeff)
            if doing_permutation:
                # Retain original indices in the point cloud
                ccl[:, 0:-1] = idx_perm[ccl[:, 0:-1]]
            unwrapped.append(ccl)
        cocycles.append(unwrapped)

    return {
        "dgms": dgms,
        "cocycles": cocycles,
        "num_edges": res["num_edges"],
        "dperm2all": dperm2all,
        "idx_perm": idx_perm,
        "r_cover": r_cover,
    }
def ripser(
    X,
    maxdim=1,
    thresh=np.inf,
    coeff=2,
    distance_matrix=False,
    do_cocycles=False,
    metric="euclidean",
    n_perm=None,
):
    """Compute persistence diagrams for X data array.

    If X is not a distance matrix, it will be converted to a distance
    matrix using the chosen metric.

    Parameters
    ----------
    X: ndarray (n_samples, n_features)
        A numpy array of either data or distance matrix.
        Can also be a sparse distance matrix of type scipy.sparse
    maxdim: int, optional, default 1
        Maximum homology dimension computed. Will compute all dimensions
        lower than and equal to this value.
        For 1, H_0 and H_1 will be computed.
    thresh: float, default infinity
        Maximum distances considered when constructing filtration.
        If infinity, compute the entire filtration.
    coeff: int prime, default 2
        Compute homology with coefficients in the prime field Z/pZ for
        p=coeff.
    distance_matrix: bool
        Indicator that X is a distance matrix, if not we compute a
        distance matrix from X using the chosen metric.
    do_cocycles: bool
        Indicator of whether to compute cocycles. The flag is forwarded to
        the backend as an int; the unwrapped result is returned under the
        ``'cocycles'`` key.
    metric: string or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        specified in pairwise_distances, including "euclidean",
        "manhattan", or "cosine". Alternatively, if metric is a callable
        function, it is called on each pair of instances (rows) and the
        resulting value recorded. The callable should take two arrays from
        X as input and return a value indicating the distance between them.
    n_perm: int
        The number of points to subsample in a "greedy permutation," or a
        furthest point sampling of the points. These points will be used in
        lieu of the full point cloud for a faster computation, at the
        expense of some accuracy, which can be bounded as a maximum
        bottleneck distance to all diagrams on the original point set.
        ``None`` or ``0`` disables the subsampling.

    Returns
    -------
    dict
        A dictionary holding all of the results of the computation:

        'dgms': list of ndarray (n_pairs, 2)
            A list of persistence diagrams, one per entry returned by the
            backend (one per computed homology dimension, see ``maxdim``).
            Each diagram is an ndarray of size (n_pairs, 2) with the first
            column representing the birth time and the second column
            representing the death time of each pair.
        'cocycles': list of list of ndarray
            A list of representative cocycles in each dimension. The list
            in each dimension is parallel to the diagram in that dimension;
            that is, each entry of the list is a representative cocycle of
            the corresponding point. A cocycle at list position ``d`` is an
            int64 ndarray with ``d + 2`` columns, one row per nonzero value
            of the cocycle. The leading columns index into the simplices of
            the (subsampled) point cloud, and the last column is the value
            of the cocycle at that simplex (reduced modulo ``coeff``).
        'num_edges': int
            The number of edges added during the computation
        'dperm2all': ndarray(n_samples, n_samples) or
                     ndarray (n_perm, n_samples) if n_perm
            The distance matrix used in the computation if n_perm is none.
            Otherwise, the distance from all points in the permutation to
            all points in the dataset
        'idx_perm': ndarray(n_perm) if n_perm > 0
            Index into the original point cloud of the points used as a
            subsample in the greedy permutation; ``arange(n_samples)`` when
            no permutation is performed.
        'r_cover': float
            Covering radius of the subsampled points.
            If no permutation was performed, the full point cloud was used
            and this is 0

    Raises
    ------
    ValueError
        If the distance matrix is not square, if a greedy permutation is
        requested for a sparse distance matrix, if ``n_perm`` exceeds the
        number of rows of X, or if ``n_perm`` is negative.

    Examples
    --------
    .. code:: python

        from ripser import ripser, plot_dgms
        from sklearn import datasets

        data = datasets.make_circles(n_samples=110)[0]
        dgms = ripser(data)['dgms']
        plot_dgms(dgms)
    """
    # All validation failures raise ValueError. It is a subclass of
    # Exception, so callers that catch the previously raised bare Exception
    # keep working.
    n_rows = X.shape[0]
    if distance_matrix:
        if n_rows != X.shape[1]:
            raise ValueError("Distance matrix is not square")
    else:
        if n_rows == X.shape[1]:
            warnings.warn(
                "The input matrix is square, but the distance_matrix "
                "flag is off. Did you mean to indicate that "
                "this was a distance matrix?")
        elif n_rows < X.shape[1]:
            warnings.warn(
                "The input point cloud has more columns than rows; "
                "did you mean to transpose?")

    # n_perm of None or 0 is falsy and therefore skips every check below.
    if n_perm and distance_matrix and sparse.issparse(X):
        raise ValueError(
            "Greedy permutation is not supported for sparse distance matrices")
    if n_perm and n_perm > n_rows:
        raise ValueError("Number of points in greedy permutation is greater"
                         " than number of points in the point cloud")
    if n_perm and n_perm < 0:
        raise ValueError(
            "Should be a strictly positive number of points in the greedy permutation"
        )

    idx_perm = np.arange(n_rows)
    r_cover = 0.0
    if n_perm:
        idx_perm, lambdas, dperm2all = get_greedy_perm(
            X, n_perm=n_perm, distance_matrix=distance_matrix, metric=metric)
        r_cover = lambdas[-1]
        # Keep only the columns of the sampled points, giving a square matrix
        # among the subsample.
        dm = dperm2all[:, idx_perm]
    else:
        dm = X if distance_matrix else pairwise_distances(X, metric=metric)
        dperm2all = dm

    n_points = dm.shape[0]
    if not sparse.issparse(dm) and np.any(np.abs(dm.diagonal()) > 0):
        # If any of the diagonal elements are nonzero,
        # convert to sparse format, because currently
        # that's the only format that handles nonzero
        # births
        dm = sparse.coo_matrix(dm)

    if sparse.issparse(dm):
        coo = dm.tocoo()
        # Hand the (row, col, value) triplets over in lexicographic
        # (row, then column) order instead of relying on whatever order the
        # input matrix happened to store them in. lexsort is stable, so an
        # already sorted matrix is passed through unchanged.
        order = np.lexsort((coo.col, coo.row))
        res = DRFDMSparse(
            coo.row[order].astype(dtype=np.int32, order="C"),
            coo.col[order].astype(dtype=np.int32, order="C"),
            np.array(coo.data[order], dtype=np.float32, order="C"),
            n_points,
            maxdim,
            thresh,
            coeff,
            int(do_cocycles),
        )
    else:
        # With the default 'xy' indexing I holds column indices and J row
        # indices, so I > J selects the strict upper triangle, flattened in
        # row-major order.
        I, J = np.meshgrid(np.arange(n_points), np.arange(n_points))
        DParam = np.array(dm[I > J], dtype=np.float32)
        res = DRFDM(DParam, maxdim, thresh, coeff, int(do_cocycles))

    # Unwrap persistence diagrams: each one arrives as a flat sequence of
    # values that is reshaped into rows of two.
    dgms = res["births_and_deaths_by_dim"]
    for dim, flat_dgm in enumerate(dgms):
        dgms[dim] = np.reshape(np.array(flat_dgm), [len(flat_dgm) // 2, 2])

    # Unwrap cocycles: a flat cocycle at list position `dim` is reshaped into
    # rows of dim + 2 entries; the last entry of each row is reduced mod coeff.
    cocycles = []
    for dim, flat_cocycles in enumerate(res["cocycles_by_dim"]):
        width = dim + 2
        unwrapped = []
        for flat in flat_cocycles:
            ccl = np.reshape(np.array(flat, dtype=np.int64),
                             [len(flat) // width, width])
            ccl[:, -1] = np.mod(ccl[:, -1], coeff)
            unwrapped.append(ccl)
        cocycles.append(unwrapped)

    return {
        "dgms": dgms,
        "cocycles": cocycles,
        "num_edges": res["num_edges"],
        "dperm2all": dperm2all,
        "idx_perm": idx_perm,
        "r_cover": r_cover,
    }