Example #1
0
def ripser(X,
           maxdim=1,
           thresh=np.inf,
           coeff=2,
           distance_matrix=False,
           do_cocycles=False,
           metric='euclidean'):
    """Compute persistence diagrams for X.

    X is either a point cloud or a distance matrix. When it is a point
    cloud it is first converted to a distance matrix with the chosen metric.

    Parameters
    ----------
    X: ndarray (n_samples, n_features)
        A numpy array of either data or distance matrix.
        Can also be a sparse distance matrix of type scipy.sparse

    maxdim : int, optional, default 1
        Maximum homology dimension computed. Will compute all dimensions lower
        than and equal to this value. For 1, H_0 and H_1 will be computed.

    thresh : float, default infinity
        Maximum distances considered when constructing filtration.
        If infinity, compute the entire filtration.

    coeff : int prime, default 2
        Compute homology with coefficients in the prime field Z/pZ for p=coeff.

    distance_matrix: bool, default False
        Indicator that X is a distance matrix. If False, a distance matrix
        is computed from X using the chosen metric.

    do_cocycles: bool, default False
        Indicator of whether to compute cocycles. The flag is forwarded to
        the backend; the unpacked cocycles are returned under the
        ``'cocycles'`` key of the result dictionary.

    metric: string or callable, default 'euclidean'
        The metric used to compute distances between rows of X. It is
        forwarded unchanged to ``pairwise_distances``, e.g. "euclidean",
        "manhattan" or "cosine". A callable should take two rows of X and
        return the distance between them.

    Returns
    -------
    dict
        A dictionary holding all of the results of the computation

        ``'dgms'``: list of ndarray (n_pairs, 2)
            One persistence diagram per homology dimension returned by the
            backend. The first column is the birth time and the second
            column is the death time of each pair.
        ``'cocycles'``: list of list of ndarray
            Representative cocycles, one inner list per dimension, parallel
            to the diagram of that dimension. The arrays at list index
            ``dim`` have ``dim + 2`` columns; the last column is the cocycle
            value reduced modulo ``coeff``.
        ``'num_edges'``: int
            The number of edges added during the computation
        ``'dm'``: ndarray (n_samples, n_samples)
            The distance matrix used in the computation

    Raises
    ------
    ValueError
        If the distance matrix is not square.

    Warns
    -----
    UserWarning
        If ``distance_matrix`` is False and X is square, or has more
        columns than rows.

    Examples
    --------
    .. code:: python

        from ripser import ripser, plot_dgms
        from sklearn import datasets

        data = datasets.make_circles(n_samples=110)[0]
        dgms = ripser(data)['dgms']
        plot_dgms(dgms)
    """
    if not distance_matrix:
        n_rows, n_cols = X.shape[0], X.shape[1]
        if n_rows == n_cols:
            warnings.warn(
                "The input matrix is square, but the distance_matrix flag is off.  Did you mean to indicate that this was a distance matrix?"
            )
        elif n_rows < n_cols:
            warnings.warn(
                "The input point cloud has more columns than rows; did you mean to transpose?"
            )
        X = pairwise_distances(X, metric=metric)

    if X.shape[0] != X.shape[1]:
        # ValueError is still caught by callers using `except Exception`.
        raise ValueError('Distance matrix is not square')
    dm = X
    n_points = dm.shape[0]

    if sparse.issparse(dm):
        coo = dm.tocoo().astype(np.float32)
        res = DRFDMSparse(coo.row, coo.col, coo.data, n_points, maxdim, thresh,
                          coeff, int(do_cocycles))
    else:
        # With the default 'xy' meshgrid indexing, I holds column indices and
        # J holds row indices, so `I > J` selects the entries strictly above
        # the diagonal, flattened in row-major order.
        I, J = np.meshgrid(np.arange(n_points), np.arange(n_points))
        DParam = np.array(dm[I > J], dtype=np.float32)
        res = DRFDM(DParam, maxdim, thresh, coeff, int(do_cocycles))

    # The backend returns each diagram as a flat sequence; fold every one
    # into an (n_pairs, 2) array of [birth, death] rows.
    dgms = res['births_and_deaths_by_dim']
    for dim, flat in enumerate(dgms):
        dgms[dim] = np.reshape(np.array(flat), [len(flat) // 2, 2])

    # Each cocycle is a flat sequence of records of `dim + 2` integers;
    # the last entry of each record is reduced modulo `coeff`.
    cocycles = []
    for dim, raw_cocycles in enumerate(res['cocycles_by_dim']):
        width = dim + 2
        unpacked = []
        for flat in raw_cocycles:
            ccl = np.reshape(np.array(flat, dtype=np.int64),
                             [len(flat) // width, width])
            ccl[:, -1] = np.mod(ccl[:, -1], coeff)
            unpacked.append(ccl)
        cocycles.append(unpacked)

    return {
        'dgms': dgms,
        'cocycles': cocycles,
        'num_edges': res['num_edges'],
        'dm': dm
    }
Example #2
0
def ripser(
    X,
    maxdim=1,
    thresh=np.inf,
    coeff=2,
    distance_matrix=False,
    do_cocycles=False,
    do_cycles=False,
    metric="euclidean",
    n_perm=None,
):
    """Compute persistence diagrams for X.

    X can be a data set of points or a distance matrix. When using a data set
    as X it will be converted to a distance matrix using the metric specified.

    Parameters
    ----------
    X : ndarray (n_samples, n_features)
        A numpy array of either data or distance matrix (also pass
        `distance_matrix=True`). Can also be a sparse distance matrix of
        type scipy.sparse

    maxdim: int, optional, default 1
        Maximum homology dimension computed. Will compute all dimensions
        lower than and equal to this value.  For 1, H_0 and H_1 will be
        computed.

    thresh: float, default infinity
        Maximum distances considered when constructing filtration.
        If infinity, compute the entire filtration.

    coeff: int prime, default 2
        Compute homology with coefficients in the prime field Z/pZ for p=coeff.

    distance_matrix: bool, optional, default False
        When True the input matrix X will be considered a distance matrix.

    do_cocycles: bool, optional, default False
        Computed cocycles will be available in the `cocycles` value
        of the return dictionary. Only used when `do_cycles` is False.

    do_cycles: bool, optional, default False
        Computed cycles will be available in the `cycles` value
        of the return dictionary. When True, the result holds `cycles` and
        `dim_0_pairs` instead of `cocycles`.

    metric: string or callable, optional, default "euclidean"
        Use this metric to compute distances between rows of X.

        "euclidean", "manhattan" and "cosine" are already provided metrics
        to choose from by using their name.

        You can provide a callable function and it will be used with two
        rows as arguments, it will be called once for each pair of rows in X.

        The computed distance will be available in the result dictionary under
        the key `dperm2all`.

    n_perm: int, optional, default None
        The number of points to subsample in a "greedy permutation,"
        or a furthest point sampling of the points.  These points
        will be used in lieu of the full point cloud for a faster
        computation, at the expense of some accuracy, which can
        be bounded as a maximum bottleneck distance to all diagrams
        on the original point set. ``None`` and ``0`` both mean "use every
        point"; so does a value equal to the number of points.

    Returns
    -------
    dict
        The result of the computation.

        .. note::
            Each list in `dgms` has a relative list in `cocycles`.

            >>> r = ripser(...)

            For each dimension ``d`` and index ``k`` then ``r['dgms'][d][k]``
            is the barcode associated to the representative cocycle
            ``r['cocycles'][d][k]``.

        The keys available in the dictionary are the:

            * ``dgms``: list of ndarray (n_pairs, 2)
                One persistence diagram per homology dimension returned by
                the backend. Each row is a pair (birth time, death time).
            * ``cycles``: list of list of ndarray (only if ``do_cycles``)
                For each dimension a list of cycles. Each cycle is an
                ndarray with two columns. The indices are not remapped
                through ``idx_perm``.
            * ``dim_0_pairs``: ndarray (n, 2) (only if ``do_cycles``)
                The backend's ``dim_0_pairs`` output, padded with a trailing
                ``nan`` sentinel and folded into rows of two.
            * ``cocycles``: list of list of ndarray (only if not ``do_cycles``)
                For each dimension a list of representative cocycles.
                Each representative cocycle at list index ``d`` is an
                ndarray of ``(k, d + 2)`` elements. Each non zero value of
                the cocycle is laid out in a row, first the indices of the
                simplex and then the value of the cocycle on the simplex
                (reduced modulo ``coeff``).  The indices of the simplex
                reference the original point cloud, even if a greedy
                permutation was used.
            * ``num_edges``: int
                The number of edges added during the computation
            * ``dperm2all``: ndarray(n_samples, n_samples) or ndarray (n_perm, n_samples) if n_perm
                The distance matrix used during the computation. When a
                greedy permutation is used this is the matrix returned by
                ``get_greedy_perm`` for the subsampled points.
            * ``idx_perm``: ndarray
                Index into the original point cloud of the points used
                as a subsample in the greedy permutation (all indices when
                no subsampling took place)

                    >>> r = ripser(X, n_perm=k)
                    >>> subsampling = X[r['idx_perm']]

            * ``r_cover``: float
                Covering radius of the subsampled points.
                If no subsampling took place this is 0

    Examples
    --------

    .. code:: python

        from ripser import ripser
        from sklearn import datasets
        from persim import plot_diagrams

        data = datasets.make_circles(n_samples=110)[0]
        dgms = ripser(data, do_cycles=True)['dgms']
        plot_diagrams(dgms, show = True)

    Raises
    ------

    ValueError
        If the distance matrix is not square.

    ValueError
        When using both a greedy permutation and a sparse distance matrix.

    ValueError
        When `n_perm` value is bigger than the number of rows in the matrix.

    ValueError
        When `n_perm` is negative.

    Warns
    -----

    UserWarning
        When using a square matrix without toggling `distance_matrix` to True.

        When there are more columns than rows (as each row is a different
        data point).

    """
    if distance_matrix:
        if X.shape[0] != X.shape[1]:
            raise ValueError("Distance matrix is not square")
    elif X.shape[0] == X.shape[1]:
        warnings.warn(
            "The input matrix is square, but the distance_matrix "
            "flag is off.  Did you mean to indicate that "
            "this was a distance matrix?")
    elif X.shape[0] < X.shape[1]:
        warnings.warn(
            "The input point cloud has more columns than rows; "
            "did you mean to transpose?")

    if n_perm:
        if distance_matrix and sparse.issparse(X):
            raise ValueError(
                "Greedy permutation is not supported for sparse distance matrices"
            )
        if n_perm > X.shape[0]:
            raise ValueError(
                "Number of points in greedy permutation is greater"
                " than number of points in the point cloud")
        if n_perm < 0:
            raise ValueError(
                "Should be a strictly positive number of points in the greedy permutation"
            )

    idx_perm = np.arange(X.shape[0])
    r_cover = 0.0
    # Subsample only when n_perm actually selects a strict subset.
    doing_permutation = bool(n_perm) and n_perm < X.shape[0]
    if doing_permutation:
        idx_perm, lambdas, dperm2all = get_greedy_perm(
            X, n_perm=n_perm, distance_matrix=distance_matrix, metric=metric)
        r_cover = lambdas[-1]
        dm = dperm2all[:, idx_perm]
    else:
        dm = X if distance_matrix else pairwise_distances(X, metric=metric)
        dperm2all = dm

    n_points = dm.shape[0]
    if not sparse.issparse(dm) and np.any(np.abs(dm.diagonal()) > 0):
        # If any of the diagonal elements are nonzero, convert to sparse
        # format, because currently that's the only format that handles
        # nonzero births
        dm = sparse.coo_matrix(dm)

    if sparse.issparse(dm):
        if sparse.isspmatrix_coo(dm):
            # If the matrix is already COO, we need to order the row and
            # column indices lexicographically to avoid errors.
            # See issue #103
            order = np.lexsort((dm.col, dm.row))
            row, col, data = dm.row[order], dm.col[order], dm.data[order]
        else:
            # Lexicographic ordering is performed by scipy upon conversion
            # to COO
            coo = dm.tocoo()
            row, col, data = coo.row, coo.col, coo.data

        # NOTE(review): unlike the dense branch, do_cocycles is not forwarded
        # to the sparse backend here -- confirm this is intended.
        sparse_backend = DRFDMSparseCycles if do_cycles else DRFDMSparse
        res = sparse_backend(
            row.astype(dtype=np.int32, order="C"),
            col.astype(dtype=np.int32, order="C"),
            np.array(data, dtype=np.float32, order="C"),
            n_points,
            maxdim,
            thresh,
            coeff,
        )
    else:
        # With the default 'xy' meshgrid indexing, I holds column indices and
        # J holds row indices, so `I > J` selects the entries strictly above
        # the diagonal, flattened in row-major order.
        I, J = np.meshgrid(np.arange(n_points), np.arange(n_points))
        DParam = np.array(dm[I > J], dtype=np.float32)
        if do_cycles:
            res = DRFDMCycles(DParam, maxdim, thresh, coeff)
        else:
            res = DRFDM(DParam, maxdim, thresh, coeff, do_cocycles)

    # The backend returns each diagram as a flat sequence; fold every one
    # into an (n_pairs, 2) array of [birth, death] rows.
    dgms = res["births_and_deaths_by_dim"]
    for dim, flat in enumerate(dgms):
        dgms[dim] = np.reshape(np.array(flat), [len(flat) // 2, 2])

    if do_cycles:
        cycles = []
        for raw_cycles in res["cycles_by_dim"]:
            unpacked = []
            for flat in raw_cycles:
                rows = np.reshape(np.array(flat, dtype=np.int64),
                                  [len(flat) // 2, 2])
                # Move the second row to the end, keeping the others in order.
                # NOTE(review): indices are not remapped through idx_perm, so
                # with a greedy permutation they refer to the subsample.
                rows = np.concatenate(
                    (rows[:1], rows[2:], rows[1].reshape(1, -1)), axis=0)
                unpacked.append(rows)
            cycles.append(unpacked)

        # Built once, outside the cycle loops: it does not depend on any
        # individual cycle and must be available even when no cycle exists.
        pairs = np.array(res["dim_0_pairs"])
        padding = [0, np.nan] if len(pairs) % 2 == 0 else [np.nan]
        pairs = np.append(pairs, np.array(padding))
        dim_0_pairs = np.reshape(pairs, (len(pairs) // 2, 2))

        ret = {
            "dgms": dgms,
            "dim_0_pairs": dim_0_pairs,
            "cycles": cycles,
        }
    else:
        # Each cocycle is a flat sequence of records of `dim + 2` integers;
        # the last entry of each record is reduced modulo `coeff`.
        cocycles = []
        for dim, raw_cocycles in enumerate(res["cocycles_by_dim"]):
            width = dim + 2
            unpacked = []
            for flat in raw_cocycles:
                ccl = np.reshape(np.array(flat, dtype=np.int64),
                                 [len(flat) // width, width])
                ccl[:, -1] = np.mod(ccl[:, -1], coeff)
                if doing_permutation:
                    # Retain original indices in the point cloud
                    ccl[:, 0:-1] = idx_perm[ccl[:, 0:-1]]
                unpacked.append(ccl)
            cocycles.append(unpacked)

        ret = {
            "dgms": dgms,
            "cocycles": cocycles,
        }

    ret.update({
        "num_edges": res["num_edges"],
        "dperm2all": dperm2all,
        "idx_perm": idx_perm,
        "r_cover": r_cover,
    })
    return ret
Example #3
0
def ripser(
    X,
    maxdim=1,
    thresh=np.inf,
    coeff=2,
    distance_matrix=False,
    do_cocycles=False,
    metric="euclidean",
    n_perm=None,
):
    """Compute persistence diagrams for X.

    X is either a point cloud or a distance matrix. When it is a point
    cloud it is converted to a distance matrix using the chosen metric.

    Parameters
    ----------
    X: ndarray (n_samples, n_features)
        A numpy array of either data or distance matrix.
        Can also be a sparse distance matrix of type scipy.sparse

    maxdim: int, optional, default 1
        Maximum homology dimension computed. Will compute all dimensions
        lower than and equal to this value.
        For 1, H_0 and H_1 will be computed.

    thresh: float, default infinity
        Maximum distances considered when constructing filtration.
        If infinity, compute the entire filtration.

    coeff: int prime, default 2
        Compute homology with coefficients in the prime field Z/pZ for p=coeff.

    distance_matrix: bool, default False
        Indicator that X is a distance matrix. If False, a distance matrix
        is computed from X using the chosen metric.

    do_cocycles: bool, default False
        Indicator of whether to compute cocycles. The flag is forwarded to
        the backend; the unpacked cocycles are returned under the
        ``'cocycles'`` key of the result dictionary.

    metric: string or callable, default "euclidean"
        The metric to use when calculating distance between instances in a
        feature array. It is forwarded to ``pairwise_distances`` (or to
        ``get_greedy_perm`` when ``n_perm`` is given), e.g. "euclidean",
        "manhattan", or "cosine". Alternatively, if metric is a callable
        function, it is called on each pair of instances (rows) and the
        resulting value recorded. The callable should take two arrays from
        X as input and return a value indicating the distance between them.

    n_perm: int, optional, default None
        The number of points to subsample in a "greedy permutation,"
        or a furthest point sampling of the points.  These points
        will be used in lieu of the full point cloud for a faster
        computation, at the expense of some accuracy, which can
        be bounded as a maximum bottleneck distance to all diagrams
        on the original point set. ``None`` and ``0`` disable subsampling.

    Returns
    -------
    dict
        A dictionary holding all of the results of the computation

        ``'dgms'``: list of ndarray (n_pairs, 2)
            One persistence diagram per homology dimension returned by the
            backend. Each diagram is an ndarray of size (n_pairs, 2)
            with the first column representing the birth time and the
            second column representing the death time of each pair.
        ``'cocycles'``: list of list of ndarray
            A list of representative cocycles in each dimension.  The list
            in each dimension is parallel to the diagram in that dimension.
            The arrays at list index ``d`` have shape (K, d + 2), where K
            is the number of nonzero values of the cocycle. The leading
            columns index into the (subsampled) point cloud, and the last
            column is the value of the cocycle at that simplex, reduced
            modulo ``coeff``.
        ``'num_edges'``: int
            The number of edges added during the computation
        ``'dperm2all'``: ndarray(n_samples, n_samples) or ndarray (n_perm, n_samples) if n_perm
            The distance matrix used in the computation if n_perm is None.
            Otherwise, the distance from all points in the permutation to
            all points in the dataset
        ``'idx_perm'``: ndarray
            Index into the original point cloud of the points used
            as a subsample in the greedy permutation (all indices when
            n_perm is not given)
        ``'r_cover'``: float
            Covering radius of the subsampled points.
            If n_perm is not given, the full point cloud was used and this
            is 0

    Raises
    ------
    ValueError
        If the distance matrix is not square.

    ValueError
        When using both a greedy permutation and a sparse distance matrix.

    ValueError
        When `n_perm` is bigger than the number of rows in the matrix.

    ValueError
        When `n_perm` is negative.

    Warns
    -----
    UserWarning
        If ``distance_matrix`` is False and X is square, or has more
        columns than rows.

    Examples
    --------
    .. code:: python

        from ripser import ripser, plot_dgms
        from sklearn import datasets

        data = datasets.make_circles(n_samples=110)[0]
        dgms = ripser(data)['dgms']
        plot_dgms(dgms)
    """
    # Input validation raises ValueError, which is still caught by callers
    # that use `except Exception`.
    if distance_matrix:
        if X.shape[0] != X.shape[1]:
            raise ValueError("Distance matrix is not square")
    elif X.shape[0] == X.shape[1]:
        warnings.warn(
            "The input matrix is square, but the distance_matrix "
            "flag is off.  Did you mean to indicate that "
            "this was a distance matrix?")
    elif X.shape[0] < X.shape[1]:
        warnings.warn(
            "The input point cloud has more columns than rows; "
            "did you mean to transpose?")

    if n_perm:
        if distance_matrix and sparse.issparse(X):
            raise ValueError(
                "Greedy permutation is not supported for sparse distance matrices"
            )
        if n_perm > X.shape[0]:
            raise ValueError(
                "Number of points in greedy permutation is greater"
                " than number of points in the point cloud")
        if n_perm < 0:
            raise ValueError(
                "Should be a strictly positive number of points in the greedy permutation"
            )

    idx_perm = np.arange(X.shape[0])
    r_cover = 0.0
    if n_perm:
        idx_perm, lambdas, dperm2all = get_greedy_perm(
            X, n_perm=n_perm, distance_matrix=distance_matrix, metric=metric)
        r_cover = lambdas[-1]
        dm = dperm2all[:, idx_perm]
    else:
        dm = X if distance_matrix else pairwise_distances(X, metric=metric)
        dperm2all = dm

    n_points = dm.shape[0]
    if not sparse.issparse(dm) and np.any(np.abs(dm.diagonal()) > 0):
        # If any of the diagonal elements are nonzero, convert to sparse
        # format, because currently that's the only format that handles
        # nonzero births
        dm = sparse.coo_matrix(dm)

    if sparse.issparse(dm):
        coo = dm.tocoo()
        res = DRFDMSparse(
            coo.row.astype(dtype=np.int32, order="C"),
            coo.col.astype(dtype=np.int32, order="C"),
            np.array(coo.data, dtype=np.float32, order="C"),
            n_points,
            maxdim,
            thresh,
            coeff,
            int(do_cocycles),
        )
    else:
        # With the default 'xy' meshgrid indexing, I holds column indices and
        # J holds row indices, so `I > J` selects the entries strictly above
        # the diagonal, flattened in row-major order.
        I, J = np.meshgrid(np.arange(n_points), np.arange(n_points))
        DParam = np.array(dm[I > J], dtype=np.float32)
        res = DRFDM(DParam, maxdim, thresh, coeff, int(do_cocycles))

    # The backend returns each diagram as a flat sequence; fold every one
    # into an (n_pairs, 2) array of [birth, death] rows.
    dgms = res["births_and_deaths_by_dim"]
    for dim, flat in enumerate(dgms):
        dgms[dim] = np.reshape(np.array(flat), [len(flat) // 2, 2])

    # Each cocycle is a flat sequence of records of `dim + 2` integers;
    # the last entry of each record is reduced modulo `coeff`.
    cocycles = []
    for dim, raw_cocycles in enumerate(res["cocycles_by_dim"]):
        width = dim + 2
        unpacked = []
        for flat in raw_cocycles:
            ccl = np.reshape(np.array(flat, dtype=np.int64),
                             [len(flat) // width, width])
            ccl[:, -1] = np.mod(ccl[:, -1], coeff)
            unpacked.append(ccl)
        cocycles.append(unpacked)

    return {
        "dgms": dgms,
        "cocycles": cocycles,
        "num_edges": res["num_edges"],
        "dperm2all": dperm2all,
        "idx_perm": idx_perm,
        "r_cover": r_cover,
    }