Esempio n. 1
0
    def apply(self, A=None, b=None):
        """Impose this boundary condition on a system matrix and load vector.

        The rows of ``A`` that belong to the selected boundary dofs are
        overwritten in place through the LIL ``rows``/``data`` arrays, and the
        matching entries of ``b`` are overwritten with values taken from the
        result of ``L2Projection.project`` for ``self.ud``.

        Parameters
        ----------
        A : scipy.sparse.lil_matrix
            System matrix; modified in place.
        b : indexable
            Load vector; modified in place.
            NOTE(review): presumably a 2-D column vector, since the right-hand
            side is indexed with ``[dir_dof_ind, None]`` -- confirm.

        Returns
        -------
        tuple
            The same ``A`` and ``b`` objects that were passed in.

        Raises
        ------
        TypeError
            If ``A`` is not a LIL matrix (and exposes a ``format`` attribute
            for the error message).

        Notes
        -----
        NOTE(review): both parameters default to ``None`` but are used
        unconditionally; calling with ``A=None`` fails with an
        ``AttributeError`` while building the error message (``A.format``).
        """
        # mask of the boundary entities selected by self.domain
        location_mask = self.fe_space.mesh.boundary(fnc=self.domain)

        # dofs associated with those entities; entity dimension is one less
        # than the mesh dimension
        dim = self.fe_space.mesh.dimension - 1
        dir_dof_mask = self.fe_space.extract_dofs(d=dim, mask=location_mask)
        # positions of the non-zero mask entries along the first axis
        dir_dof_ind = dir_dof_mask.nonzero()[0]

        # replace every row of a dirichlet dof
        # with a row that has the single value 1
        # in the corresponding column
        if not sparse.isspmatrix_lil(A):
            raise TypeError('Wrong sparse matrix format ({})'.format(A.format))
        
        # one single-element column list [i] per selected dof i
        dir_dof_rows = dir_dof_ind[:,None].tolist()
        # one single-element value list [1.] per selected dof
        dir_dof_data = [[1.]] * dir_dof_ind.size
        
        # write straight into the LIL storage arrays
        # NOTE(review): relies on numpy's handling of list-of-lists assignment
        # into an object array -- verify against the numpy version in use
        A.rows[dir_dof_ind] = dir_dof_rows
        A.data[dir_dof_ind] = dir_dof_data

        # set the corresponding load vector entries as the
        # l2 projection of the boundary function
        l2_proj = L2Projection.project(fnc=self.ud,
                                       fe_space=self.fe_space,
                                       codim=1,
                                       mask=None)

        # the added axis turns the selected projection values into a column
        b[dir_dof_ind] = l2_proj[dir_dof_ind, None]

        return A, b
Esempio n. 2
0
def threshold_to_zero(mx, threshold):
    """Drop the entries of a sparse matrix whose magnitude is below a threshold.

    An entry survives only when ``abs(value) >= threshold``; every other
    stored entry (NaN included, because the comparison is false for it) is
    removed from the result.

    The work is done on a COO copy with one vectorised mask, so the cost is
    linear in the number of stored entries and the caller's matrix is never
    modified, whatever its format.

    Parameters
    ----------
    mx : scipy.sparse matrix
        Sparse matrix in any format.
    threshold : float
        Threshold parameter.

    Returns
    -------
    scipy.sparse.coo_matrix
        New matrix holding only the entries that passed the threshold.
    """
    # copy=True keeps the input untouched even when it is already COO
    result = mx.tocoo(copy=True)
    # merge repeated (row, col) entries so that each stored value is the
    # actual matrix element before it is compared with the threshold
    result.sum_duplicates()

    keep = np.abs(result.data) >= threshold
    result.data[~keep] = 0
    result.eliminate_zeros()
    return result
Esempio n. 3
0
def setdiag_range(mat,arr,ind=(),k=0):
  """
  Similar to instance method setdiag but with the option to specify a specific range along
  the diagonal. Currently only works with matrices of type lil_matrix.

  The matrix is modified in place; nothing is returned.

  Parameters
  ----------
  mat : scipy.sparse.lil_matrix
    Matrix whose k-th diagonal is written.
  arr : list or tuple
    Values to write; writing stops as soon as either the range or arr is exhausted.
  ind : list or tuple of two ints
    (start, end) positions along the diagonal:
    * start == 0: delegates to mat.setdiag(arr[0:end], k);
    * 0 < start <= end: fills positions start..end (inclusive, clipped to the diagonal);
    * start < end < 0: positions are counted from the end of the diagonal.
    Any other combination leaves the matrix untouched.
  k : int
    Diagonal offset: 0 main, > 0 above, < 0 below the main diagonal.

  Raises
  ------
  ValueError
    If mat is not a lil_matrix, or arr / ind is not a list or tuple.
  """
  if not sp.isspmatrix_lil(mat):
    raise ValueError('argument mat must be of type scipy.sparse.lil_matrix')
  if not isinstance(arr,(list,tuple)):
    raise ValueError('argument arr must be of type list or tuple')
  if not isinstance(ind,(list,tuple)):
    raise ValueError('argument ind must be of type list or tuple')

  if ind[0] == 0:
    mat.setdiag(arr[0:ind[1]],k)
  diag_size = min(mat.shape) - abs(k)
  do_fill = False
  if ind[1] < 0 and ind[0] < ind[1] and diag_size + ind[1] > 0:
    # negative positions count back from the end of the diagonal
    start_ind = max(diag_size + ind[0],0)
    end_ind = diag_size + ind[1]
    do_fill = True
  elif ind[0] > 0 and ind[1] >= ind[0] and ind[0] < diag_size:
    start_ind = ind[0]
    end_ind = min(ind[1],diag_size - 1)
    do_fill = True
  if do_fill:
    # position i of the k-th diagonal is (i, i + k) for k >= 0 and
    # (i - k, i) for k < 0; the lower diagonals used to be skipped silently
    row_shift = max(-k,0)
    col_shift = max(k,0)
    # range (not the Python-2-only xrange) and zip stop at the shorter input
    for i, value in zip(range(start_ind,end_ind + 1),arr):
      mat[i + row_shift,i + col_shift] = value
Esempio n. 4
0
    def _slice(data, obs_selector=None, vars_selector=None):
        """
        Slice date using any selector that the AnnData object
        supprots for slicing.  If selector is None, will not slice
        on that axis.

        This method exists to optimize filtering/slicing sparse data that has
        access patterns which impact slicing performance.

        https://docs.scipy.org/doc/scipy/reference/sparse.html
        """
        prefer_row_access = (sparse.isspmatrix_csr(data._X)
                             or sparse.isspmatrix_lil(data._X)
                             or sparse.isspmatrix_bsr(data._X))
        if prefer_row_access:
            # Row-major slicing
            if obs_selector is not None:
                data = data[obs_selector, :]
            if vars_selector is not None:
                data = data[:, vars_selector]
        else:
            # Col-major slicing
            if vars_selector is not None:
                data = data[:, vars_selector]
            if obs_selector is not None:
                data = data[obs_selector, :]

        return data
Esempio n. 5
0
def sparse_remove_row(X, to_remove):
    """Return a copy of a sparse matrix without the given rows.

    Parameters
    ----------
    X : scipy.sparse matrix
        Input matrix; converted to LIL when it is in another format.
    to_remove : iterable of int
        Row indices to drop. Indices outside ``0 .. X.shape[0] - 1`` are
        ignored.

    Returns
    -------
    Y : scipy.sparse matrix
        The remaining rows stacked in their original order. When every row
        is removed, an empty matrix of shape ``(0, X.shape[1])`` is returned.
    """
    if not sps.isspmatrix_lil(X):
        X = X.tolil()

    # Hash the indices once so each membership test is O(1); fall back to
    # the original container when its items cannot be hashed.
    try:
        dropped = set(to_remove)
    except TypeError:
        dropped = to_remove

    # range instead of the Python-2-only xrange
    kept_rows = [X.getrowview(i) for i in range(X.shape[0]) if i not in dropped]
    if not kept_rows:
        # vstack rejects an empty block list
        return sps.coo_matrix((0, X.shape[1]), dtype=X.dtype)
    return sps.vstack(kept_rows)
Esempio n. 6
0
def _set_weight_class(adata: AnnData, key: str) -> W:
    """Build a ``libpysal.weights.W`` object from the matrix ``adata.obsp[key]``.

    The matrix is read in LIL form (converted when necessary): for row ``i``
    the LIL column list becomes the neighbour list and the LIL value list
    becomes the weight list. ``adata.obs.index.values`` is passed as ``ids``.
    """
    graph = adata.obsp[key]
    graph = graph if isspmatrix_lil(graph) else graph.tolil()

    neighbors = {row: columns for row, columns in enumerate(graph.rows)}
    weights = {row: values for row, values in enumerate(graph.data)}
    ids = adata.obs.index.values

    return libpysal.weights.W(neighbors, weights, ids=ids)
Esempio n. 7
0
def abs_sparse(X):
    """Element-wise absolute value of a sparse matrix.

    Parameters
    ----------
    X : scipy.sparse matrix
        CSR, CSC or LIL matrix; it is not modified.

    Returns
    -------
    scipy.sparse matrix
        Copy of ``X`` in the same format with every stored value replaced by
        its absolute value.

    Raises
    ------
    ValueError
        If ``X`` is not a CSR, CSC or LIL matrix.
    """
    X_abs = X.copy()
    if sparse.isspmatrix_csr(X) or sparse.isspmatrix_csc(X):
        X_abs.data = np.abs(X_abs.data)
    elif sparse.isspmatrix_lil(X):
        # LIL keeps one Python list per row inside an object array. Wrapping
        # the rows in np.array(...) would build a 2-D numeric array (rows of
        # equal length) or fail (ragged rows), so the container is rebuilt
        # explicitly.
        row_values = np.empty(len(X_abs.data), dtype=object)
        for i, row in enumerate(X_abs.data):
            row_values[i] = [abs(value) for value in row]
        X_abs.data = row_values
    else:
        raise ValueError("Only supports CSR/CSC and LIL matrices")
    return X_abs
def abs_sparse(X):
  """Element-wise absolute value of a sparse matrix.

  Parameters
  ----------
  X : scipy.sparse matrix
    CSR, CSC or LIL matrix; it is not modified.

  Returns
  -------
  scipy.sparse matrix
    Copy of ``X`` in the same format with every stored value replaced by
    its absolute value.

  Raises
  ------
  ValueError
    If ``X`` is not a CSR, CSC or LIL matrix.
  """
  X_abs = X.copy()
  if sparse.isspmatrix_csr(X) or sparse.isspmatrix_csc(X):
    X_abs.data = np.abs(X_abs.data)
  elif sparse.isspmatrix_lil(X):
    # LIL keeps one Python list per row inside an object array. Wrapping the
    # rows in np.array(...) would build a 2-D numeric array (rows of equal
    # length) or fail (ragged rows), so the container is rebuilt explicitly.
    row_values = np.empty(len(X_abs.data), dtype=object)
    for i, row in enumerate(X_abs.data):
      row_values[i] = [abs(value) for value in row]
    X_abs.data = row_values
  else:
    raise ValueError("Only supports CSR/CSC and LIL matrices")
  return X_abs
Esempio n. 9
0
def matrix_conflicts(L):
    """
    Given an N x M matrix where L_{i,j} is the label given by the jth LF to the ith candidate:
    Return the **fraction of candidates that each LF _conflicts with other LFs on_.**

    Works on a copy of L: every row whose stored / non-zero labels take exactly
    one distinct value is zeroed, and the remaining entries are handed to
    ``matrix_coverage(sparse_nonzero(...))``.

    Raises ValueError for sparse input that is not CSR, CSC or LIL.
    """
    B = L.copy()
    n_rows = B.shape[0]

    if sparse.issparse(B):
        supported = (sparse.isspmatrix_csr(B)
                     or sparse.isspmatrix_csc(B)
                     or sparse.isspmatrix_lil(B))
        if not supported:
            raise ValueError("Only supports CSR/CSC and LIL matrices")
        if not sparse.isspmatrix_csr(B):
            # CSC and LIL are processed through their CSR form
            B = B.tocsr()
        for row in range(n_rows):
            if np.unique(B.getrow(row).data).size == 1:
                start, stop = B.indptr[row], B.indptr[row + 1]
                B.data[start:stop] = 0
    else:
        for row in range(n_rows):
            labels = B[row]
            if np.unique(np.array(labels[np.nonzero(labels)])).size == 1:
                B[row] = 0

    return matrix_coverage(sparse_nonzero(B))
Esempio n. 10
0
def sparse_matrix_report(m):
    """Print a summary of a sparse matrix to standard output.

    The header (repr, number of non-zeros, sparsity) is printed for every
    input. It is followed by a format-specific section for CSR/CSC, BSR, COO,
    DOK, DIA and LIL matrices listing the storage arrays, their combined size
    (formatted by ``size``) and, for some formats, a preview of the content.
    """

    def describe(template, array):
        # one "<label> : <length> (<dtype>)" line for a storage array
        print(template.format(len(array), array.dtype))

    print(repr(m))
    print('Number of non-zeros  :', m.nnz)
    print('Sparsity             :', 1 - m.nnz / (m.shape[0] * m.shape[1]))

    if isspmatrix_csr(m) or isspmatrix_csc(m) or isspmatrix_bsr(m):
        # the three compressed layouts share data / indptr / indices
        blocked = isspmatrix_bsr(m)
        describe('data length          : {} ({})', m.data)
        describe('indptr length        : {} ({})', m.indptr)
        describe('indices length       : {} ({})', m.indices)
        if blocked:
            print('blocksize length     : {}'.format(m.blocksize))
        print('Size                 :',
              size(m.data.nbytes + m.indptr.nbytes + m.indices.nbytes))
        if blocked:
            print('preview:')
            print(m)
        else:
            print('10 x 10 preview:')
            print(m[:10, :10].toarray())
        return

    if isspmatrix_coo(m):
        describe('data length          : {} ({})', m.data)
        describe('row length           : {} ({})', m.row)
        describe('col length           : {} ({})', m.col)
        print('Size                 :',
              size(m.data.nbytes + m.row.nbytes + m.col.nbytes))
        print('preview:')
        print(m)
        return

    if isspmatrix_dok(m):
        print('Size                 :', size(sys.getsizeof(m)))
        print('10 x 10 preview:')
        print(m[:10, :10].toarray())
        return

    if isspmatrix_dia(m):
        describe('data length          : {} ({})', m.data)
        describe('Offsets              : {} ({})', m.offsets)
        print('Size                 :', size(m.data.nbytes + m.offsets.nbytes))
        print('(no preview)')
        return

    if isspmatrix_lil(m):
        describe('data length          : {} ({})', m.data)
        describe('rows                 : {} ({})', m.rows)
        print('Size                 :', size(m.data.nbytes + m.rows.nbytes))
        print('(no preview)')
Esempio n. 11
0
def _fix_connectivity(X, connectivity, affinity):
    """
    Fixes the connectivity matrix

        - copies it
        - makes it symmetric
        - converts it to LIL if necessary
        - completes it if necessary
    """
    n_samples = X.shape[0]
    if connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples:
        raise ValueError(
            "Wrong shape for connectivity matrix: %s when X is %s"
            % (connectivity.shape, X.shape)
        )

    # Make the connectivity matrix symmetric:
    connectivity = connectivity + connectivity.T

    # Convert connectivity matrix to LIL
    if not sparse.isspmatrix_lil(connectivity):
        if not sparse.isspmatrix(connectivity):
            connectivity = sparse.lil_matrix(connectivity)
        else:
            connectivity = connectivity.tolil()

    # Compute the number of nodes
    n_connected_components, labels = connected_components(connectivity)

    if n_connected_components > 1:
        warnings.warn(
            "the number of connected components of the "
            "connectivity matrix is %d > 1. Completing it to avoid "
            "stopping the tree early." % n_connected_components,
            stacklevel=2,
        )
        # XXX: Can we do without completing the matrix?
        for i in range(n_connected_components):
            idx_i = np.where(labels == i)[0]
            Xi = X[idx_i]
            for j in range(i):
                idx_j = np.where(labels == j)[0]
                Xj = X[idx_j]
                if affinity == "precomputed":
                    D = X[np.ix_(idx_i, idx_j)]
                else:
                    D = pairwise_distances(Xi, Xj, metric=affinity)
                ii, jj = np.where(D == np.min(D))
                ii = ii[0]
                jj = jj[0]
                connectivity[idx_i[ii], idx_j[jj]] = True
                connectivity[idx_j[jj], idx_i[ii]] = True

    return connectivity, n_connected_components
Esempio n. 12
0
def sparse_nonzero(X):
    """Return a copy of X with value 1 wherever the i,jth entry is != 0.

    Parameters
    ----------
    X : numpy.ndarray or scipy.sparse matrix
        Dense array, or CSR / CSC / LIL sparse matrix; it is not modified.

    Returns
    -------
    Same kind of object as ``X``
        Indicator of the non-zero pattern, in the dtype of ``X``.

    Raises
    ------
    ValueError
        If ``X`` is sparse but not CSR, CSC or LIL.
    """
    X_nonzero = X.copy()
    if not sparse.issparse(X):
        X_nonzero[X_nonzero != 0] = 1
        return X_nonzero
    if sparse.isspmatrix_csr(X) or sparse.isspmatrix_csc(X):
        X_nonzero.data[X_nonzero.data != 0] = 1
    elif sparse.isspmatrix_lil(X):
        # LIL stores one Python list per row inside an object ndarray; keep
        # that layout (a plain list of float arrays breaks later conversions)
        # and keep the dtype of the matrix for the written ones.
        one = X_nonzero.dtype.type(1)
        indicator = np.empty(len(X_nonzero.data), dtype=object)
        for i, row in enumerate(X_nonzero.data):
            indicator[i] = [one] * len(row)
        X_nonzero.data = indicator
    else:
        raise ValueError("Only supports CSR/CSC and LIL matrices")
    return X_nonzero
Esempio n. 13
0
def sparse_nonzero(X):
    """Return a copy of X with value 1 wherever the i,jth entry is != 0.

    Parameters
    ----------
    X : numpy.ndarray or scipy.sparse matrix
        Dense array, or CSR / CSC / LIL sparse matrix; it is not modified.

    Returns
    -------
    Same kind of object as ``X``
        Indicator of the non-zero pattern, in the dtype of ``X``.

    Raises
    ------
    ValueError
        If ``X`` is sparse but not CSR, CSC or LIL.
    """
    X_nonzero = X.copy()
    if not sparse.issparse(X):
        X_nonzero[X_nonzero != 0] = 1
        return X_nonzero
    if sparse.isspmatrix_csr(X) or sparse.isspmatrix_csc(X):
        X_nonzero.data[X_nonzero.data != 0] = 1
    elif sparse.isspmatrix_lil(X):
        # LIL stores one Python list per row inside an object ndarray; keep
        # that layout (a plain list of float arrays breaks later conversions)
        # and keep the dtype of the matrix for the written ones.
        one = X_nonzero.dtype.type(1)
        indicator = np.empty(len(X_nonzero.data), dtype=object)
        for i, row in enumerate(X_nonzero.data):
            indicator[i] = [one] * len(row)
        X_nonzero.data = indicator
    else:
        raise ValueError("Only supports CSR/CSC and LIL matrices")
    return X_nonzero
Esempio n. 14
0
    def fit(self, k=100, max_iter=15):
        """Fit an ``LDA_CVB0`` model on the preference matrix.

        ``k`` is capped at the number of columns of the preference matrix,
        and the matrix is replaced by a LIL copy when it is stored in another
        format. After ``max_iter`` learning iterations the model's document
        and word distributions are stored as ``self.user_matrix`` and
        ``self.item_matrix``. ``self.perp`` is reset to None.
        """
        store = self.recommender_data
        k = min(k, store.preference_matrix.shape[1])

        if not spMat.isspmatrix_lil(store.preference_matrix):
            store.preference_matrix = spMat.lil_matrix(store.preference_matrix)
        self.perp = None

        model = LDA_CVB0(store.preference_matrix, K=k)
        model.lda_learning(max_iter)

        self.user_matrix, self.item_matrix = model.documentdist(), model.worddist()
Esempio n. 15
0
def _fix_connectivity(X, connectivity, n_components=None,
                      affinity="euclidean"):
    """
    Fixes the connectivity matrix

        - copies it
        - makes it symmetric
        - converts it to LIL if necessary
        - completes it if necessary
    """
    n_samples = X.shape[0]
    if (connectivity.shape[0] != n_samples or
        connectivity.shape[1] != n_samples):
        raise ValueError('Wrong shape for connectivity matrix: %s '
                         'when X is %s' % (connectivity.shape, X.shape))

    # Make the connectivity matrix symmetric:
    connectivity = connectivity + connectivity.T

    # Convert connectivity matrix to LIL
    if not sparse.isspmatrix_lil(connectivity):
        if not sparse.isspmatrix(connectivity):
            connectivity = sparse.lil_matrix(connectivity)
        else:
            connectivity = connectivity.tolil()

    # Compute the number of nodes
    n_components, labels = connected_components(connectivity)

    if n_components > 1:
        warnings.warn("the number of connected components of the "
                      "connectivity matrix is %d > 1. Completing it to avoid "
                      "stopping the tree early." % n_components,
                      stacklevel=2)
        # XXX: Can we do without completing the matrix?
        for i in xrange(n_components):
            idx_i = np.where(labels == i)[0]
            Xi = X[idx_i]
            for j in xrange(i):
                idx_j = np.where(labels == j)[0]
                Xj = X[idx_j]
                D = pairwise_distances(Xi, Xj, metric=affinity)
                ii, jj = np.where(D == np.min(D))
                ii = ii[0]
                jj = jj[0]
                connectivity[idx_i[ii], idx_j[jj]] = True
                connectivity[idx_j[jj], idx_i[ii]] = True
        n_components = 1

    return connectivity
Esempio n. 16
0
def should_enforce_sparse(m,
                          sparse_format: SparseFormat,
                          policy: SparsePolicy,
                          dtype,
                          sparse_values: bool = True) -> bool:
    """
    Returns whether a given matrix should be represented in the given sparse format, depending on the format of the
    matrix and a given `SparsePolicy`:

    - A dense matrix is never converted (False), unless the policy is `SparsePolicy.FORCE_SPARSE`, in which case a
      `ValueError` is raised.
    - A `scipy.sparse.csr_matrix` or `scipy.sparse.csc_matrix` that is already in the requested format is kept sparse
      (True), unless the policy is `SparsePolicy.FORCE_DENSE` (False).
    - For a `scipy.sparse.lil_matrix`, `scipy.sparse.coo_matrix` or `scipy.sparse.dok_matrix`, the policy decides:
      `SparsePolicy.AUTO` delegates to `is_sparse`, `SparsePolicy.FORCE_SPARSE` yields True and any other policy
      yields False.

    :param m:               A `np.ndarray` or `scipy.sparse.matrix` to be checked
    :param sparse_format:   The `SparseFormat` to be used
    :param policy:          The `SparsePolicy` to be used
    :param dtype:           The type of the values that should be stored in the matrix
    :param sparse_values:   True, if the values must explicitly be stored when using a sparse format, False otherwise
    :return:                True, if it is preferable to convert the matrix into a sparse matrix of the given format,
                            False otherwise
    :raises ValueError:     If none of the cases above applies, e.g., for a sparse matrix in an unsupported format
    """
    if not issparse(m):
        # Dense input: only an explicit request for a sparse result falls through to the error below
        if policy != SparsePolicy.FORCE_SPARSE:
            return False
    else:
        already_in_format = (isspmatrix_csr(m) and sparse_format == SparseFormat.CSR) or (
            isspmatrix_csc(m) and sparse_format == SparseFormat.CSC)

        if already_in_format:
            return policy != SparsePolicy.FORCE_DENSE

        if isspmatrix_lil(m) or isspmatrix_coo(m) or isspmatrix_dok(m):
            # Formats that might be converted into the requested sparse format
            if policy != SparsePolicy.AUTO:
                return policy == SparsePolicy.FORCE_SPARSE

            return is_sparse(m,
                             sparse_format=sparse_format,
                             dtype=dtype,
                             sparse_values=sparse_values)

    raise ValueError('Matrix of type ' + type(m).__name__ +
                     ' cannot be converted to format "' + str(sparse_format) +
                     '""')
Esempio n. 17
0
def threshold_to_zero(mx, threshold):
    """Set values in a sparse matrix lower than threshold to zero.

    An entry survives only when ``abs(value) >= threshold``; every other
    stored entry (NaN included, because the comparison is false for it) is
    removed from the result.

    The filtering is a single vectorised mask over a COO copy, so the
    caller's matrix is left untouched regardless of its format.

    Parameters
    ----------
    mx : scipy.sparse matrix
        Sparse matrix in any format.
    threshold : float
        Threshold parameter.

    Returns
    -------
    scipy.sparse.coo_matrix
        New matrix holding only the entries that passed the threshold.
    """
    # copy=True keeps the input untouched even when it is already COO
    filtered = mx.tocoo(copy=True)
    # merge repeated (row, col) entries so that each stored value is the
    # actual matrix element before it is compared with the threshold
    filtered.sum_duplicates()

    too_small = ~(np.abs(filtered.data) >= threshold)
    filtered.data[too_small] = 0
    filtered.eliminate_zeros()
    return filtered
Esempio n. 18
0
def setdiag_range(mat, arr, ind=(), k=0):
    """
    Similar to instance method setdiag but with the option to
    specify a specific range along the diagonal. Currently only
    works with matrices of type lil_matrix.

    The matrix is modified in place; nothing is returned.

    Parameters
    ----------
    mat : scipy.sparse.lil_matrix
        Matrix whose k-th diagonal is written.
    arr : list or tuple
        Values to write; writing stops as soon as either the range or
        ``arr`` is exhausted.
    ind : list or tuple of two ints
        ``(start, end)`` positions along the diagonal:

        * ``start == 0``: delegates to ``mat.setdiag(arr[0:end], k)``;
        * ``0 < start <= end``: fills positions ``start..end`` (inclusive,
          clipped to the diagonal);
        * ``start < end < 0``: positions are counted from the end of the
          diagonal.

        Any other combination leaves the matrix untouched.
    k : int
        Diagonal offset: 0 main, > 0 above, < 0 below the main diagonal.

    Raises
    ------
    ValueError
        If ``mat`` is not a lil_matrix, or ``arr`` / ``ind`` is not a list
        or tuple.
    """

    if not sp.isspmatrix_lil(mat):
        raise ValueError(
            'argument mat must be of type scipy.sparse.lil_matrix')
    if not isinstance(arr, (list, tuple)):
        raise ValueError('argument arr must be of type list or tuple')
    if not isinstance(ind, (list, tuple)):
        raise ValueError('argument ind must be of type list or tuple')

    if ind[0] == 0:
        mat.setdiag(arr[0:ind[1]], k)
    diag_size = min(mat.shape) - abs(k)
    do_fill = False

    if ind[1] < 0 and ind[0] < ind[1] and diag_size + ind[1] > 0:
        # negative positions count back from the end of the diagonal
        start_ind = max(diag_size + ind[0], 0)
        end_ind = diag_size + ind[1]
        do_fill = True
    elif ind[0] > 0 and ind[1] >= ind[0] and ind[0] < diag_size:
        start_ind = ind[0]
        end_ind = min(ind[1], diag_size - 1)
        do_fill = True

    if do_fill:
        # position i of the k-th diagonal is (i, i + k) for k >= 0 and
        # (i - k, i) for k < 0; the lower diagonals used to be skipped
        # silently
        row_shift = max(-k, 0)
        col_shift = max(k, 0)
        # zip stops at the shorter of the index range and arr
        for i, value in zip(range(start_ind, end_ind + 1), arr):
            mat[i + row_shift, i + col_shift] = value
def sparse_remove_row(X, to_remove):
    """ Delete rows from a sparse matrix

    Parameters
    ----------
    X : scipy.sparse matrix
        Input matrix; converted to LIL when it is in another format.
    to_remove : a list of row indices to be removed.
        Indices outside ``0 .. X.shape[0] - 1`` are ignored.

    Returns
    -------
    Y : scipy.sparse matrix
        The remaining rows stacked in their original order. When every row
        is removed, an empty matrix of shape ``(0, X.shape[1])`` is returned.
    """
    if not sps.isspmatrix_lil(X):
        X = X.tolil()

    # Hash the indices once so each membership test is O(1); fall back to
    # the original container when its items cannot be hashed.
    try:
        dropped = set(to_remove)
    except TypeError:
        dropped = to_remove

    # range instead of the Python-2-only xrange
    kept_rows = [X.getrowview(i) for i in range(X.shape[0]) if i not in dropped]
    if not kept_rows:
        # vstack rejects an empty block list
        return sps.coo_matrix((0, X.shape[1]), dtype=X.dtype)
    return sps.vstack(kept_rows)
Esempio n. 20
0
def sparse_remove_row(X, to_remove):
    """ Delete rows from a sparse matrix

    Parameters
    ----------
    X : scipy.sparse matrix
        Input matrix; converted to LIL when it is in another format.
    to_remove : a list of row indices to be removed.
        Indices outside ``0 .. X.shape[0] - 1`` are ignored.

    Returns
    -------
    Y : scipy.sparse matrix
        The remaining rows stacked in their original order. When every row
        is removed, an empty matrix of shape ``(0, X.shape[1])`` is returned.
    """
    if not sps.isspmatrix_lil(X):
        X = X.tolil()

    # Hash the indices once so each membership test is O(1) instead of a
    # scan of the whole list; fall back to the original container when its
    # items cannot be hashed.
    try:
        dropped = set(to_remove)
    except TypeError:
        dropped = to_remove

    kept_rows = [X.getrowview(i) for i in range(X.shape[0]) if i not in dropped]
    if not kept_rows:
        # vstack rejects an empty block list
        return sps.coo_matrix((0, X.shape[1]), dtype=X.dtype)
    return sps.vstack(kept_rows)
Esempio n. 21
0
def _ispmatrix_all(matrix):
    """ Iterator for iterating rows and columns for non-zero elements in a `scipy.sparse.*_matrix` (or `SparseCSR`)

    Parameters
    ----------
    matrix : scipy.sparse.sp_matrix
      the sparse matrix to iterate non-zero elements

    Yields
    ------
    int, int
       the row, column indices of the non-zero elements
    """
    if isspmatrix_csr(matrix):
        for r in range(matrix.shape[0]):
            for ind in range(matrix.indptr[r], matrix.indptr[r + 1]):
                yield r, matrix.indices[ind]

    elif isspmatrix_lil(matrix):
        for r in range(matrix.shape[0]):
            for c in matrix.rows[r]:
                yield r, c

    elif isspmatrix_coo(matrix):
        for r, c in zip(matrix.row, matrix.col):
            yield r, c

    elif isspmatrix_csc(matrix):
        for c in range(matrix.shape[1]):
            for ind in range(matrix.indptr[c], matrix.indptr[c + 1]):
                yield matrix.indices[ind], c

    elif isinstance(matrix, SparseCSR):
        for r in range(matrix.shape[0]):
            n = matrix.ncol[r]
            ptr = matrix.ptr[r]
            for c in matrix.col[ptr:ptr + n]:
                yield r, c

    else:
        raise NotImplementedError(
            "The iterator for this sparse matrix has not been implemented")
Esempio n. 22
0
def spiter(matrix):
    """
    Iterate the stored elements of a ``scipy.sparse.*_matrix``

    Every item is a triple:
    >>> (row, column, matrix-element)

    Supported formats are `coo`, `csc`, `csr` and `lil`; any other input
    raises ``NotImplementedError``.

    Parameters
    ----------
    matrix : ``scipy.sparse.sp_matrix``
      the sparse matrix to iterate non-zero elements

    References
    ----------
    By stackoverflow user zeroth on https://stackoverflow.com/a/42625707
    """
    if isspmatrix_coo(matrix):
        for triple in zip(matrix.row, matrix.col, matrix.data):
            yield triple
        return

    if isspmatrix_csc(matrix):
        for c in range(matrix.shape[1]):
            start, stop = matrix.indptr[c], matrix.indptr[c + 1]
            for r, value in zip(matrix.indices[start:stop],
                                matrix.data[start:stop]):
                yield r, c, value
        return

    if isspmatrix_csr(matrix):
        for r in range(matrix.shape[0]):
            start, stop = matrix.indptr[r], matrix.indptr[r + 1]
            for c, value in zip(matrix.indices[start:stop],
                                matrix.data[start:stop]):
                yield r, c, value
        return

    if isspmatrix_lil(matrix):
        for r in range(matrix.shape[0]):
            for c, value in zip(matrix.rows[r], matrix.data[r]):
                yield r, c, value
        return

    raise NotImplementedError("The iterator for this sparse matrix has not been implemented")
Esempio n. 23
0
def ward_tree(X, connectivity=None, n_components=None, copy=True):
    """Ward clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        feature matrix representing n_samples samples to be clustered.
        A 1-D array is treated as n_samples samples with a single feature.

    connectivity : sparse matrix.
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e, the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.

    copy : bool (optional)
        Make a copy of connectivity or work inplace. If connectivity
        is not of LIL type there will be a copy in any case.

    Returns
    -------
    children : array of pairs, one row per merge
               the two nodes merged at each step.

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree
    """
    X = np.asarray(X)
    # Promote a 1-D sample vector to a single-feature column *before* the
    # shape is unpacked; unpacking first fails for 1-D input.
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    if connectivity is None:
        out = hierarchy.ward(X)
        # builtin int/float replace the np.int/np.float aliases, which were
        # plain aliases of the builtins and no longer exist in recent NumPy
        children_ = out[:, :2].astype(int)
        return children_, 1, n_samples

    # Compute the number of nodes
    # NOTE(review): `labels` is only defined when n_components is None; a
    # caller-supplied n_components > 1 reaches the completion branch below
    # without it -- confirm the intended behaviour.
    if n_components is None:
        n_components, labels = cs_graph_components(connectivity)

    # Convert connectivity matrix to LIL with a copy if needed
    if sparse.isspmatrix_lil(connectivity) and copy:
        connectivity = connectivity.copy()
    else:
        connectivity = connectivity.tolil()

    if n_components > 1:
        warnings.warn("the number of connected components of the"
        " connectivity matrix is %d > 1. Completing it to avoid"
        " stopping the tree early."
        % n_components)
        connectivity = _fix_connectivity(X, connectivity,
                                            n_components, labels)
        n_components = 1

    n_nodes = 2 * n_samples - n_components

    if (connectivity.shape[0] != n_samples or
        connectivity.shape[1] != n_samples):
        raise ValueError('Wrong shape for connectivity matrix: %s '
                         'when X is %s' % (connectivity.shape, X.shape))

    # Remove diagonal from connectivity matrix
    connectivity.setdiag(np.zeros(connectivity.shape[0]))

    # create inertia matrix
    coord_row = []
    coord_col = []
    A = []
    for ind, row in enumerate(connectivity.rows):
        A.append(row)
        # We keep only the upper triangular for the moments
        # Generator expressions are faster than arrays on the following
        row = [i for i in row if i < ind]
        coord_row.extend(len(row) * [ind, ])
        coord_col.extend(row)

    coord_row = np.array(coord_row, dtype=int)
    coord_col = np.array(coord_col, dtype=int)

    # build moments as a list
    moments_1 = np.zeros(n_nodes)
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features))
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=float)
    _hierarchical.compute_ward_dist(moments_1, moments_2,
                             coord_row, coord_col, inertia)
    # heapify needs a real list; zip is a one-shot iterator on Python 3
    inertia = list(zip(inertia, coord_row, coord_col))
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=int)
    heights = np.zeros(n_nodes)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []

    visited = np.empty(n_nodes, dtype=bool)

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge: skip heap entries that refer to nodes which
        # have already been merged
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j], heights[k] = k, k, inert
        children.append([i, j])
        used_node[i] = used_node[j] = False

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        coord_col = []
        visited[:] = False
        visited[k] = True
        for l in set(A[i]).union(A[j]):
            l = _hierarchical._get_parent(l, parent)
            if not visited[l]:
                visited[l] = True
                coord_col.append(l)
                A[l].append(k)
        A.append(coord_col)
        coord_col = np.array(coord_col, dtype=int)
        coord_row = np.empty_like(coord_col)
        coord_row.fill(k)
        ini = np.empty(len(coord_row), dtype=float)

        _hierarchical.compute_ward_dist(moments_1, moments_2,
                                   coord_row, coord_col, ini)
        # builtin zip replaces the Python-2-only itertools.izip
        for tupl in zip(ini, coord_row, coord_col):
            heappush(inertia, tupl)

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    children = np.array(children)  # return numpy array for efficient caching

    return children, n_components, n_leaves
Esempio n. 24
0
def plot_activations_density(z_hat,
                             n_times_atom,
                             sfreq=1.,
                             threshold=0.01,
                             bandwidth='auto',
                             axes=None,
                             t_min=0,
                             plot_activations=False,
                             colors=None):
    """Plot a smoothed density of the activations, one axis per summed row.

    Parameters
    ----------
    z_hat : array, shape (n_atoms, n_trials, n_times_valid)
        The sparse activation matrix. A sequence whose first item is a LIL
        matrix is converted to a dense array first.
        NOTE(review): the code sums over axis 0, which does not match the
        documented layout if trials are meant to be summed -- confirm the
        actual axis order used by callers.
    n_times_atom : int
        The support of the atom; used as window length when bandwidth is
        'auto'.
    sfreq : float
        Sampling frequency
    threshold : float
        Remove activations (normalized with the max) below this threshold
    bandwidth : float, array of float, or 'auto'
        Length handed to ``np.blackman`` to build the smoothing window
    axes : array of axes, or None
        Axes to plot into; a new 'density' figure is created when None
    t_min : float
        Time offset for the xlabel display
    plot_activations : boolean
        If True, the significant activations are plotted as black dots
    colors : list of matplotlib compatible colors
        Colors of the plots; cycles through COLORS when None

    Returns
    -------
    axes : array of axes
        The axes that were drawn into, as a 1-D (or larger) array.
    """
    if sparse.isspmatrix_lil(z_hat[0]):
        z_hat = np.array([layer.toarray() for layer in z_hat])

    n_atoms, n_trials, n_times_valid = z_hat.shape

    # collapse the first axis
    summed = z_hat.sum(axis=0)

    if bandwidth == 'auto':
        bandwidth = n_times_atom

    if axes is None:
        _, axes = plt.subplots(n_atoms,
                               num='density',
                               figsize=(8, 2 + n_atoms * 3))
    axes = np.atleast_1d(axes)

    palette = itertools.cycle(COLORS) if colors is None else colors

    for ax, activations, color in zip(axes.ravel(), summed, palette):
        ax.clear()
        times = np.arange(n_times_valid) / float(sfreq) + t_min
        significant = activations > threshold * summed.max()

        if significant.sum() == 0:
            # nothing above the threshold: draw a flat line and move on
            ax.plot(times, np.zeros_like(times))
            continue

        # plot the activations as black dots
        if plot_activations:
            peak = activations[significant].max()
            ax.plot(times[significant],
                    activations[significant] / peak,
                    '.',
                    color='k')

        kernel = np.blackman(bandwidth)
        density = np.convolve(activations, kernel, 'same')
        ax.fill_between(times, density, color=color, alpha=0.5)

    return axes
Esempio n. 25
0
def transform(dayuserwords, userregionmap, ndays):
    """Regroup a (day, user) x word matrix by region.

    Parameters
    ----------
    dayuserwords : scipy sparse matrix, shape (ndays * U, W)
        One row per (day, user) pair in day-major order: row ``n * U + u``
        holds day ``n`` of user ``u``.  Converted to LIL when necessary.
    userregionmap : dict
        Maps a user index to a 1-based region id.  The region ids must be
        contiguous (``1..R``), otherwise a ``KeyError`` is raised while
        iterating over the regions.
    ndays : int
        Number of days stacked in ``dayuserwords``.

    Returns
    -------
    regiondayuserword : csr_matrix, shape (R * ndays * U, W)
        Rows ordered by region, then day, then user.  Rows of users that do
        not belong to the region of the enclosing block are left empty.
    regiondayworduser : csr_matrix, shape (R * ndays * W, U)
        The transpose of every (region, day) block of ``regiondayuserword``,
        stacked vertically in the same region/day order.
    """
    regionusermap = {}
    for user, region in userregionmap.items():
        regionusermap.setdefault(region - 1, []).append(user)

    N = ndays
    # Floor division: the user count must stay an integer on Python 3 too.
    U = dayuserwords.shape[0] // N
    W = dayuserwords.shape[1]
    R = len(regionusermap)

    logger.debug("Preparing Output Matrices")
    if not ssp.isspmatrix_lil(dayuserwords):
        logger.debug("The data array must be lil, transforming...")
        dayuserwords = dayuserwords.tolil()

    logger.debug("Filling (R x D x U, W) matrix")
    # Allocate the target with its final shape and fill it row by row instead
    # of overwriting the private ``_shape`` / ``rows`` / ``data`` attributes
    # with plain Python lists.
    regiondayuserword = ssp.lil_matrix((R * N * U, W),
                                       dtype=dayuserwords.dtype)
    src_rows = dayuserwords.rows
    src_data = dayuserwords.data
    for r in range(R):
        logger.debug("Starting region: %d", r)
        rusers = set(regionusermap[r])
        for n in range(N):
            logger.debug("Starting day: %d", n)
            base = (r * N + n) * U
            for u in range(U):
                if u not in rusers:
                    continue
                src = n * U + u
                # Copy the lists so the output never aliases the input.
                regiondayuserword.rows[base + u] = list(src_rows[src])
                regiondayuserword.data[base + u] = list(src_data[src])
    logger.debug("... cleaning up Filling (R x D x U, W) matrix")
    regiondayuserword = regiondayuserword.tocsr()

    logger.debug("Filling (R x D x W, U) matrix")
    # Collect every transposed block first and stack once; stacking inside
    # the loop re-copies the accumulated matrix on every iteration.
    blocks = [regiondayuserword[x * U:(x + 1) * U, :].T
              for x in range(R * N)]
    logger.debug("... cleaning up Filling (R x D x W, U) matrix")
    if blocks:
        regiondayworduser = ssp.vstack(blocks, format="csr")
    else:
        regiondayworduser = ssp.csr_matrix((0, U),
                                           dtype=regiondayuserword.dtype)
    return regiondayuserword, regiondayworduser
Esempio n. 26
0
def ispmatrixd(matrix, map_row=None, map_col=None):
    """ Generator over the stored elements of a `scipy.sparse.*_matrix` (or `SparseCSR`)

    Parameters
    ----------
    matrix : scipy.sparse.sp_matrix
      the sparse matrix whose stored elements are visited
    map_row : func, optional
      function applied to the row indices before they are yielded, `None`
      (the default) leaves them untouched.
    map_col : func, optional
      function applied to the column indices before they are yielded, `None`
      (the default) leaves them untouched.

    Yields
    ------
    int, int, <>
       the (mapped) row, the (mapped) column and the stored value

    Raises
    ------
    NotImplementedError
       if the matrix type is not one of CSR, LIL, COO, CSC or `SparseCSR`

    Notes
    -----
    The mapping functions do not always receive a scalar: for COO matrices
    they are called once with the full index arrays, and for LIL and
    `SparseCSR` matrices `map_col` is called once per row with all the
    column indices of that row.
    """
    def _passthrough(index):
        return index

    row_of = _passthrough if map_row is None else map_row
    col_of = _passthrough if map_col is None else map_col

    # Consider using the numpy nditer function for buffered iterations
    #it = np.nditer([geom.o2a(tmp.row), geom.o2a(tmp.col % geom.no), tmp.data],
    #               flags=['buffered'], op_flags=['readonly'])

    if isspmatrix_csr(matrix):
        # row-major walk, the row index is mapped once per row
        for i in range(matrix.shape[0]):
            mapped_row = row_of(i)
            start, stop = matrix.indptr[i], matrix.indptr[i + 1]
            for col, value in zip(matrix.indices[start:stop],
                                  matrix.data[start:stop]):
                yield mapped_row, col_of(col), value
        return

    if isspmatrix_lil(matrix):
        for i in range(matrix.shape[0]):
            mapped_row = row_of(i)
            mapped_cols = col_of(matrix.rows[i])
            for col, value in zip(mapped_cols, matrix.data[i]):
                yield mapped_row, col, value
        return

    if isspmatrix_coo(matrix):
        triplets = zip(row_of(matrix.row), col_of(matrix.col), matrix.data)
        for mapped_row, col, value in triplets:
            yield mapped_row, col, value
        return

    if isspmatrix_csc(matrix):
        # column-major walk, the column index is mapped once per column
        for j in range(matrix.shape[1]):
            mapped_col = col_of(j)
            start, stop = matrix.indptr[j], matrix.indptr[j + 1]
            for row, value in zip(matrix.indices[start:stop],
                                  matrix.data[start:stop]):
                yield row_of(row), mapped_col, value
        return

    if isinstance(matrix, SparseCSR):
        for i in range(matrix.shape[0]):
            mapped_row = row_of(i)
            count = matrix.ncol[i]
            if count == 0:
                continue
            first = matrix.ptr[i]
            mapped_cols = col_of(matrix.col[first:first + count])
            for col, value in zip(mapped_cols,
                                  matrix._D[first:first + count, :]):
                yield mapped_row, col, value
        return

    raise NotImplementedError(
        "The iterator for this sparse matrix has not been implemented")
Esempio n. 27
0
def _fix_connectivity(X, connectivity, affinity):
    """
    Prepare a connectivity matrix for structured clustering.

    The matrix goes through the following steps:

    - its shape is validated against the number of samples
    - it is made symmetric by adding its transpose (this builds a new
      object, the caller's matrix is not modified)
    - it is converted to LIL format if it is not already
    - it is completed when the graph has more than one connected component.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix representing `n_samples` samples to be clustered.

    connectivity : sparse matrix, default=None
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is `None`, i.e, the Ward algorithm is unstructured.

    affinity : {"euclidean", "precomputed"}, default="euclidean"
        Which affinity to use. At the moment `precomputed` and
        ``euclidean`` are supported. `euclidean` uses the
        negative squared Euclidean distance between points.
        It is only used (as ``metric``) when the graph must be completed.

    Returns
    -------
    connectivity : sparse matrix
        The fixed connectivity matrix, in LIL format.

    n_connected_components : int
        The number of connected components found in the graph before it was
        completed.

    Raises
    ------
    ValueError
        If `connectivity` is not of shape (n_samples, n_samples).
    """
    n_samples = X.shape[0]
    shape_matches = (
        connectivity.shape[0] == n_samples and connectivity.shape[1] == n_samples
    )
    if not shape_matches:
        raise ValueError(
            "Wrong shape for connectivity matrix: %s when X is %s"
            % (connectivity.shape, X.shape)
        )

    # Symmetrize by summing the matrix with its transpose
    connectivity = connectivity + connectivity.T

    # Bring the matrix to LIL: dense inputs are wrapped, other sparse
    # formats are converted, LIL inputs are kept as they are
    if sparse.isspmatrix(connectivity):
        if not sparse.isspmatrix_lil(connectivity):
            connectivity = connectivity.tolil()
    else:
        connectivity = sparse.lil_matrix(connectivity)

    # Count the connected components of the graph
    n_connected_components, labels = connected_components(connectivity)

    if n_connected_components <= 1:
        # a single component: nothing to complete
        return connectivity, n_connected_components

    warnings.warn(
        "the number of connected components of the "
        "connectivity matrix is %d > 1. Completing it to avoid "
        "stopping the tree early." % n_connected_components,
        stacklevel=2,
    )
    # XXX: Can we do without completing the matrix?
    connectivity = _fix_connected_components(
        X=X,
        graph=connectivity,
        n_connected_components=n_connected_components,
        component_labels=labels,
        metric=affinity,
        mode="connectivity",
    )
    return connectivity, n_connected_components
Esempio n. 28
0
def _update_z_multi_idx(X_i,
                        D,
                        reg,
                        z0_i,
                        debug,
                        solver='l-bfgs',
                        solver_kwargs=dict(),
                        freeze_support=False,
                        loss='l2',
                        loss_params=dict(),
                        timing=False):
    """Update the activations of one trial with the selected solver.

    Parameters
    ----------
    X_i : array, shape (n_channels, n_times)
        Signal of the trial.
    D : array
        Dictionary, either 2-D with shape
        (n_atoms, n_channels + n_times_atom) or 3-D with shape
        (n_atoms, n_channels, n_times_atom).
    reg : float
        Forwarded unchanged to `gradient_zi` and `_coordinate_descent_idx`
        (presumably the regularization weight - confirm against callers).
    z0_i : array, LIL sparse matrix or None
        Initial activations. `None` starts from zeros. A LIL matrix is only
        accepted with ``solver='lgcd'``.
    debug : bool
        Not used in this function.
    solver : {'l-bfgs', 'ista', 'fista', 'lgcd'}
        Solver used for the update.
    solver_kwargs : dict
        Solver options. Keys read here: 'factr' and 'maxiter' for 'l-bfgs';
        'tol', 'n_seg', 'max_iter' and 'strategy' for 'lgcd'. For 'ista' and
        'fista' the whole dict overrides the default arguments passed to
        `fista`. The dict is only read, never modified, by this function.
    freeze_support : bool
        With 'l-bfgs', entries that are zero in `z0_i` are bounded to
        (0, 0); with 'lgcd' the flag is forwarded to the solver. Requires
        `z0_i` to be given.
    loss : str
        Forwarded to `gradient_zi`. The sufficient statistics `ztz` and
        `ztX` are only computed when it is 'l2'.
    loss_params : dict
        Forwarded to `gradient_zi`.
    timing : bool
        If True, the objective values and the elapsed times are recorded.

    Returns
    -------
    z_hat : array or LIL sparse matrix, shape (n_atoms, n_times_valid)
        Updated activations.
    ztz, ztX : array or None
        Statistics computed from `z_hat` when ``loss == 'l2'``, else None.
    pobj : list or None
        Objective values; None for 'l-bfgs' and 'lgcd' when `timing` is
        False.
    times : list or None
        Elapsed times; None when `timing` is False.

    Raises
    ------
    AssertionError
        If `freeze_support` is set while `z0_i` is None.
    NotImplementedError
        If `z0_i` is a LIL matrix and the solver is not 'lgcd'.
    ValueError
        If `solver` is not one of the supported names.
    """
    t_start = time.time()
    n_channels, n_times = X_i.shape
    if D.ndim == 2:
        # rank-1 storage: each row concatenates n_channels and n_times_atom
        # coefficients
        n_atoms, n_channels_n_times_atom = D.shape
        n_times_atom = n_channels_n_times_atom - n_channels
    else:
        n_atoms, n_channels, n_times_atom = D.shape
    n_times_valid = n_times - n_times_atom + 1

    assert not (freeze_support and z0_i is None), 'Impossible !'

    if is_lil(z0_i) and solver != "lgcd":
        raise NotImplementedError()

    constants = {}
    if solver == "lgcd":
        constants['DtD'] = compute_DtD(D=D, n_channels=n_channels)
    init_timing = time.time() - t_start

    def func_and_grad(zi):
        return gradient_zi(Xi=X_i,
                           zi=zi,
                           D=D,
                           constants=constants,
                           reg=reg,
                           return_func=True,
                           flatten=True,
                           loss=loss,
                           loss_params=loss_params)

    # Starting point: flat vector for the dense solvers, LIL kept as is.
    if z0_i is None:
        f0 = np.zeros(n_atoms * n_times_valid)
    elif is_lil(z0_i):
        f0 = z0_i
    else:
        f0 = z0_i.reshape(n_atoms * n_times_valid)

    times, pobj = None, None
    if timing:
        times = [init_timing]
        pobj = [func_and_grad(f0)[0]]
        # a one-element list so that the callback below can rebind the value
        t_start = [time.time()]

    if solver == 'l-bfgs':
        if freeze_support:
            # pin the entries outside the initial support to zero
            bounds = [(0, 0) if z == 0 else (0, None) for z in f0]
        else:
            bounds = BoundGenerator(n_atoms * n_times_valid)
        if timing:

            def callback(xk):
                times.append(time.time() - t_start[0])
                pobj.append(func_and_grad(xk)[0])
                # use a reference to have access inside this function
                t_start[0] = time.time()
        else:
            callback = None
        factr = solver_kwargs.get('factr', 1e15)  # default value
        maxiter = solver_kwargs.get('maxiter', 15000)  # default value
        z_hat, f, d = optimize.fmin_l_bfgs_b(func_and_grad,
                                             f0,
                                             fprime=None,
                                             args=(),
                                             approx_grad=False,
                                             bounds=bounds,
                                             factr=factr,
                                             maxiter=maxiter,
                                             callback=callback)

    elif solver in ("ista", "fista"):
        # Default args
        fista_kwargs = dict(max_iter=100,
                            eps=None,
                            verbose=0,
                            restart=None,
                            scipy_line_search=False,
                            momentum=(solver == "fista"))
        fista_kwargs.update(solver_kwargs)

        def objective(z_hat):
            return func_and_grad(z_hat)[0]

        def grad(z_hat):
            return func_and_grad(z_hat)[1]

        def prox(z_hat, ):
            # projection on the non-negative orthant
            return np.maximum(z_hat, 0.)

        output = fista(objective,
                       grad,
                       prox,
                       None,
                       f0,
                       adaptive_step_size=True,
                       timing=timing,
                       name="Update z",
                       **fista_kwargs)
        if timing:
            z_hat, pobj, times = output
            times[0] += init_timing
        else:
            z_hat, pobj = output

    elif solver == "lgcd":
        if not is_lil(f0):
            f0 = f0.reshape(n_atoms, n_times_valid)

        # Default values
        tol = solver_kwargs.get('tol', 1e-1)
        n_seg = solver_kwargs.get('n_seg', 'auto')
        max_iter = solver_kwargs.get('max_iter', 1e15)
        strategy = solver_kwargs.get('strategy', 'greedy')
        output = _coordinate_descent_idx(X_i,
                                         D,
                                         constants,
                                         reg=reg,
                                         z0=f0,
                                         freeze_support=freeze_support,
                                         tol=tol,
                                         max_iter=max_iter,
                                         n_seg=n_seg,
                                         strategy=strategy,
                                         timing=timing,
                                         name="Update z")
        if timing:
            z_hat, pobj, times = output
            times[0] += init_timing
        else:
            z_hat = output
    else:
        # list every solver handled above, 'lgcd' included
        raise ValueError("Unrecognized solver %s. Must be 'ista', 'fista',"
                         " 'l-bfgs' or 'lgcd'." % solver)

    if not is_lil(z_hat):
        z_hat = z_hat.reshape(n_atoms, n_times_valid)

    if loss == 'l2':
        if not is_lil(z_hat):
            # the helpers expect a leading trial axis
            ztz = compute_ztz(z_hat[None], n_times_atom)
            ztX = compute_ztX(z_hat[None], X_i[None])
        else:
            cython_code._assert_cython()
            ztz = cython_code._fast_compute_ztz([z_hat], n_times_atom)
            ztX = cython_code._fast_compute_ztX([z_hat], X_i[None])
    else:
        ztz, ztX = None, None

    return z_hat, ztz, ztX, pobj, times
Esempio n. 29
0
def ispmatrix(matrix, map_row=None, map_col=None):
    """ Iterator for iterating rows and columns for non-zero elements in a `scipy.sparse.*_matrix` (or `SparseCSR`)

    If either `map_row` or `map_col` are not None the generator will only yield
    the unique values: a matrix row whose mapped row index has already been
    visited is skipped entirely, and within a visited row every mapped column
    index is yielded at most once.

    Parameters
    ----------
    matrix : scipy.sparse.sp_matrix
      the sparse matrix to iterate non-zero elements
    map_row : func, optional
      map each row entry through the function `map_row`, defaults to `None` which is
      equivalent to no mapping. Must return non-negative integers.
    map_col : func, optional
      map each column entry through the function `map_col`, defaults to `None` which is
      equivalent to no mapping. Must return non-negative integers.

    Yields
    ------
    int, int
       the row, column indices of the non-zero elements

    Raises
    ------
    ValueError
       if a mapping is requested for a COO or CSC matrix
    NotImplementedError
       if the matrix type is not supported
    """

    if map_row is None and map_col is None:
        # Skip unique checks
        for r, c in _ispmatrix_all(matrix):
            yield r, c
        return

    if map_row is None:
        map_row = lambda x: x
    if map_col is None:
        map_col = lambda x: x
    map_row = np.vectorize(map_row)
    map_col = np.vectorize(map_col)

    # The "seen" flags are indexed by the *mapped* indices, hence they must
    # reach up to the largest mapped value. Counting the unique values alone
    # is only enough when the mapping is contiguous (0, 1, ..., n - 1).
    mapped_rows = map_row(arange(matrix.shape[0], dtype=np.int32))
    mapped_cols = map_col(arange(matrix.shape[1], dtype=np.int32))
    nrow = max(len(unique(mapped_rows)), int(mapped_rows.max()) + 1)
    ncol = max(len(unique(mapped_cols)), int(mapped_cols.max()) + 1)
    rows = zeros(nrow, dtype=np.bool_)
    cols = zeros(ncol, dtype=np.bool_)

    # Consider using the numpy nditer function for buffered iterations
    #it = np.nditer([geom.o2a(tmp.row), geom.o2a(tmp.col % geom.no), tmp.data],
    #               flags=['buffered'], op_flags=['readonly'])

    if isspmatrix_csr(matrix):
        for r in range(matrix.shape[0]):
            rr = map_row(r)
            if rows[rr]:
                continue
            rows[rr] = True
            # the column flags are per (mapped) row
            cols[:] = False
            for ind in range(matrix.indptr[r], matrix.indptr[r + 1]):
                c = map_col(matrix.indices[ind])
                if cols[c]:
                    continue
                cols[c] = True
                yield rr, c

    elif isspmatrix_lil(matrix):
        for r in range(matrix.shape[0]):
            rr = map_row(r)
            if rows[rr]:
                continue
            rows[rr] = True
            cols[:] = False
            # the vectorized map cannot be called on an empty list
            if len(matrix.rows[r]) == 0:
                continue
            for c in map_col(matrix.rows[r]):
                if cols[c]:
                    continue
                cols[c] = True
                yield rr, c

    elif isspmatrix_coo(matrix):
        raise ValueError(
            "mapping and unique returns are not implemented for COO matrix")

    elif isspmatrix_csc(matrix):
        raise ValueError(
            "mapping and unique returns are not implemented for CSC matrix")

    elif isinstance(matrix, SparseCSR):
        for r in range(matrix.shape[0]):
            rr = map_row(r)
            if rows[rr]:
                continue
            rows[rr] = True
            cols[:] = False
            n = matrix.ncol[r]
            if n == 0:
                continue
            ptr = matrix.ptr[r]
            for c in map_col(matrix.col[ptr:ptr + n]):
                if cols[c]:
                    continue
                cols[c] = True
                yield rr, c

    else:
        raise NotImplementedError(
            "The iterator for this sparse matrix has not been implemented")
Esempio n. 30
0
def transform(dayuserwords, userregionmap, ndays):
    """Regroup a (day, user) x word matrix by region.

    Parameters
    ----------
    dayuserwords : scipy sparse matrix, shape (ndays * U, W)
        One row per (day, user) pair in day-major order: row ``n * U + u``
        holds day ``n`` of user ``u``.  Converted to LIL when necessary.
    userregionmap : dict
        Maps a user index to a 1-based region id.  The region ids must be
        contiguous (``1..R``), otherwise a ``KeyError`` is raised while
        iterating over the regions.
    ndays : int
        Number of days stacked in ``dayuserwords``.

    Returns
    -------
    regiondayuserword : csr_matrix, shape (R * ndays * U, W)
        Rows ordered by region, then day, then user.  Rows of users that do
        not belong to the region of the enclosing block are left empty.
    regiondayworduser : csr_matrix, shape (R * ndays * W, U)
        The transpose of every (region, day) block of ``regiondayuserword``,
        stacked vertically in the same region/day order.
    """
    regionusermap = {}
    for user, region in userregionmap.items():
        regionusermap.setdefault(region - 1, []).append(user)

    N = ndays
    # Floor division: the user count must stay an integer on Python 3 too.
    U = dayuserwords.shape[0] // N
    W = dayuserwords.shape[1]
    R = len(regionusermap)

    logger.debug("Preparing Output Matrices")
    if not ssp.isspmatrix_lil(dayuserwords):
        logger.debug("The data array must be lil, transforming...")
        dayuserwords = dayuserwords.tolil()

    logger.debug("Filling (R x D x U, W) matrix")
    # Allocate the target with its final shape and fill it row by row instead
    # of overwriting the private ``_shape`` / ``rows`` / ``data`` attributes
    # with plain Python lists.
    regiondayuserword = ssp.lil_matrix((R * N * U, W),
                                       dtype=dayuserwords.dtype)
    src_rows = dayuserwords.rows
    src_data = dayuserwords.data
    for r in range(R):
        logger.debug("Starting region: %d", r)
        rusers = set(regionusermap[r])
        for n in range(N):
            logger.debug("Starting day: %d", n)
            base = (r * N + n) * U
            for u in range(U):
                if u not in rusers:
                    continue
                src = n * U + u
                # Copy the lists so the output never aliases the input.
                regiondayuserword.rows[base + u] = list(src_rows[src])
                regiondayuserword.data[base + u] = list(src_data[src])
    logger.debug("... cleaning up Filling (R x D x U, W) matrix")
    regiondayuserword = regiondayuserword.tocsr()

    logger.debug("Filling (R x D x W, U) matrix")
    # Collect every transposed block first and stack once; stacking inside
    # the loop re-copies the accumulated matrix on every iteration.
    blocks = [regiondayuserword[x * U:(x + 1) * U, :].T
              for x in range(R * N)]
    logger.debug("... cleaning up Filling (R x D x W, U) matrix")
    if blocks:
        regiondayworduser = ssp.vstack(blocks, format="csr")
    else:
        regiondayworduser = ssp.csr_matrix((0, U),
                                           dtype=regiondayuserword.dtype)
    return regiondayuserword, regiondayworduser
	
Esempio n. 31
0
def ward_tree(X, connectivity=None, n_components=None, copy=True,
              n_clusters=None):
    """Ward clustering based on a Feature matrix.

    Recursively merges the pair of clusters that minimally increases
    within-cluster variance.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        feature matrix  representing n_samples samples to be clustered.
        A 1D input is treated as n_samples samples with a single feature.

    connectivity : sparse matrix.
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e, the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.

    copy : bool (optional)
        Make a copy of connectivity or work inplace. If connectivity
        is not of LIL type there will be a copy in any case.

    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.

    Returns
    -------
    children : 2D array, shape (n_nodes, 2)
        The children of each non-leaf node. Values less than `n_samples` refer
        to leaves of the tree. A greater value `i` indicates a node with
        children `children[i - n_samples]`.

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, elsewhere 'None' is returned.

    Raises
    ------
    ValueError
        If the connectivity matrix is not of shape (n_samples, n_samples).
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    if connectivity is None:
        if n_clusters is not None:
            warnings.warn('Early stopping is implemented only for '
                          'structured Ward clustering (i.e. with '
                          'explicit connectivity.', stacklevel=2)
        out = hierarchy.ward(X)
        # builtin int: the np.int alias no longer exists in recent NumPy
        children_ = out[:, :2].astype(int)
        return children_, 1, n_samples, None

    # Compute the number of nodes
    labels = None
    if n_components is None:
        n_components, labels = cs_graph_components(connectivity)
    elif n_components > 1:
        # The caller supplied the count, but completing the graph below
        # still needs the component label of every sample.
        _, labels = cs_graph_components(connectivity)

    # Convert connectivity matrix to LIL with a copy if needed
    if sparse.isspmatrix_lil(connectivity) and copy:
        connectivity = connectivity.copy()
    elif not sparse.isspmatrix(connectivity):
        connectivity = sparse.lil_matrix(connectivity)
    else:
        connectivity = connectivity.tolil()

    if n_components > 1:
        warnings.warn("the number of connected components of the "
                      "connectivity matrix is %d > 1. Completing it to avoid "
                      "stopping the tree early." % n_components)
        connectivity = _fix_connectivity(X, connectivity, n_components, labels)
        n_components = 1

    if n_clusters is None:
        n_nodes = 2 * n_samples - n_components
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    if (connectivity.shape[0] != n_samples
            or connectivity.shape[1] != n_samples):
        raise ValueError('Wrong shape for connectivity matrix: %s '
                         'when X is %s' % (connectivity.shape, X.shape))

    # create inertia matrix
    coord_row = []
    coord_col = []
    A = []
    for ind, row in enumerate(connectivity.rows):
        # A keeps the full neighbor list of every node
        A.append(row)
        # Only one triangle is kept for the initial distances, so that
        # every edge is counted once
        row = [i for i in row if i < ind]
        coord_row.extend(len(row) * [ind, ])
        coord_col.extend(row)

    coord_row = np.array(coord_row, dtype=int)
    coord_col = np.array(coord_col, dtype=int)

    # build moments as a list
    moments_1 = np.zeros(n_nodes)
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features))
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=float)
    _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col,
                                    inertia)
    inertia = list(six.moves.zip(inertia, coord_row, coord_col))
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=int)
    heights = np.zeros(n_nodes)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []

    not_visited = np.empty(n_nodes, dtype=np.int8)

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge: pop until both nodes are still unmerged
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j], heights[k] = k, k, inert
        children.append([i, j])
        used_node[i] = used_node[j] = False

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        coord_col = []
        not_visited.fill(1)
        not_visited[k] = 0
        _hierarchical._get_parents(A[i], coord_col, parent, not_visited)
        _hierarchical._get_parents(A[j], coord_col, parent, not_visited)
        # register the new node k as a neighbor of every node it touches
        for neighbor in coord_col:
            A[neighbor].append(k)
        A.append(coord_col)
        coord_col = np.array(coord_col, dtype=int)
        coord_row = np.empty_like(coord_col)
        coord_row.fill(k)
        n_additions = len(coord_row)
        ini = np.empty(n_additions, dtype=float)

        _hierarchical.compute_ward_dist(moments_1, moments_2,
                                        coord_row, coord_col, ini)
        for idx in range(n_additions):
            heappush(inertia, (ini[idx], k, coord_col[idx]))

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    children = np.array(children)  # return numpy array for efficient caching

    return children, n_components, n_leaves, parent
Esempio n. 32
0
    def add(self, other, in_place=True, write_to_self=False):
        """
        Add two matrices and return the object that holds the sum.

        The direction of the update is selected by `write_to_self`:

        - write_to_self=False (default): this matrix is added to `other`;
          the sum is stored in `other`, or in a copy of `other` when
          `in_place` is False.
        - write_to_self=True: `other` is added to this matrix; the sum is
          stored in `self`, or in a copy of `self` when `in_place` is False.

        Args:
            other: a matrix object of the same class as this object, a
                StateMatrixNumpy, or an np.ndarray
            in_place: If True, the target described above is updated
                in place. If False, a copy of the target is updated and
                returned instead.
            write_to_self: If True the target is (self), otherwise the
                target is (other).

        Returns:
            The target object (or its copy) holding the sum. When the target
            is an np.ndarray and `in_place` is False, a new array holding
            the sum is returned.

        Raises:
            TypeError: if a matrix has an unsupported type, if the sparse
                format of the target is not recognized, or if the sum cannot
                be safely cast into an np.ndarray target updated in place.

        """
        if write_to_self:
            # update the reference matrix inside this object.
            if not in_place:
                result_mat = self.copy()
            else:
                result_mat = self
            if isinstance(other, (StateMatrixNumpy, self.__class__)):
                source_matrix = other
                source_matrix_ref = other._raw_matrix
            elif isinstance(other, np.ndarray):
                source_matrix = other
                source_matrix_ref = other
            else:
                raise TypeError(
                    "matrix has to be either 'StateMatrixNumpy', or 'StateMatrixSpSciPy', or 'np.ndarray' "
                )
        else:
            # the target is the input matrix or a copy of it
            if not in_place:
                result_mat = other.copy()
            else:
                result_mat = other
            source_matrix = self
            source_matrix_ref = self._raw_matrix
        #
        # Check the result matrix format
        if isinstance(result_mat, self.__class__):
            result_mat_ref = result_mat.get_raw_matrix_ref()
            # Rebuild the sum with the constructor matching the sparse format
            # of the target, so that the target keeps its storage format.
            format_builders = (
                (sparse.isspmatrix_bsr, sparse.bsr_matrix),
                (sparse.isspmatrix_coo, sparse.coo_matrix),
                (sparse.isspmatrix_csc, sparse.csc_matrix),
                (sparse.isspmatrix_csr, sparse.csr_matrix),
                (sparse.isspmatrix_dia, sparse.dia_matrix),
                (sparse.isspmatrix_dok, sparse.dok_matrix),
                (sparse.isspmatrix_lil, sparse.lil_matrix),
            )
            for is_format, build in format_builders:
                if is_format(result_mat._raw_matrix):
                    result_mat_ref = build(result_mat_ref + source_matrix_ref)
                    break
            else:
                raise TypeError(
                    "Unsupported Format! My format has been tapered with!")
            result_mat.set_raw_matrix_ref(result_mat_ref)
            result_mat._update_attributes()

        elif isinstance(result_mat, StateMatrixNumpy):
            result_mat_ref = result_mat.get_raw_matrix_ref()
            if isinstance(source_matrix, self.__class__):
                result_mat_ref = result_mat_ref + source_matrix_ref
                # the mixed sum may not be a plain array: densify it
                try:
                    result_mat_ref = result_mat_ref.toarray()
                except AttributeError:
                    result_mat_ref = np.asarray(result_mat_ref)
            elif isinstance(source_matrix, (np.ndarray, StateMatrixNumpy)):
                result_mat_ref = result_mat_ref + source_matrix_ref

            result_mat.set_raw_matrix_ref(result_mat_ref)

        elif isinstance(result_mat, np.ndarray):
            total = None
            if isinstance(source_matrix, self.__class__):
                total = result_mat + source_matrix_ref
                try:
                    total = total.toarray()
                except AttributeError:
                    total = np.asarray(total)
            elif isinstance(source_matrix, (np.ndarray, StateMatrixNumpy)):
                total = result_mat + source_matrix_ref

            # The sum has to reach the returned object; it used to be
            # computed and then dropped.
            if total is not None:
                if in_place:
                    # write through to the caller's array
                    np.copyto(result_mat, total, casting='same_kind')
                else:
                    result_mat = total

        else:
            print(type.mro(type(other)))
            print(other)
            raise TypeError(
                "matrix has to be either 'StateMatrixNumpy', or 'StateMatrixSpSciPy', or 'np.ndarray' "
            )
        #
        return result_mat
Esempio n. 33
0
def is_list_of_lil(z):
    """Return True if `z` is a non-empty list starting with a LIL matrix.

    Only the first element is inspected. An empty list, or anything that is
    not a list, gives False.
    """
    return (isinstance(z, list) and len(z) > 0
            and sparse.isspmatrix_lil(z[0]))
Esempio n. 34
0
def is_lil(z):
    """Tell whether `z` is a SciPy sparse matrix stored in LIL format."""
    lil_flag = sparse.isspmatrix_lil(z)
    return lil_flag