def apply(self, A=None, b=None):
    # first we need to know the boundary facets
    location_mask = self.fe_space.mesh.boundary(fnc=self.domain)
    # then we need the dofs that are associated with those entities
    dim = self.fe_space.mesh.dimension - 1
    dir_dof_mask = self.fe_space.extract_dofs(d=dim, mask=location_mask)
    dir_dof_ind = dir_dof_mask.nonzero()[0]
    # replace every row of a dirichlet dof
    # with a row that has the single value 1
    # in the corresponding column
    if not sparse.isspmatrix_lil(A):
        raise TypeError('Wrong sparse matrix format ({})'.format(A.format))
    dir_dof_rows = dir_dof_ind[:, None].tolist()
    dir_dof_data = [[1.]] * dir_dof_ind.size
    A.rows[dir_dof_ind] = dir_dof_rows
    A.data[dir_dof_ind] = dir_dof_data
    # set the corresponding load vector entries as the
    # l2 projection of the boundary function
    l2_proj = L2Projection.project(fnc=self.ud, fe_space=self.fe_space,
                                   codim=1, mask=None)
    b[dir_dof_ind] = l2_proj[dir_dof_ind, None]
    return A, b
def threshold_to_zero(mx, threshold):
    """Set values in a sparse matrix whose absolute value is below
    `threshold` to zero.

    Return the 'coo' format sparse matrix.

    Parameters
    ----------
    mx : array_like
        Sparse matrix.
    threshold : float
        Threshold parameter.
    """
    high_values_indexes = set(zip(*((np.abs(mx) >= threshold).nonzero())))
    nonzero_indexes = zip(*(mx.nonzero()))
    if not sp.isspmatrix_lil(mx):
        mx = mx.tolil()
    for s in nonzero_indexes:
        if s not in high_values_indexes:
            mx[s] = 0.0
    mx = mx.tocoo()
    mx.eliminate_zeros()
    return mx
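# --- Minimal usage sketch for threshold_to_zero, assuming the module
# aliases used by the function body above: `np` for numpy and `sp` for
# scipy.sparse.
import numpy as np
import scipy.sparse as sp

m = sp.lil_matrix((3, 3))
m[0, 0], m[1, 2], m[2, 1] = 0.05, 0.5, -0.7
# Entries with |value| < 0.1 are dropped; the result is in COO format.
print(threshold_to_zero(m, 0.1).toarray())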
def _slice(data, obs_selector=None, vars_selector=None):
    """
    Slice data using any selector that the AnnData object supports
    for slicing. If a selector is None, do not slice on that axis.

    This method exists to optimize filtering/slicing of sparse data
    whose access patterns impact slicing performance.

    https://docs.scipy.org/doc/scipy/reference/sparse.html
    """
    prefer_row_access = (sparse.isspmatrix_csr(data._X) or
                         sparse.isspmatrix_lil(data._X) or
                         sparse.isspmatrix_bsr(data._X))
    if prefer_row_access:
        # Row-major slicing
        if obs_selector is not None:
            data = data[obs_selector, :]
        if vars_selector is not None:
            data = data[:, vars_selector]
    else:
        # Col-major slicing
        if vars_selector is not None:
            data = data[:, vars_selector]
        if obs_selector is not None:
            data = data[obs_selector, :]
    return data
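# --- Design-note sketch: for row-oriented formats (CSR/LIL/BSR) it is
# cheaper to apply the row (obs) selector first, while column-oriented
# formats (CSC) prefer the column selector first. A scipy-only illustration
# of the same access-pattern idea, without the AnnData dependency:
from scipy import sparse

m = sparse.random(1000, 500, density=0.01, format='csr')
rows, cols = [1, 5, 7], [0, 2]
sub_row_major = m[rows, :][:, cols]          # slice rows first on CSR
sub_col_major = m.tocsc()[:, cols][rows, :]  # slice columns first on CSC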
def sparse_remove_row(X, to_remove):
    # LIL supports cheap per-row views, so convert first
    if not sps.isspmatrix_lil(X):
        X = X.tolil()
    to_keep = [i for i in range(0, X.shape[0]) if i not in to_remove]
    Y = sps.vstack([X.getrowview(i) for i in to_keep])
    return Y
def _set_weight_class(adata: AnnData, key: str) -> W:
    X = adata.obsp[key]
    if not isspmatrix_lil(X):
        X = X.tolil()
    neighbors = dict(enumerate(X.rows))
    weights = dict(enumerate(X.data))
    return libpysal.weights.W(neighbors, weights, ids=adata.obs.index.values)
def abs_sparse(X):
    """Element-wise absolute value of a sparse matrix."""
    X_abs = X.copy()
    if sparse.isspmatrix_csr(X) or sparse.isspmatrix_csc(X):
        X_abs.data = np.abs(X_abs.data)
    elif sparse.isspmatrix_lil(X):
        X_abs.data = np.array([np.abs(L) for L in X_abs.data])
    else:
        raise ValueError("Only supports CSR/CSC and LIL matrices")
    return X_abs
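# --- Minimal usage sketch for abs_sparse, assuming the module aliases used
# by the function body above: `np` for numpy, `sparse` for scipy.sparse.
import numpy as np
from scipy import sparse

A = sparse.csr_matrix(np.array([[0., -2.], [3., 0.]]))
print(abs_sparse(A).toarray())  # [[0. 2.] [3. 0.]]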
def matrix_conflicts(L):
    """
    Given an N x M matrix where L_{i,j} is the label given by the jth LF
    to the ith candidate:
    Return the **fraction of candidates that each LF _conflicts with
    other LFs on_.**
    """
    B = L.copy()
    if not sparse.issparse(B):
        for row in range(B.shape[0]):
            # Zero out rows whose nonzero labels all agree (no conflict)
            if np.unique(np.array(B[row][np.nonzero(B[row])])).size == 1:
                B[row] = 0
        return matrix_coverage(sparse_nonzero(B))
    if not (sparse.isspmatrix_csc(B) or sparse.isspmatrix_lil(B)
            or sparse.isspmatrix_csr(B)):
        raise ValueError("Only supports CSR/CSC and LIL matrices")
    if sparse.isspmatrix_csc(B) or sparse.isspmatrix_lil(B):
        B = B.tocsr()
    for row in range(B.shape[0]):
        if np.unique(B.getrow(row).data).size == 1:
            B.data[B.indptr[row]:B.indptr[row + 1]] = 0
    return matrix_coverage(sparse_nonzero(B))
def sparse_matrix_report(m):
    # NOTE: `size` is an external helper, assumed to format a byte count as
    # a human-readable string.
    print(repr(m))
    print('Number of non-zeros :', m.nnz)
    print('Sparsity :', 1 - m.nnz / (m.shape[0] * m.shape[1]))
    if isspmatrix_csr(m) or isspmatrix_csc(m):
        print('data length : {} ({})'.format(len(m.data), m.data.dtype))
        print('indptr length : {} ({})'.format(len(m.indptr), m.indptr.dtype))
        print('indices length : {} ({})'.format(len(m.indices), m.indices.dtype))
        print('Size :', size(m.data.nbytes + m.indptr.nbytes + m.indices.nbytes))
        print('10 x 10 preview:')
        print(m[:10, :10].toarray())
    elif isspmatrix_bsr(m):
        print('data length : {} ({})'.format(len(m.data), m.data.dtype))
        print('indptr length : {} ({})'.format(len(m.indptr), m.indptr.dtype))
        print('indices length : {} ({})'.format(len(m.indices), m.indices.dtype))
        print('blocksize length : {}'.format(m.blocksize))
        print('Size :', size(m.data.nbytes + m.indptr.nbytes + m.indices.nbytes))
        print('preview:')
        print(m)
    elif isspmatrix_coo(m):
        print('data length : {} ({})'.format(len(m.data), m.data.dtype))
        print('row length : {} ({})'.format(len(m.row), m.row.dtype))
        print('col length : {} ({})'.format(len(m.col), m.col.dtype))
        print('Size :', size(m.data.nbytes + m.row.nbytes + m.col.nbytes))
        print('preview:')
        print(m)
    elif isspmatrix_dok(m):
        print('Size :', size(sys.getsizeof(m)))
        print('10 x 10 preview:')
        print(m[:10, :10].toarray())
    elif isspmatrix_dia(m):
        print('data length : {} ({})'.format(len(m.data), m.data.dtype))
        print('Offsets : {} ({})'.format(len(m.offsets), m.offsets.dtype))
        print('Size :', size(m.data.nbytes + m.offsets.nbytes))
        print('(no preview)')
    elif isspmatrix_lil(m):
        print('data length : {} ({})'.format(len(m.data), m.data.dtype))
        print('rows : {} ({})'.format(len(m.rows), m.rows.dtype))
        print('Size :', size(m.data.nbytes + m.rows.nbytes))
        print('(no preview)')
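# --- Minimal usage sketch for sparse_matrix_report. The `size` helper is
# not defined above; a simple hypothetical stand-in is assumed here, along
# with the scipy.sparse predicate imports the function body relies on.
import sys
from scipy.sparse import (random as sparse_random, isspmatrix_csr,
                          isspmatrix_csc, isspmatrix_bsr, isspmatrix_coo,
                          isspmatrix_dok, isspmatrix_dia, isspmatrix_lil)

def size(nbytes):
    # hypothetical stand-in for the undefined byte formatter
    return '%.1f KiB' % (nbytes / 1024.)

m = sparse_random(100, 100, density=0.05, format='csr')
sparse_matrix_report(m)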
def _fix_connectivity(X, connectivity, affinity):
    """
    Fixes the connectivity matrix:
        - copies it
        - makes it symmetric
        - converts it to LIL if necessary
        - completes it if necessary
    """
    n_samples = X.shape[0]
    if connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples:
        raise ValueError(
            "Wrong shape for connectivity matrix: %s when X is %s"
            % (connectivity.shape, X.shape)
        )

    # Make the connectivity matrix symmetric:
    connectivity = connectivity + connectivity.T

    # Convert connectivity matrix to LIL
    if not sparse.isspmatrix_lil(connectivity):
        if not sparse.isspmatrix(connectivity):
            connectivity = sparse.lil_matrix(connectivity)
        else:
            connectivity = connectivity.tolil()

    # Compute the number of nodes
    n_connected_components, labels = connected_components(connectivity)

    if n_connected_components > 1:
        warnings.warn(
            "the number of connected components of the "
            "connectivity matrix is %d > 1. Completing it to avoid "
            "stopping the tree early." % n_connected_components,
            stacklevel=2,
        )
        # XXX: Can we do without completing the matrix?
        for i in range(n_connected_components):
            idx_i = np.where(labels == i)[0]
            Xi = X[idx_i]
            for j in range(i):
                idx_j = np.where(labels == j)[0]
                Xj = X[idx_j]
                if affinity == "precomputed":
                    D = X[np.ix_(idx_i, idx_j)]
                else:
                    D = pairwise_distances(Xi, Xj, metric=affinity)
                ii, jj = np.where(D == np.min(D))
                ii = ii[0]
                jj = jj[0]
                connectivity[idx_i[ii], idx_j[jj]] = True
                connectivity[idx_j[jj], idx_i[ii]] = True

    return connectivity, n_connected_components
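# --- Minimal usage sketch for _fix_connectivity, assuming the imports its
# body relies on (named explicitly below).
import warnings
import numpy as np
from scipy import sparse
from scipy.sparse.csgraph import connected_components
from sklearn.metrics import pairwise_distances

X = np.array([[0.], [0.1], [5.], [5.1]])
conn = sparse.lil_matrix((4, 4))
conn[0, 1] = 1   # one connected pair ...
conn[2, 3] = 1   # ... and a second, disconnected pair
fixed, n_cc = _fix_connectivity(X, conn, affinity="euclidean")
print(n_cc)  # 2: the helper warns and adds the shortest missing link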
def sparse_nonzero(X):
    """Sparse matrix with value 1 for i,jth entry != 0"""
    X_nonzero = X.copy()
    if not sparse.issparse(X):
        X_nonzero[X_nonzero != 0] = 1
        return X_nonzero
    if sparse.isspmatrix_csr(X) or sparse.isspmatrix_csc(X):
        X_nonzero.data[X_nonzero.data != 0] = 1
    elif sparse.isspmatrix_lil(X):
        X_nonzero.data = [np.ones(len(L)) for L in X_nonzero.data]
    else:
        raise ValueError("Only supports CSR/CSC and LIL matrices")
    return X_nonzero
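# --- Minimal usage sketch for sparse_nonzero, assuming the module aliases
# used by the function body above: `np` for numpy, `sparse` for scipy.sparse.
import numpy as np
from scipy import sparse

A = sparse.csr_matrix(np.array([[0., -2.], [3., 0.]]))
print(sparse_nonzero(A).toarray())  # [[0. 1.] [1. 0.]]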
def fit(self, k=100, max_iter=15):
    if self.recommender_data.preference_matrix.shape[1] < k:
        k = self.recommender_data.preference_matrix.shape[1]
    if not spMat.isspmatrix_lil(self.recommender_data.preference_matrix):
        self.recommender_data.preference_matrix = spMat.lil_matrix(
            self.recommender_data.preference_matrix)
    self.perp = None
    model = LDA_CVB0(self.recommender_data.preference_matrix, K=k)
    model.lda_learning(max_iter)
    self.user_matrix = model.documentdist()
    self.item_matrix = model.worddist()
def _fix_connectivity(X, connectivity, n_components=None,
                      affinity="euclidean"):
    """
    Fixes the connectivity matrix:
        - copies it
        - makes it symmetric
        - converts it to LIL if necessary
        - completes it if necessary
    """
    n_samples = X.shape[0]
    if (connectivity.shape[0] != n_samples or
            connectivity.shape[1] != n_samples):
        raise ValueError('Wrong shape for connectivity matrix: %s '
                         'when X is %s' % (connectivity.shape, X.shape))

    # Make the connectivity matrix symmetric:
    connectivity = connectivity + connectivity.T

    # Convert connectivity matrix to LIL
    if not sparse.isspmatrix_lil(connectivity):
        if not sparse.isspmatrix(connectivity):
            connectivity = sparse.lil_matrix(connectivity)
        else:
            connectivity = connectivity.tolil()

    # Compute the number of nodes
    n_components, labels = connected_components(connectivity)

    if n_components > 1:
        warnings.warn("the number of connected components of the "
                      "connectivity matrix is %d > 1. Completing it to avoid "
                      "stopping the tree early." % n_components,
                      stacklevel=2)
        # XXX: Can we do without completing the matrix?
        for i in range(n_components):
            idx_i = np.where(labels == i)[0]
            Xi = X[idx_i]
            for j in range(i):
                idx_j = np.where(labels == j)[0]
                Xj = X[idx_j]
                D = pairwise_distances(Xi, Xj, metric=affinity)
                ii, jj = np.where(D == np.min(D))
                ii = ii[0]
                jj = jj[0]
                connectivity[idx_i[ii], idx_j[jj]] = True
                connectivity[idx_j[jj], idx_i[ii]] = True
        n_components = 1

    return connectivity
def should_enforce_sparse(m, sparse_format: SparseFormat, policy: SparsePolicy,
                          dtype, sparse_values: bool = True) -> bool:
    """
    Returns whether it is preferable to convert a given matrix into a
    `scipy.sparse.csr_matrix`, `scipy.sparse.csc_matrix` or
    `scipy.sparse.dok_matrix`, depending on the format of the given matrix
    and a given `SparsePolicy`:

    If the given policy is `SparsePolicy.AUTO`, the matrix will be converted
    into the given sparse format, if possible, if the sparse matrix is
    expected to occupy less memory than a dense matrix. To be able to
    convert the matrix into a sparse format, it must be a
    `scipy.sparse.lil_matrix`, `scipy.sparse.dok_matrix` or
    `scipy.sparse.coo_matrix`. If the given sparse format is `csr` or `csc`
    and the matrix is already in that format, it will not be converted.

    If the given policy is `SparsePolicy.FORCE_SPARSE`, the matrix will
    always be converted into the specified sparse format, if possible.

    If the given policy is `SparsePolicy.FORCE_DENSE`, the matrix will
    always be converted into a dense matrix.

    :param m:               A `np.ndarray` or `scipy.sparse.matrix` to be checked
    :param sparse_format:   The `SparseFormat` to be used
    :param policy:          The `SparsePolicy` to be used
    :param dtype:           The type of the values that should be stored in the matrix
    :param sparse_values:   True, if the values must explicitly be stored when using a sparse format, False otherwise
    :return:                True, if it is preferable to convert the matrix into a sparse matrix of the given format,
                            False otherwise
    """
    if not issparse(m):
        # Given matrix is dense
        if policy != SparsePolicy.FORCE_SPARSE:
            return False
    elif (isspmatrix_csr(m) and sparse_format == SparseFormat.CSR) or (
            isspmatrix_csc(m) and sparse_format == SparseFormat.CSC):
        # Matrix is a `scipy.sparse.csr_matrix` or `scipy.sparse.csc_matrix`
        # and is already in the given sparse format
        return policy != SparsePolicy.FORCE_DENSE
    elif isspmatrix_lil(m) or isspmatrix_coo(m) or isspmatrix_dok(m):
        # Given matrix is in a format that might be converted into the
        # specified sparse format
        if policy == SparsePolicy.AUTO:
            return is_sparse(m, sparse_format=sparse_format, dtype=dtype,
                             sparse_values=sparse_values)
        else:
            return policy == SparsePolicy.FORCE_SPARSE

    raise ValueError('Matrix of type ' + type(m).__name__
                     + ' cannot be converted to format "'
                     + str(sparse_format) + '"')
def setdiag_range(mat, arr, ind=(), k=0):
    """
    Similar to instance method setdiag but with the option to
    specify a specific range along the diagonal.

    Currently only works with matrices of type lil_matrix.
    """
    if not sp.isspmatrix_lil(mat):
        raise ValueError(
            'argument mat must be of type scipy.sparse.lil_matrix')
    if not isinstance(arr, (list, tuple)):
        raise ValueError('argument arr must be of type list or tuple')
    if not isinstance(ind, (list, tuple)):
        raise ValueError('argument ind must be of type list or tuple')

    if ind[0] == 0:
        mat.setdiag(arr[0:ind[1]], k)

    diag_size = min(mat.shape) - abs(k)
    do_fill = False
    if ind[1] < 0 and ind[0] < ind[1] and diag_size + ind[1] > 0:
        start_ind = max(diag_size + ind[0], 0)
        end_ind = diag_size + ind[1]
        do_fill = True
    elif ind[0] > 0 and ind[1] >= ind[0] and ind[0] < diag_size:
        start_ind = ind[0]
        end_ind = min(ind[1], diag_size - 1)
        do_fill = True

    if do_fill:
        j = 0
        if k == 0:
            for i in range(start_ind, end_ind + 1):
                if j < len(arr):
                    mat[i, i] = arr[j]
                    j += 1
                else:
                    break
        elif k > 0:
            for i in range(start_ind, end_ind + 1):
                if j < len(arr):
                    mat[i, i + k] = arr[j]
                    j += 1
                else:
                    break
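# --- Minimal usage sketch for setdiag_range, assuming the alias `sp` for
# scipy.sparse used by the function body above.
import scipy.sparse as sp

m = sp.lil_matrix((5, 5))
setdiag_range(m, [1, 2, 3], ind=(1, 3), k=0)  # fill main diagonal, rows 1..3
print(m.toarray())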
def sparse_remove_row(X, to_remove):
    """ Delete rows from a sparse matrix

    Parameters
    ----------
    X : scipy.sparse matrix
    to_remove : a list of row indices to be removed.

    Returns
    -------
    Y : scipy.sparse matrix
    """
    if not sps.isspmatrix_lil(X):
        X = X.tolil()

    to_keep = [i for i in range(0, X.shape[0]) if i not in to_remove]
    Y = sps.vstack([X.getrowview(i) for i in to_keep])
    return Y
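# --- Minimal usage sketch for sparse_remove_row, assuming the alias `sps`
# for scipy.sparse used by the function body above.
import numpy as np
import scipy.sparse as sps

X = sps.lil_matrix(np.arange(12.).reshape(4, 3))
print(sparse_remove_row(X, [1, 3]).toarray())  # keeps rows 0 and 2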
def _ispmatrix_all(matrix):
    """ Iterator for iterating rows and columns for non-zero elements in a `scipy.sparse.*_matrix` (or `SparseCSR`)

    Parameters
    ----------
    matrix : scipy.sparse.sp_matrix
       the sparse matrix to iterate non-zero elements

    Yields
    ------
    int, int
       the row, column indices of the non-zero elements
    """
    if isspmatrix_csr(matrix):
        for r in range(matrix.shape[0]):
            for ind in range(matrix.indptr[r], matrix.indptr[r + 1]):
                yield r, matrix.indices[ind]

    elif isspmatrix_lil(matrix):
        for r in range(matrix.shape[0]):
            for c in matrix.rows[r]:
                yield r, c

    elif isspmatrix_coo(matrix):
        for r, c in zip(matrix.row, matrix.col):
            yield r, c

    elif isspmatrix_csc(matrix):
        for c in range(matrix.shape[1]):
            for ind in range(matrix.indptr[c], matrix.indptr[c + 1]):
                yield matrix.indices[ind], c

    elif isinstance(matrix, SparseCSR):
        for r in range(matrix.shape[0]):
            n = matrix.ncol[r]
            ptr = matrix.ptr[r]
            for c in matrix.col[ptr:ptr + n]:
                yield r, c
    else:
        raise NotImplementedError(
            "The iterator for this sparse matrix has not been implemented")
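# --- Minimal usage sketch for _ispmatrix_all on the scipy branches (the
# SparseCSR branch additionally requires sisl's SparseCSR class). Assumes
# the isspmatrix_* predicates used above are imported from scipy.sparse.
from scipy.sparse import lil_matrix

m = lil_matrix((2, 3))
m[0, 1], m[1, 2] = 1.0, 2.0
print(list(_ispmatrix_all(m)))  # [(0, 1), (1, 2)]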
def spiter(matrix):
    """ Iterator for iterating the elements in a ``scipy.sparse.*_matrix``

    This will always return:
    >>> (row, column, matrix-element)

    Currently this can iterate `coo`, `csc`, `lil` and `csr`, others may
    easily be added.

    Parameters
    ----------
    matrix : ``scipy.sparse.sp_matrix``
      the sparse matrix to iterate non-zero elements

    References
    ----------
    By stackoverflow user zeroth on
    https://stackoverflow.com/a/42625707
    """
    if isspmatrix_coo(matrix):
        for r, c, m in zip(matrix.row, matrix.col, matrix.data):
            yield r, c, m

    elif isspmatrix_csc(matrix):
        for c in range(matrix.shape[1]):
            for ind in range(matrix.indptr[c], matrix.indptr[c + 1]):
                yield matrix.indices[ind], c, matrix.data[ind]

    elif isspmatrix_csr(matrix):
        for r in range(matrix.shape[0]):
            for ind in range(matrix.indptr[r], matrix.indptr[r + 1]):
                yield r, matrix.indices[ind], matrix.data[ind]

    elif isspmatrix_lil(matrix):
        for r in range(matrix.shape[0]):
            for c, d in zip(matrix.rows[r], matrix.data[r]):
                yield r, c, d
    else:
        raise NotImplementedError(
            "The iterator for this sparse matrix has not been implemented")
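# --- Minimal usage sketch for spiter, assuming the isspmatrix_* predicates
# used above are imported from scipy.sparse.
from scipy.sparse import lil_matrix

m = lil_matrix((2, 3))
m[0, 1], m[1, 2] = 5.0, 7.0
for r, c, v in spiter(m):
    print(r, c, v)  # (0, 1, 5.0) then (1, 2, 7.0)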
def ward_tree(X, connectivity=None, n_components=None, copy=True):
    """Ward clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        feature matrix representing n_samples samples to be clustered

    connectivity : sparse matrix
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e, the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.

    copy : bool (optional)
        Make a copy of connectivity or work inplace. If connectivity
        is not of LIL type there will be a copy in any case.

    Returns
    -------
    children : list of pairs
        Length n_nodes, the children of each node. Leaves of the tree
        have an empty list of children.

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    if connectivity is None:
        out = hierarchy.ward(X)
        children_ = out[:, :2].astype(np.intp)
        return children_, 1, n_samples

    # Compute the number of nodes
    if n_components is None:
        n_components, labels = cs_graph_components(connectivity)

    # Convert connectivity matrix to LIL with a copy if needed
    if sparse.isspmatrix_lil(connectivity) and copy:
        connectivity = connectivity.copy()
    else:
        connectivity = connectivity.tolil()

    if n_components > 1:
        warnings.warn("the number of connected components of the"
                      " connectivity matrix is %d > 1. Completing it to avoid"
                      " stopping the tree early." % n_components)
        connectivity = _fix_connectivity(X, connectivity,
                                         n_components, labels)
        n_components = 1

    n_nodes = 2 * n_samples - n_components

    if (connectivity.shape[0] != n_samples or
            connectivity.shape[1] != n_samples):
        raise ValueError('Wrong shape for connectivity matrix: %s '
                         'when X is %s' % (connectivity.shape, X.shape))

    # Remove diagonal from connectivity matrix
    connectivity.setdiag(np.zeros(connectivity.shape[0]))

    # create inertia matrix
    coord_row = []
    coord_col = []
    A = []
    for ind, row in enumerate(connectivity.rows):
        A.append(row)
        # We keep only the upper triangular for the moments
        # Generator expressions are faster than arrays on the following
        row = [i for i in row if i < ind]
        coord_row.extend(len(row) * [ind, ])
        coord_col.extend(row)

    coord_row = np.array(coord_row, dtype=np.intp)
    coord_col = np.array(coord_col, dtype=np.intp)

    # build moments as a list
    moments_1 = np.zeros(n_nodes)
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features))
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=np.float64)
    _hierarchical.compute_ward_dist(moments_1, moments_2,
                                    coord_row, coord_col, inertia)
    inertia = list(zip(inertia, coord_row, coord_col))
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    heights = np.zeros(n_nodes)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []

    visited = np.empty(n_nodes, dtype=bool)

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j], heights[k] = k, k, inert
        children.append([i, j])
        used_node[i] = used_node[j] = False

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        coord_col = []
        visited[:] = False
        visited[k] = True
        for l in set(A[i]).union(A[j]):
            l = _hierarchical._get_parent(l, parent)
            if not visited[l]:
                visited[l] = True
                coord_col.append(l)
                A[l].append(k)
        A.append(coord_col)
        coord_col = np.array(coord_col, dtype=np.intp)
        coord_row = np.empty_like(coord_col)
        coord_row.fill(k)
        ini = np.empty(len(coord_row), dtype=np.float64)
        _hierarchical.compute_ward_dist(moments_1, moments_2,
                                        coord_row, coord_col, ini)
        for tupl in zip(ini, coord_row, coord_col):
            heappush(inertia, tupl)

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    children = np.array(children)  # return numpy array for efficient caching

    return children, n_components, n_leaves
def plot_activations_density(z_hat, n_times_atom, sfreq=1., threshold=0.01,
                             bandwidth='auto', axes=None, t_min=0,
                             plot_activations=False, colors=None):
    """
    Parameters
    ----------
    z_hat : array, shape (n_atoms, n_trials, n_times_valid)
        The sparse activation matrix.
    n_times_atom : int
        The support of the atom.
    sfreq : float
        Sampling frequency
    threshold : float
        Remove activations (normalized with the max) below this threshold
    bandwidth : float, array of float, or 'auto'
        Bandwidth (in sec) of the kernel
    axes : array of axes, or None
        Axes to plot into
    t_min : float
        Time offset for the xlabel display
    plot_activations : boolean
        If True, the significant activations are plotted as black dots
    colors : list of matplotlib compatible colors
        Colors of the plots
    """
    if sparse.isspmatrix_lil(z_hat[0]):
        z_hat = np.array([z.toarray() for z in z_hat])
    n_atoms, n_trials, n_times_valid = z_hat.shape

    # sum activations over all trials (axis 1), keeping one row per atom
    z_hat_sum = z_hat.sum(axis=1)

    if bandwidth == 'auto':
        bandwidth = n_times_atom

    if axes is None:
        fig, axes = plt.subplots(n_atoms, num='density',
                                 figsize=(8, 2 + n_atoms * 3))
    axes = np.atleast_1d(axes)

    if colors is None:
        colors = itertools.cycle(COLORS)
    for ax, activations, color in zip(axes.ravel(), z_hat_sum, colors):
        ax.clear()
        time_instants = np.arange(n_times_valid) / float(sfreq) + t_min
        selection = activations > threshold * z_hat_sum.max()
        n_elements = selection.sum()

        if n_elements == 0:
            ax.plot(time_instants, np.zeros_like(time_instants))
            continue

        # plot the activations as black dots
        if plot_activations:
            ax.plot(time_instants[selection],
                    activations[selection] / activations[selection].max(),
                    '.', color='k')

        window = np.blackman(bandwidth)
        smooth_activations = np.convolve(activations, window, 'same')
        ax.fill_between(time_instants, smooth_activations, color=color,
                        alpha=0.5)

    return axes
def transform(dayuserwords, userregionmap, ndays):
    """
    dayuserwords - a matrix containing a user per row grouped by day
    userregionmap - a dictionary of user index to region
    ndays - number of days in dayuserwords

    returns:
        regiondayuserword - sparse matrix containing users grouped by days
                            grouped by region
        regiondayworduser - sparse matrix containing words grouped by days
                            grouped by region
    """
    regionusermap = dict([(x - 1, []) for x in set(userregionmap.values())])
    for user, region in userregionmap.items():
        regionusermap[region - 1] += [user]
    N = ndays
    U = dayuserwords.shape[0] // N
    W = dayuserwords.shape[1]
    R = len(regionusermap)
    missing_users = array(list(set(range(U)) - set(userregionmap.keys())))
    logger.debug("Preparing Output Matrices")
    regiondayuserword = None
    regiondayworduser = None
    # dayuserwords_r = ssp.csr_matrix(dayuserwords)
    if not ssp.isspmatrix_lil(dayuserwords):
        logger.debug("The data array must be lil, transforming...")
        dayuserwords = dayuserwords.tolil()
    logger.debug("Filling (R x D x U, W) matrix")
    rows = []
    data = []
    for r in range(R):
        logger.debug("Starting region: %d" % r)
        rusers = set(regionusermap[r])
        for n in range(N):
            logger.debug("Starting day: %d" % n)
            for u in range(U):
                if u not in rusers:
                    rows += [[]]
                    data += [[]]
                else:
                    i = n * U + u
                    rows += [dayuserwords.rows[i]]
                    data += [dayuserwords.data[i]]
    regiondayuserword = ssp.lil_matrix((1, 1), dtype=dayuserwords.dtype)
    regiondayuserword.data = data
    regiondayuserword.rows = rows
    regiondayuserword._shape = (R * N * U, W)
    logger.debug("... cleaning up Filling (R x D x U, W) matrix")
    regiondayuserword = ssp.csr_matrix(regiondayuserword)
    logger.debug("Filling (R x D x W, U) matrix")
    rduw_x = regiondayuserword[:U, :]
    regiondayworduser = ssp.coo_matrix(rduw_x.T)
    for x in range(1, R * N):
        rduw_x = regiondayuserword[x * U:(x + 1) * U, :]
        regiondayworduser = ssp.vstack((regiondayworduser, rduw_x.T))
    logger.debug("... cleaning up Filling (R x D x W, U) matrix")
    regiondayworduser = ssp.csr_matrix(regiondayworduser)
    return regiondayuserword, regiondayworduser
def ispmatrixd(matrix, map_row=None, map_col=None):
    """ Iterator for iterating rows, columns and data for non-zero elements in a `scipy.sparse.*_matrix` (or `SparseCSR`)

    Parameters
    ----------
    matrix : scipy.sparse.sp_matrix
       the sparse matrix to iterate non-zero elements
    map_row : func, optional
       map each row entry through the function `map_row`, defaults to
       `None` which is equivalent to no mapping.
    map_col : func, optional
       map each column entry through the function `map_col`, defaults to
       `None` which is equivalent to no mapping.

    Yields
    ------
    int, int, <>
       the row, column and data of the non-zero elements
    """
    if map_row is None:
        map_row = lambda x: x
    if map_col is None:
        map_col = lambda x: x

    # Consider using the numpy nditer function for buffered iterations
    #it = np.nditer([geom.o2a(tmp.row), geom.o2a(tmp.col % geom.no), tmp.data],
    #               flags=['buffered'], op_flags=['readonly'])

    if isspmatrix_csr(matrix):
        for r in range(matrix.shape[0]):
            rr = map_row(r)
            for ind in range(matrix.indptr[r], matrix.indptr[r + 1]):
                yield rr, map_col(matrix.indices[ind]), matrix.data[ind]

    elif isspmatrix_lil(matrix):
        for r in range(matrix.shape[0]):
            rr = map_row(r)
            for c, m in zip(map_col(matrix.rows[r]), matrix.data[r]):
                yield rr, c, m

    elif isspmatrix_coo(matrix):
        for r, c, m in zip(map_row(matrix.row), map_col(matrix.col),
                           matrix.data):
            yield r, c, m

    elif isspmatrix_csc(matrix):
        for c in range(matrix.shape[1]):
            cc = map_col(c)
            for ind in range(matrix.indptr[c], matrix.indptr[c + 1]):
                yield map_row(matrix.indices[ind]), cc, matrix.data[ind]

    elif isinstance(matrix, SparseCSR):
        for r in range(matrix.shape[0]):
            rr = map_row(r)
            n = matrix.ncol[r]
            if n == 0:
                continue
            ptr = matrix.ptr[r]
            sl = slice(ptr, ptr + n, None)
            for c, d in zip(map_col(matrix.col[sl]), matrix._D[sl, :]):
                yield rr, c, d
    else:
        raise NotImplementedError(
            "The iterator for this sparse matrix has not been implemented")
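# --- Minimal usage sketch for ispmatrixd, assuming the isspmatrix_*
# predicates used above are imported from scipy.sparse.
from scipy.sparse import coo_matrix

m = coo_matrix(([1., 2.], ([0, 1], [1, 2])), shape=(2, 3))
for r, c, d in ispmatrixd(m, map_row=lambda r: r + 10):
    print(r, c, d)  # rows are shifted by the supplied map_row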
def _fix_connectivity(X, connectivity, affinity):
    """
    Fixes the connectivity matrix.

    The different steps are:

    - copies it
    - makes it symmetric
    - converts it to LIL if necessary
    - completes it if necessary.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix representing `n_samples` samples to be clustered.

    connectivity : sparse matrix, default=None
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is `None`, i.e, the Ward algorithm is unstructured.

    affinity : {"euclidean", "precomputed"}, default="euclidean"
        Which affinity to use. At the moment `precomputed` and `euclidean`
        are supported. `euclidean` uses the negative squared Euclidean
        distance between points.

    Returns
    -------
    connectivity : sparse matrix
        The fixed connectivity matrix.

    n_connected_components : int
        The number of connected components in the graph.
    """
    n_samples = X.shape[0]
    if connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples:
        raise ValueError(
            "Wrong shape for connectivity matrix: %s when X is %s"
            % (connectivity.shape, X.shape)
        )

    # Make the connectivity matrix symmetric:
    connectivity = connectivity + connectivity.T

    # Convert connectivity matrix to LIL
    if not sparse.isspmatrix_lil(connectivity):
        if not sparse.isspmatrix(connectivity):
            connectivity = sparse.lil_matrix(connectivity)
        else:
            connectivity = connectivity.tolil()

    # Compute the number of nodes
    n_connected_components, labels = connected_components(connectivity)

    if n_connected_components > 1:
        warnings.warn(
            "the number of connected components of the "
            "connectivity matrix is %d > 1. Completing it to avoid "
            "stopping the tree early." % n_connected_components,
            stacklevel=2,
        )
        # XXX: Can we do without completing the matrix?
        connectivity = _fix_connected_components(
            X=X,
            graph=connectivity,
            n_connected_components=n_connected_components,
            component_labels=labels,
            metric=affinity,
            mode="connectivity",
        )

    return connectivity, n_connected_components
def _update_z_multi_idx(X_i, D, reg, z0_i, debug, solver='l-bfgs',
                        solver_kwargs=dict(), freeze_support=False, loss='l2',
                        loss_params=dict(), timing=False):
    t_start = time.time()
    n_channels, n_times = X_i.shape
    if D.ndim == 2:
        n_atoms, n_channels_n_times_atom = D.shape
        n_times_atom = n_channels_n_times_atom - n_channels
    else:
        n_atoms, n_channels, n_times_atom = D.shape
    n_times_valid = n_times - n_times_atom + 1

    assert not (freeze_support and z0_i is None), 'Impossible !'

    if is_lil(z0_i) and solver != "lgcd":
        raise NotImplementedError()

    constants = {}
    if solver == "lgcd":
        constants['DtD'] = compute_DtD(D=D, n_channels=n_channels)
    init_timing = time.time() - t_start

    def func_and_grad(zi):
        return gradient_zi(Xi=X_i, zi=zi, D=D, constants=constants, reg=reg,
                           return_func=True, flatten=True, loss=loss,
                           loss_params=loss_params)

    if z0_i is None:
        f0 = np.zeros(n_atoms * n_times_valid)
    elif is_lil(z0_i):
        f0 = z0_i
    else:
        f0 = z0_i.reshape(n_atoms * n_times_valid)

    times, pobj = None, None
    if timing:
        times = [init_timing]
        pobj = [func_and_grad(f0)[0]]
        t_start = [time.time()]

    if solver == 'l-bfgs':
        if freeze_support:
            bounds = [(0, 0) if z == 0 else (0, None) for z in f0]
        else:
            bounds = BoundGenerator(n_atoms * n_times_valid)
        if timing:
            def callback(xk):
                times.append(time.time() - t_start[0])
                pobj.append(func_and_grad(xk)[0])
                # use a reference to have access inside this function
                t_start[0] = time.time()
        else:
            callback = None
        factr = solver_kwargs.get('factr', 1e15)  # default value
        maxiter = solver_kwargs.get('maxiter', 15000)  # default value
        z_hat, f, d = optimize.fmin_l_bfgs_b(
            func_and_grad, f0, fprime=None, args=(), approx_grad=False,
            bounds=bounds, factr=factr, maxiter=maxiter, callback=callback)

    elif solver in ("ista", "fista"):
        # Default args
        fista_kwargs = dict(
            max_iter=100, eps=None, verbose=0, restart=None,
            scipy_line_search=False,
            momentum=(solver == "fista"),
        )
        fista_kwargs.update(solver_kwargs)

        def objective(z_hat):
            return func_and_grad(z_hat)[0]

        def grad(z_hat):
            return func_and_grad(z_hat)[1]

        def prox(z_hat):
            return np.maximum(z_hat, 0.)

        output = fista(objective, grad, prox, None, f0,
                       adaptive_step_size=True, timing=timing,
                       name="Update z", **fista_kwargs)
        if timing:
            z_hat, pobj, times = output
            times[0] += init_timing
        else:
            z_hat, pobj = output

    elif solver == "lgcd":
        if not sparse.isspmatrix_lil(f0):
            f0 = f0.reshape(n_atoms, n_times_valid)

        # Default values
        tol = solver_kwargs.get('tol', 1e-1)
        n_seg = solver_kwargs.get('n_seg', 'auto')
        max_iter = solver_kwargs.get('max_iter', 1e15)
        strategy = solver_kwargs.get('strategy', 'greedy')
        output = _coordinate_descent_idx(
            X_i, D, constants, reg=reg, z0=f0, freeze_support=freeze_support,
            tol=tol, max_iter=max_iter, n_seg=n_seg, strategy=strategy,
            timing=timing, name="Update z")
        if timing:
            z_hat, pobj, times = output
            times[0] += init_timing
        else:
            z_hat = output

    else:
        raise ValueError("Unrecognized solver %s. Must be 'ista', 'fista',"
                         " 'l-bfgs', or 'lgcd'." % solver)

    if not is_lil(z_hat):
        z_hat = z_hat.reshape(n_atoms, n_times_valid)

    if loss == 'l2':
        if not is_lil(z_hat):
            ztz = compute_ztz(z_hat[None], n_times_atom)
            ztX = compute_ztX(z_hat[None], X_i[None])
        else:
            cython_code._assert_cython()
            ztz = cython_code._fast_compute_ztz([z_hat], n_times_atom)
            ztX = cython_code._fast_compute_ztX([z_hat], X_i[None])
    else:
        ztz, ztX = None, None

    return z_hat, ztz, ztX, pobj, times
def ispmatrix(matrix, map_row=None, map_col=None):
    """ Iterator for iterating rows and columns for non-zero elements in a `scipy.sparse.*_matrix` (or `SparseCSR`)

    If either `map_row` or `map_col` are not None the generator will only yield
    the unique values.

    Parameters
    ----------
    matrix : scipy.sparse.sp_matrix
       the sparse matrix to iterate non-zero elements
    map_row : func, optional
       map each row entry through the function `map_row`, defaults to
       `None` which is equivalent to no mapping.
    map_col : func, optional
       map each column entry through the function `map_col`, defaults to
       `None` which is equivalent to no mapping.

    Yields
    ------
    int, int
       the row, column indices of the non-zero elements
    """
    if map_row is None and map_col is None:
        # Skip unique checks
        for r, c in _ispmatrix_all(matrix):
            yield r, c
        return

    if map_row is None:
        map_row = lambda x: x
    if map_col is None:
        map_col = lambda x: x
    map_row = np.vectorize(map_row)
    map_col = np.vectorize(map_col)

    nrow = len(unique(map_row(arange(matrix.shape[0], dtype=np.int32))))
    ncol = len(unique(map_col(arange(matrix.shape[1], dtype=np.int32))))
    rows = zeros(nrow, dtype=np.bool_)
    cols = zeros(ncol, dtype=np.bool_)

    # Initialize the unique arrays
    rows[:] = False

    # Consider using the numpy nditer function for buffered iterations
    #it = np.nditer([geom.o2a(tmp.row), geom.o2a(tmp.col % geom.no), tmp.data],
    #               flags=['buffered'], op_flags=['readonly'])

    if isspmatrix_csr(matrix):
        for r in range(matrix.shape[0]):
            rr = map_row(r)
            if rows[rr]:
                continue
            rows[rr] = True
            cols[:] = False
            for ind in range(matrix.indptr[r], matrix.indptr[r + 1]):
                c = map_col(matrix.indices[ind])
                if cols[c]:
                    continue
                cols[c] = True
                yield rr, c

    elif isspmatrix_lil(matrix):
        for r in range(matrix.shape[0]):
            rr = map_row(r)
            if rows[rr]:
                continue
            rows[rr] = True
            cols[:] = False
            if len(matrix.rows[r]) == 0:
                continue
            for c in map_col(matrix.rows[r]):
                if cols[c]:
                    continue
                cols[c] = True
                yield rr, c

    elif isspmatrix_coo(matrix):
        raise ValueError(
            "mapping and unique returns are not implemented for COO matrix")

    elif isspmatrix_csc(matrix):
        raise ValueError(
            "mapping and unique returns are not implemented for CSC matrix")

    elif isinstance(matrix, SparseCSR):
        for r in range(matrix.shape[0]):
            rr = map_row(r)
            if rows[rr]:
                continue
            rows[rr] = True
            cols[:] = False
            n = matrix.ncol[r]
            if n == 0:
                continue
            ptr = matrix.ptr[r]
            for c in map_col(matrix.col[ptr:ptr + n]):
                if cols[c]:
                    continue
                cols[c] = True
                yield rr, c
    else:
        raise NotImplementedError(
            "The iterator for this sparse matrix has not been implemented")
def ward_tree(X, connectivity=None, n_components=None, copy=True,
              n_clusters=None):
    """Ward clustering based on a Feature matrix.

    Recursively merges the pair of clusters that minimally increases
    within-cluster variance.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        feature matrix representing n_samples samples to be clustered

    connectivity : sparse matrix
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e, the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.

    copy : bool (optional)
        Make a copy of connectivity or work inplace. If connectivity
        is not of LIL type there will be a copy in any case.

    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.

    Returns
    -------
    children : 2D array, shape (n_nodes, 2)
        The children of each non-leaf node. Values less than `n_samples`
        refer to leaves of the tree. A greater value `i` indicates a node
        with children `children[i - n_samples]`.

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, elsewhere 'None' is returned.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    if connectivity is None:
        if n_clusters is not None:
            warnings.warn('Early stopping is implemented only for '
                          'structured Ward clustering (i.e. with '
                          'explicit connectivity).', stacklevel=2)
        out = hierarchy.ward(X)
        children_ = out[:, :2].astype(np.intp)
        return children_, 1, n_samples, None

    # Compute the number of nodes
    if n_components is None:
        n_components, labels = cs_graph_components(connectivity)

    # Convert connectivity matrix to LIL with a copy if needed
    if sparse.isspmatrix_lil(connectivity) and copy:
        connectivity = connectivity.copy()
    elif not sparse.isspmatrix(connectivity):
        connectivity = sparse.lil_matrix(connectivity)
    else:
        connectivity = connectivity.tolil()

    if n_components > 1:
        warnings.warn("the number of connected components of the "
                      "connectivity matrix is %d > 1. Completing it to avoid "
                      "stopping the tree early." % n_components)
        connectivity = _fix_connectivity(X, connectivity,
                                         n_components, labels)
        n_components = 1

    if n_clusters is None:
        n_nodes = 2 * n_samples - n_components
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    if (connectivity.shape[0] != n_samples or
            connectivity.shape[1] != n_samples):
        raise ValueError('Wrong shape for connectivity matrix: %s '
                         'when X is %s' % (connectivity.shape, X.shape))

    # create inertia matrix
    coord_row = []
    coord_col = []
    A = []
    for ind, row in enumerate(connectivity.rows):
        A.append(row)
        # We keep only the upper triangular for the moments
        # Generator expressions are faster than arrays on the following
        row = [i for i in row if i < ind]
        coord_row.extend(len(row) * [ind, ])
        coord_col.extend(row)

    coord_row = np.array(coord_row, dtype=np.intp)
    coord_col = np.array(coord_col, dtype=np.intp)

    # build moments as a list
    moments_1 = np.zeros(n_nodes)
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features))
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=np.float64)
    _hierarchical.compute_ward_dist(moments_1, moments_2,
                                    coord_row, coord_col, inertia)
    inertia = list(zip(inertia, coord_row, coord_col))
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    heights = np.zeros(n_nodes)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []

    not_visited = np.empty(n_nodes, dtype=np.int8)

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j], heights[k] = k, k, inert
        children.append([i, j])
        used_node[i] = used_node[j] = False

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        coord_col = []
        not_visited.fill(1)
        not_visited[k] = 0
        _hierarchical._get_parents(A[i], coord_col, parent, not_visited)
        _hierarchical._get_parents(A[j], coord_col, parent, not_visited)
        # List comprehension is faster than a for loop
        [A[l].append(k) for l in coord_col]
        A.append(coord_col)
        coord_col = np.array(coord_col, dtype=np.intp)
        coord_row = np.empty_like(coord_col)
        coord_row.fill(k)
        n_additions = len(coord_row)
        ini = np.empty(n_additions, dtype=np.float64)
        _hierarchical.compute_ward_dist(moments_1, moments_2,
                                        coord_row, coord_col, ini)
        # List comprehension is faster than a for loop
        [heappush(inertia, (ini[idx], k, coord_col[idx]))
         for idx in range(n_additions)]

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    children = np.array(children)  # return numpy array for efficient caching

    return children, n_components, n_leaves, parent
def add(self, other, in_place=True, write_to_self=False):
    """
    Add a matrix. The sum of self._raw_matrix with the passed StateMatrix
    (other).

    Args:
        other: another StateMatrix object of the same type as this object
        in_place: If True, matrix addition is applied (in-place) to (self)
            If False, a new copy will be returned.

    Returns:
        The sum of self with the passed StateMatrix (other).
    """
    if write_to_self:
        # update the reference matrix inside this object.
        if not in_place:
            result_mat = self.copy()
        else:
            result_mat = self
        if isinstance(other, (StateMatrixNumpy, self.__class__)):
            source_matrix = other
            source_matrix_ref = other._raw_matrix
        elif isinstance(other, np.ndarray):
            source_matrix = other
            source_matrix_ref = other
        else:
            raise TypeError("matrix has to be either 'StateMatrixNumpy', "
                            "or 'StateMatrixSpSciPy', or 'np.ndarray'")
    else:
        # the target is the input matrix or a copy of it
        if not in_place:
            result_mat = other.copy()
        else:
            result_mat = other
        source_matrix = self
        source_matrix_ref = self._raw_matrix

    # Check the result matrix format and rewrap the sum in the same
    # sparse format as the target's raw matrix
    if isinstance(result_mat, self.__class__):
        result_mat_ref = result_mat.get_raw_matrix_ref()
        if sparse.isspmatrix_bsr(result_mat._raw_matrix):
            result_mat_ref = sparse.bsr_matrix(result_mat_ref + source_matrix_ref)
        elif sparse.isspmatrix_coo(result_mat._raw_matrix):
            result_mat_ref = sparse.coo_matrix(result_mat_ref + source_matrix_ref)
        elif sparse.isspmatrix_csc(result_mat._raw_matrix):
            result_mat_ref = sparse.csc_matrix(result_mat_ref + source_matrix_ref)
        elif sparse.isspmatrix_csr(result_mat._raw_matrix):
            result_mat_ref = sparse.csr_matrix(result_mat_ref + source_matrix_ref)
        elif sparse.isspmatrix_dia(result_mat._raw_matrix):
            result_mat_ref = sparse.dia_matrix(result_mat_ref + source_matrix_ref)
        elif sparse.isspmatrix_dok(result_mat._raw_matrix):
            result_mat_ref = sparse.dok_matrix(result_mat_ref + source_matrix_ref)
        elif sparse.isspmatrix_lil(result_mat._raw_matrix):
            result_mat_ref = sparse.lil_matrix(result_mat_ref + source_matrix_ref)
        else:
            raise TypeError("Unsupported Format! My format has been tampered with!")
        result_mat.set_raw_matrix_ref(result_mat_ref)
        result_mat._update_attributes()

    elif isinstance(result_mat, StateMatrixNumpy):
        result_mat_ref = result_mat.get_raw_matrix_ref()
        if isinstance(source_matrix, self.__class__):
            result_mat_ref = result_mat_ref + source_matrix_ref
            try:
                result_mat_ref = result_mat_ref.toarray()
            except AttributeError:
                result_mat_ref = np.asarray(result_mat_ref)
        elif isinstance(source_matrix, (np.ndarray, StateMatrixNumpy)):
            result_mat_ref = result_mat_ref + source_matrix_ref
        result_mat.set_raw_matrix_ref(result_mat_ref)

    elif isinstance(result_mat, np.ndarray):
        result_mat_ref = result_mat
        if isinstance(source_matrix, self.__class__):
            result_mat_ref = result_mat_ref + source_matrix_ref
            try:
                result_mat_ref = result_mat_ref.toarray()
            except AttributeError:
                result_mat_ref = np.asarray(result_mat_ref)
        elif isinstance(source_matrix, (np.ndarray, StateMatrixNumpy)):
            result_mat_ref = result_mat_ref + source_matrix_ref

    else:
        raise TypeError("matrix has to be either 'StateMatrixNumpy', "
                        "or 'StateMatrixSpSciPy', or 'np.ndarray'")

    return result_mat
def is_list_of_lil(z):
    return isinstance(z, list) and sparse.isspmatrix_lil(z[0])
def is_lil(z):
    return sparse.isspmatrix_lil(z)
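# --- Minimal usage sketch for is_lil / is_list_of_lil, assuming
# `from scipy import sparse` as used by the helpers above.
from scipy import sparse

z = sparse.lil_matrix((2, 2))
print(is_lil(z))               # True
print(is_list_of_lil([z, z]))  # True
print(is_lil(z.tocsr()))       # False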