def update_distance_matrices(refList, distMat, queryList=None,
                             query_ref_distMat=None,
                             query_query_distMat=None, threads=1):
    """Convert distances from long form to square form.

    Long form is one matrix with n_comparisons rows and 2 columns
    (core, accessory); square form is two NxN matrices. Query distances
    are merged in when a query list is given.

    Args:
        refList (list)
            List of references
        distMat (numpy.array)
            Two column long form list of core and accessory distances
            for pairwise comparisons between reference db sequences
        queryList (list)
            List of queries
        query_ref_distMat (numpy.array)
            Two column long form list of core and accessory distances
            for pairwise comparisons between queries and reference db
            sequences. Required when queryList is given.
        query_query_distMat (numpy.array)
            Two column long form list of core and accessory distances
            for pairwise comparisons between query sequences. Required
            when queryList is given.
        threads (int)
            Number of threads to use

    Returns:
        seqLabels (list)
            Combined list of reference and query sequences
        coreMat (numpy.array)
            NxN array of core distances for N sequences
        accMat (numpy.array)
            NxN array of accessory distances for N sequences

    Raises:
        ValueError
            If queryList is given without both query distance matrices.
    """
    if queryList is None:
        # Reference-only: the labels are the reference list itself (no copy),
        # and each distance column is expanded to a square matrix on its own.
        seqLabels = refList
        coreMat = pp_sketchlib.longToSquare(distMat[:, [0]], threads)
        accMat = pp_sketchlib.longToSquare(distMat[:, [1]], threads)
    else:
        # Fail with a clear message rather than an opaque
        # "'NoneType' object is not subscriptable" further down.
        if query_ref_distMat is None or query_query_distMat is None:
            raise ValueError(
                "query_ref_distMat and query_query_distMat must be provided "
                "when queryList is given")
        # Concatenation builds a new list, so refList is left unmodified.
        seqLabels = refList + queryList
        coreMat = pp_sketchlib.longToSquareMulti(distMat[:, [0]],
                                                 query_ref_distMat[:, [0]],
                                                 query_query_distMat[:, [0]],
                                                 threads)
        accMat = pp_sketchlib.longToSquareMulti(distMat[:, [1]],
                                                query_ref_distMat[:, [1]],
                                                query_query_distMat[:, [1]],
                                                threads)

    return seqLabels, coreMat, accMat
def fit(self, X, accessory):
    '''Extends :func:`~ClusterFit.fit`

    Gets assignments by using nearest neighbours.

    Args:
        X (numpy.array)
            The core and accessory distances to cluster. Must be set if
            preprocess is set.
        accessory (bool)
            Use accessory rather than core distances

    Returns:
        y (numpy.array)
            Cluster assignments of samples in X
    '''
    ClusterFit.fit(self, X)

    # Solve m = n * (n - 1) / 2 for n, where m is the number of rows of X,
    # to recover the side length of the square distance matrix.
    sample_size = int(round(0.5 * (1 + np.sqrt(1 + 8 * X.shape[0]))))
    if max(self.ranks) >= sample_size:
        sys.stderr.write("Rank must be less than the number of samples\n")
        # Non-zero status so that calling scripts can detect the failure
        sys.exit(1)

    # Column 0 holds core distances, column 1 accessory distances
    self.dist_col = 1 if accessory else 0

    # The dense square matrix does not depend on rank, so build it once
    # rather than once per rank.
    # NOTE(review): relies on sparsifyDists leaving its input unmodified --
    # confirm against the pp_sketchlib binding.
    square_dists = pp_sketchlib.longToSquare(X[:, [self.dist_col]],
                                             self.threads)

    self.nn_dists = {}
    for rank in self.ranks:
        row, col, data = pp_sketchlib.sparsifyDists(square_dists, 0, rank)
        # Raise every stored distance to at least epsilon before it goes
        # into the sparse matrix.
        data = [epsilon if d < epsilon else d for d in data]
        if self.use_gpu:
            self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix(
                (cp.array(data), (cp.array(row), cp.array(col))),
                shape=(sample_size, sample_size),
                dtype=X.dtype)
        else:
            self.nn_dists[rank] = scipy.sparse.coo_matrix(
                (data, (row, col)),
                shape=(sample_size, sample_size),
                dtype=X.dtype)

    self.fitted = True

    # Assignments are reported for the smallest rank
    y = self.assign(min(self.ranks))
    return y
def extend(self, qqDists, qrDists):
    '''Add query samples to the stored nearest-neighbour matrices.

    For every rank in self.ranks, the existing sparse matrix in
    self.nn_dists is combined with the query-reference and query-query
    distances into one larger square matrix, the rank limit is reapplied
    row by row, and self.nn_dists[rank] is replaced by the result.

    Reads self.dist_col and self.nn_dists, which :func:`fit` sets.

    Args:
        qqDists (numpy.array)
            Long form distances between queries; column self.dist_col
            is expanded to a square matrix.
        qrDists (numpy.array)
            Long form distances between queries and references; column
            self.dist_col is reshaped to (n_query, n_ref).
            NOTE(review): this assumes rows are ordered query-major --
            confirm against the caller.

    Returns:
        y
            The result of self.assign() for the smallest rank
    '''
    # Reshape qq and qr dist matrices
    qqSquare = pp_sketchlib.longToSquare(qqDists[:, [self.dist_col]],
                                         self.threads)
    # Raise every value below epsilon (including the diagonal) to epsilon
    qqSquare[qqSquare < epsilon] = epsilon

    # Side length of the current (pre-extension) matrix
    n_ref = self.nn_dists[self.ranks[0]].shape[0]
    n_query = qqSquare.shape[1]
    qrRect = qrDists[:, [self.dist_col]].reshape(n_query, n_ref)
    qrRect[qrRect < epsilon] = epsilon

    for rank in self.ranks:
        # Add the matrices together to make a large square matrix:
        #   [ existing ref-ref | query-ref transposed ]
        #   [ query-ref        | query-query          ]
        if self.use_gpu:
            full_mat = cupyx.scipy.sparse.bmat(
                [[self.nn_dists[rank], qrRect.transpose()],
                 [qrRect, qqSquare]],
                format='csr',
                dtype=self.nn_dists[rank].dtype)
        else:
            full_mat = scipy.sparse.bmat(
                [[self.nn_dists[rank], qrRect.transpose()],
                 [qrRect, qqSquare]],
                format='csr',
                dtype=self.nn_dists[rank].dtype)

        # Reapply the rank to each row, using sparse matrix functions
        data = []
        row = []
        col = []
        for row_idx in range(full_mat.shape[0]):
            sample_row = full_mat.getrow(row_idx)
            # Column indices and values of the entries stored in this row
            # (dist_row is not used afterwards)
            if self.use_gpu:
                dist_row, dist_col, dist = cupyx.scipy.sparse.find(
                    sample_row)
            else:
                dist_row, dist_col, dist = scipy.sparse.find(sample_row)
            dist = [epsilon if d < epsilon else d for d in dist]
            # Visit this row's entries from smallest to largest distance
            dist_idx_sort = np.argsort(dist)

            # Identical to C++ code in matrix_ops.cpp:sparsify_dists
            neighbours = 0
            prev_val = -1
            for sort_idx in dist_idx_sort:
                # Skip the self-comparison
                if row_idx == dist_col[sort_idx]:
                    continue
                # Despite its name, new_val is True when this distance is
                # within epsilon of the last one counted, i.e. it is a tie.
                new_val = abs(dist[sort_idx] - prev_val) < epsilon
                # Keep entries until `rank` distinct distances have been
                # counted; ties with the last counted distance are kept too.
                if (neighbours < rank or new_val):
                    data.append(dist[sort_idx])
                    row.append(row_idx)
                    col.append(dist_col[sort_idx])
                    if not new_val:
                        # Only a distinct distance counts towards the rank
                        # and becomes the new comparison value.
                        neighbours += 1
                        prev_val = data[-1]
                else:
                    # Entries are sorted, so nothing further can qualify
                    break

        # Replace the stored matrix for this rank with the extended one
        if self.use_gpu:
            self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix(
                (cp.array(data), (cp.array(row), cp.array(col))),
                shape=(full_mat.shape[0], full_mat.shape[0]),
                dtype=self.nn_dists[rank].dtype)
        else:
            self.nn_dists[rank] = scipy.sparse.coo_matrix(
                (data, (row, col)),
                shape=(full_mat.shape[0], full_mat.shape[0]),
                dtype=self.nn_dists[rank].dtype)

    # Assignments are reported for the smallest rank
    y = self.assign(min(self.ranks))
    return y
elif in_tri == 0: boundary_test[row] = 0 return(boundary_test) def check_res(res, expected): if (not np.all(res == expected)): print(res) print(expected) raise RuntimeError("Results don't match") # Square to long rr_mat = np.array([1, 2, 3, 4, 5, 6], dtype=np.float32) qq_mat = np.array([8], dtype=np.float32) qr_mat = np.array([10, 20, 10, 20, 10, 20, 10, 20], dtype=np.float32) square1 = pp_sketchlib.longToSquare(rr_mat, 2) square2 = pp_sketchlib.longToSquareMulti(rr_mat, qr_mat, qq_mat) square1_res = np.array([[0, 1, 2, 3], [1, 0, 4, 5], [2, 4, 0, 6], [3, 5, 6, 0]], dtype=np.float32) square2_res = np.array([[0, 1, 2, 3, 10, 20], [1, 0, 4, 5, 10, 20], [2, 4, 0, 6, 10, 20], [3, 5, 6, 0, 10, 20], [10, 10, 10, 10, 0, 8], [20, 20, 20, 20, 8, 0]], dtype=np.float32)