Exemple #1
0
def update_distance_matrices(refList,
                             distMat,
                             queryList=None,
                             query_ref_distMat=None,
                             query_query_distMat=None,
                             threads=1):
    """Convert distances from long form (1 matrix with n_comparisons rows and 2 columns)
    to a square form (2 NxN matrices), with merging of query distances if necessary.

    Column 0 of each long form matrix is treated as the core distance and
    column 1 as the accessory distance.

    Args:
        refList (list)
            List of references
        distMat (numpy.array)
            Two column long form list of core and accessory distances
            for pairwise comparisons between reference db sequences
        queryList (list)
            List of queries. If None, only the reference distances are
            converted and the two query matrices are not read.
        query_ref_distMat (numpy.array)
            Two column long form list of core and accessory distances
            for pairwise comparisons between queries and reference db
            sequences. Read (indexed) whenever queryList is given.
        query_query_distMat (numpy.array)
            Two column long form list of core and accessory distances
            for pairwise comparisons between query sequences. Read
            (indexed) whenever queryList is given.
        threads (int)
            Number of threads to use

    Returns:
        seqLabels (list)
            Combined list of reference and query sequences. When queryList
            is None this is the refList object itself, not a copy.
        coreMat (numpy.array)
            NxN array of core distances for N sequences
        accMat (numpy.array)
            NxN array of accessory distances for N sequences
    """
    # Identity test rather than ``== None``: an equality test is evaluated
    # element-wise on array-like inputs and cannot be used as a condition.
    if queryList is None:
        seqLabels = refList
        coreMat = pp_sketchlib.longToSquare(distMat[:, [0]], threads)
        accMat = pp_sketchlib.longToSquare(distMat[:, [1]], threads)
    else:
        seqLabels = refList + queryList
        coreMat = pp_sketchlib.longToSquareMulti(distMat[:, [0]],
                                                 query_ref_distMat[:, [0]],
                                                 query_query_distMat[:, [0]],
                                                 threads)
        accMat = pp_sketchlib.longToSquareMulti(distMat[:, [1]],
                                                query_ref_distMat[:, [1]],
                                                query_query_distMat[:, [1]],
                                                threads)

    return seqLabels, coreMat, accMat
Exemple #2
0
    def fit(self, X, accessory):
        '''Extends :func:`~ClusterFit.fit`

        Gets assignments by using nearest neighbours.

        For every rank in ``self.ranks`` a sparse nearest-neighbour distance
        matrix is built and stored in ``self.nn_dists[rank]``; the model is
        then marked as fitted.

        Args:
            X (numpy.array)
                The core and accessory distances to cluster. Must be set if
                preprocess is set.
            accessory (bool)
                Use accessory rather than core distances

        Returns:
            y (numpy.array)
                Cluster assignments of samples in X
                (the result of ``self.assign`` for the smallest rank)

        Raises:
            SystemExit
                With a non-zero status if the largest rank is not smaller
                than the number of samples
        '''
        ClusterFit.fit(self, X)
        # Inverts m = n * (n - 1) / 2, i.e. treats X as having one row per
        # pair of samples, to recover the number of samples n
        sample_size = int(round(0.5 * (1 + np.sqrt(1 + 8 * X.shape[0]))))
        if max(self.ranks) >= sample_size:
            sys.stderr.write("Rank must be less than the number of samples\n")
            # This is a failure, so report it with a non-zero exit status
            sys.exit(1)

        # Column of X to use: 1 for accessory distances, 0 for core
        if accessory:
            self.dist_col = 1
        else:
            self.dist_col = 0

        self.nn_dists = {}
        for rank in self.ranks:
            # NOTE(review): the square matrix does not depend on rank and
            # could be built once before the loop, provided sparsifyDists
            # does not modify its input - confirm before hoisting
            row, col, data = \
                pp_sketchlib.sparsifyDists(
                    pp_sketchlib.longToSquare(X[:, [self.dist_col]], self.threads),
                    0,
                    rank
                )
            # Raise any distance below epsilon to epsilon (presumably so
            # that zero distances remain explicit entries in the sparse
            # matrix - confirm)
            data = [epsilon if d < epsilon else d for d in data]
            if self.use_gpu:
                self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix(
                    (cp.array(data), (cp.array(row), cp.array(col))),
                    shape=(sample_size, sample_size),
                    dtype=X.dtype)
            else:
                self.nn_dists[rank] = scipy.sparse.coo_matrix(
                    (data, (row, col)),
                    shape=(sample_size, sample_size),
                    dtype=X.dtype)

        self.fitted = True

        y = self.assign(min(self.ranks))
        return y
Exemple #3
0
    def extend(self, qqDists, qrDists):
        '''Add query samples to the stored nearest-neighbour matrices.

        For each rank, the stored matrix is combined with the query-ref and
        query-query distances into one larger square matrix, the rank is
        re-applied to every row, and the result replaces
        ``self.nn_dists[rank]``.

        Args:
            qqDists (numpy.array)
                Long form query-query distances; column ``self.dist_col``
                is used
            qrDists (numpy.array)
                Long form query-ref distances; column ``self.dist_col`` is
                reshaped to (n_query, n_ref)

        Returns:
            y
                The result of ``self.assign`` for the smallest rank
        '''
        # Query-query block in square form, floored at epsilon
        query_block = pp_sketchlib.longToSquare(qqDists[:, [self.dist_col]],
                                                self.threads)
        query_block[query_block < epsilon] = epsilon

        # Query-ref block as an (n_query, n_ref) rectangle, floored at epsilon
        n_ref = self.nn_dists[self.ranks[0]].shape[0]
        n_query = query_block.shape[1]
        cross_block = qrDists[:, [self.dist_col]].reshape(n_query, n_ref)
        cross_block[cross_block < epsilon] = epsilon

        for rank in self.ranks:
            previous = self.nn_dists[rank]

            # Block layout of the enlarged matrix:
            #   [ existing     | query-ref^T ]
            #   [ query-ref    | query-query ]
            layout = [[previous, cross_block.transpose()],
                      [cross_block, query_block]]
            if self.use_gpu:
                full_mat = cupyx.scipy.sparse.bmat(layout,
                                                   format='csr',
                                                   dtype=previous.dtype)
            else:
                full_mat = scipy.sparse.bmat(layout,
                                             format='csr',
                                             dtype=previous.dtype)

            # Re-apply the rank row by row, collecting COO triplets
            kept_dist = []
            kept_row = []
            kept_col = []
            for row_idx in range(full_mat.shape[0]):
                sample_row = full_mat.getrow(row_idx)
                if self.use_gpu:
                    _, nbr_cols, nbr_dists = cupyx.scipy.sparse.find(
                        sample_row)
                else:
                    _, nbr_cols, nbr_dists = scipy.sparse.find(sample_row)
                nbr_dists = [epsilon if d < epsilon else d for d in nbr_dists]

                # Identical to C++ code in matrix_ops.cpp:sparsify_dists
                neighbours = 0
                prev_val = -1
                for sort_idx in np.argsort(nbr_dists):
                    # Skip the sample's own entry
                    if row_idx == nbr_cols[sort_idx]:
                        continue
                    # True when this distance ties with the last one kept
                    is_tie = abs(nbr_dists[sort_idx] - prev_val) < epsilon
                    # Stop once the rank is filled, unless this is a tie
                    if neighbours >= rank and not is_tie:
                        break
                    kept_dist.append(nbr_dists[sort_idx])
                    kept_row.append(row_idx)
                    kept_col.append(nbr_cols[sort_idx])
                    if not is_tie:
                        neighbours += 1
                        prev_val = kept_dist[-1]

            square_shape = (full_mat.shape[0], full_mat.shape[0])
            if self.use_gpu:
                self.nn_dists[rank] = cupyx.scipy.sparse.coo_matrix(
                    (cp.array(kept_dist),
                     (cp.array(kept_row), cp.array(kept_col))),
                    shape=square_shape,
                    dtype=previous.dtype)
            else:
                self.nn_dists[rank] = scipy.sparse.coo_matrix(
                    (kept_dist, (kept_row, kept_col)),
                    shape=square_shape,
                    dtype=previous.dtype)

        return self.assign(min(self.ranks))
Exemple #4
0
        elif in_tri == 0:
            boundary_test[row] = 0
    return(boundary_test)

def check_res(res, expected):
    """Check that ``res`` matches ``expected`` element-wise.

    Returns None when every element compares equal. Otherwise prints both
    values and raises RuntimeError.
    """
    if np.all(res == expected):
        return
    print(res)
    print(expected)
    raise RuntimeError("Results don't match")

# Fixtures for converting between long and square distance forms
# rr_mat: 6 long form ref-ref distances
rr_mat = np.array([1, 2, 3, 4, 5, 6], dtype=np.float32)
# qq_mat: a single long form query-query distance
qq_mat = np.array([8], dtype=np.float32)
# qr_mat: 8 long form query-ref distances
qr_mat = np.array([10, 20, 10, 20, 10, 20, 10, 20], dtype=np.float32)

# NOTE(review): the second argument to longToSquare is presumably the thread
# count - confirm; longToSquareMulti is called without one here
square1 = pp_sketchlib.longToSquare(rr_mat, 2)
square2 = pp_sketchlib.longToSquareMulti(rr_mat, qr_mat, qq_mat)

# Expected square form of rr_mat: 4x4, symmetric, zero diagonal, with the
# long form values filling the upper triangle row by row
square1_res = np.array([[0, 1, 2, 3],
                        [1, 0, 4, 5],
                        [2, 4, 0, 6],
                        [3, 5, 6, 0]], dtype=np.float32)


# Expected combined 6x6 matrix: the top-left 4x4 block equals square1_res,
# rows/columns 4 and 5 hold the qr_mat values (10 and 20), and the single
# qq_mat value (8) links rows/columns 4 and 5
square2_res = np.array([[0, 1, 2, 3, 10, 20],
                        [1, 0, 4, 5, 10, 20],
                        [2, 4, 0, 6, 10, 20],
                        [3, 5, 6, 0, 10, 20],
                        [10, 10, 10, 10, 0, 8],
                        [20, 20, 20, 20, 8, 0]], dtype=np.float32)