Esempio n. 1
0
    def setUp(self):
        # Seed the RNG first so the hash functions come out the same
        # on every run
        random.seed(0)

        kmer_size = 3
        self.family = lsh.MinHashFamily(kmer_size)
        self.dist_thres = 0.5

        def kmer_set(seq):
            # Distinct substrings of length kmer_size found in seq
            return {
                seq[start:(start + kmer_size)]
                for start in range(len(seq) - kmer_size + 1)
            }

        def f(a, b):
            # Jaccard distance (1 minus Jaccard similarity) between the
            # k-mer sets of a and b
            a_kmers = kmer_set(a)
            b_kmers = kmer_set(b)
            num_shared = len(a_kmers & b_kmers)
            num_total = len(a_kmers | b_kmers)
            return 1.0 - float(num_shared) / num_total

        self.dist_fn = f
Esempio n. 2
0
    def __init__(self, dist_thres, kmer_size=10):
        """
        Args:
            dist_thres: two probes are only called near-duplicates when
                their Jaccard distance (1 minus Jaccard similarity) is
                within this value; each probe sequence is treated as a
                set of k-mers, and the Jaccard similarity measures the
                overlap between those sets
            kmer_size: length of each k-mer to use with MinHash; note
                that this is *not* the same as self.k
        """
        super().__init__(k=3)
        self.lsh_family = lsh.MinHashFamily(kmer_size)
        self.dist_thres = dist_thres

        def kmer_set(seq):
            # Distinct substrings of length kmer_size found in seq
            return {
                seq[start:(start + kmer_size)]
                for start in range(len(seq) - kmer_size + 1)
            }

        def jaccard_dist(a, b):
            # 1 minus the fraction of k-mers shared by a and b out of
            # all k-mers present in either
            a_kmers = kmer_set(a)
            b_kmers = kmer_set(b)
            num_shared = len(a_kmers & b_kmers)
            num_total = len(a_kmers | b_kmers)
            return 1.0 - float(num_shared) / num_total

        self.dist_fn = jaccard_dist
Esempio n. 3
0
    def setUp(self):
        # Fix the random seed before building the family so the hash
        # functions are always the same
        random.seed(0)

        kmer_size = 3
        self.family = lsh.MinHashFamily(kmer_size)
Esempio n. 4
0
def cluster_with_minhash_signatures(seqs, k=12, N=100, threshold=0.1):
    """Cluster sequences based on their MinHash signatures.

    Args:
        seqs: dict mapping sequence header to sequences
        k: k-mer size to use for k-mer hashes (smaller is likely more
            sensitive for divergent genomes, but may lead to false positives
            in determining which genomes are close)
        N: number of hash values to use in a signature (higher is slower for
            clustering, but likely more sensitive for divergent genomes)
        threshold: maximum inter-cluster distance to merge clusters, in
            average nucleotide dissimilarity (1-ANI, where ANI is
            average nucleotide identity); higher results in fewer
            clusters

    Returns:
        list c such that c[i] gives a collection of sequence headers
        in the same cluster, and the clusters in c are sorted
        in descending order of size; when seqs holds fewer than two
        sequences, each sequence (if any) is returned as its own cluster
    """
    num_seqs = len(seqs)

    # With fewer than two sequences there is no pairwise distance to
    # compute, so the condensed distance matrix would be empty; answer
    # directly rather than handing an empty matrix to the clustering step.
    # NOTE(review): assumes cluster_from_dist_matrix cannot handle an
    # empty matrix (as with scipy's linkage) -- confirm
    if num_seqs < 2:
        return [[name] for name in seqs]

    logger.info(("Producing signatures of %d sequences"), num_seqs)
    family = lsh.MinHashFamily(k, N=N)
    signatures_map = make_signatures_with_minhash(family, seqs)

    # Map each sequence header to an index (0-based), and get
    # the signature for the corresponding index
    seq_headers = list(seqs)
    signatures = [signatures_map[name] for name in seq_headers]

    # Eq. 4 of the Mash paper (Ondov et al. 2016) shows that the
    # Mash distance, which is shown to be closely related to 1-ANI, is:
    #  D = (-1/k) * ln(2*j/(1+j))
    # where j is a Jaccard similarity. Solving for j:
    #  j = 1/(2*exp(k*D) - 1)
    # So, for a desired distance D in terms of 1-ANI, the corresponding
    # Jaccard distance is:
    #  1.0 - 1/(2*exp(k*D) - 1)
    # We can use this to calculate a clustering threshold in terms of
    # Jaccard distance
    jaccard_dist_threshold = 1.0 - 1.0 / (2.0 * np.exp(k * threshold) - 1)

    def jaccard_dist(i, j):
        # Return estimated Jaccard dist between signatures at
        # index i and index j
        return family.estimate_jaccard_dist(signatures[i], signatures[j])

    logger.info(("Creating condensed distance matrix of %d sequences"),
                num_seqs)
    dist_matrix = create_condensed_dist_matrix(num_seqs, jaccard_dist)
    logger.info(
        ("Clustering %d sequences at Jaccard distance threshold of %f"),
        num_seqs, jaccard_dist_threshold)
    clusters = cluster_from_dist_matrix(dist_matrix, jaccard_dist_threshold)

    # Translate each cluster's indices back into sequence headers
    return [
        [seq_headers[i] for i in cluster_idxs]
        for cluster_idxs in clusters
    ]