def setUp(self):
    """Prepare a seeded MinHash family and an exact Jaccard distance function.

    Sets:
        self.family: MinHash family built with a k-mer size of 3
        self.dist_thres: distance threshold of 0.5
        self.dist_fn: function giving the exact Jaccard distance
            (1 minus Jaccard similarity) between the k-mer sets of
            two sequences
    """
    # Seed the RNG before building the family so that the hash
    # functions are always the same
    random.seed(0)

    kmer_size = 3
    self.family = lsh.MinHashFamily(kmer_size)
    self.dist_thres = 0.5

    def kmer_set(seq):
        # Distinct length-kmer_size substrings of seq
        return {seq[start:(start + kmer_size)]
                for start in range(len(seq) - kmer_size + 1)}

    def f(a, b):
        # Exact Jaccard distance between the k-mer sets of a and b
        kmers_a = kmer_set(a)
        kmers_b = kmer_set(b)
        num_shared = len(kmers_a & kmers_b)
        num_total = len(kmers_a | kmers_b)
        return 1.0 - float(num_shared) / num_total

    self.dist_fn = f
def __init__(self, dist_thres, kmer_size=10):
    """
    Args:
        dist_thres: only call two probes near-duplicates if their
            Jaccard distance (1 minus Jaccard similarity) is within
            this value; the Jaccard similarity is measured by treating
            each probe sequence as a set of k-mers and measuring the
            overlap of those k-mers
        kmer_size: the length of each k-mer to use with MinHash; note
            that this is *not* the same as self.k
    """
    super().__init__(k=3)

    self.lsh_family = lsh.MinHashFamily(kmer_size)
    self.dist_thres = dist_thres

    def kmers_of(seq):
        # Distinct k-mers (length kmer_size) contained in seq
        return {seq[pos:(pos + kmer_size)]
                for pos in range(len(seq) - kmer_size + 1)}

    def jaccard_dist(a, b):
        # Exact Jaccard distance between the k-mer sets of a and b
        kmers_a = kmers_of(a)
        kmers_b = kmers_of(b)
        overlap = len(kmers_a & kmers_b)
        combined = len(kmers_a | kmers_b)
        return 1.0 - float(overlap) / combined

    self.dist_fn = jaccard_dist
def setUp(self):
    """Create a MinHash family with a k-mer size of 3 under a fixed seed."""
    # Seed the RNG before building the family so that the hash
    # functions are always the same
    random.seed(0)

    kmer_size = 3
    self.family = lsh.MinHashFamily(kmer_size)
def cluster_with_minhash_signatures(seqs, k=12, N=100, threshold=0.1):
    """Cluster sequences based on their MinHash signatures.

    Args:
        seqs: dict mapping sequence header to sequences
        k: k-mer size to use for k-mer hashes (smaller is likely more
            sensitive for divergent genomes, but may lead to false positives
            in determining which genomes are close)
        N: number of hash values to use in a signature (higher is slower for
            clustering, but likely more sensitive for divergent genomes)
        threshold: maximum inter-cluster distance to merge clusters, in
            average nucleotide dissimilarity (1-ANI, where ANI is
            average nucleotide identity); higher results in fewer
            clusters

    Returns:
        list c such that c[i] gives a collection of sequence headers
        in the same cluster, and the clusters in c are sorted
        in descending order of size; when seqs has fewer than two
        sequences, c is empty (no sequences) or holds one cluster with
        the only header
    """
    num_seqs = len(seqs)
    if num_seqs < 2:
        # With fewer than two sequences there are no pairwise distances,
        # so the condensed distance matrix would be empty. The clustering
        # helpers are not visible here and may not accept an empty matrix
        # (NOTE(review): confirm), so answer the trivial cases directly:
        # no clusters for no input, or a single one-member cluster.
        return [[name] for name in seqs]

    logger.info(("Producing signatures of %d sequences"), num_seqs)
    family = lsh.MinHashFamily(k, N=N)
    signatures_map = make_signatures_with_minhash(family, seqs)

    # Map each sequence header to an index (0-based), and get
    # the signature for the corresponding index
    seq_headers = list(seqs)
    signatures = [signatures_map[name] for name in seq_headers]

    # Eq. 4 of the Mash paper (Ondov et al. 2016) shows that the
    # Mash distance, which is shown to be closely related to 1-ANI, is:
    #  D = (-1/k) * ln(2*j/(1+j))
    # where j is a Jaccard similarity. Solving for j:
    #  j = 1/(2*exp(k*D) - 1)
    # So, for a desired distance D in terms of 1-ANI, the corresponding
    # Jaccard distance is:
    #  1.0 - 1/(2*exp(k*D) - 1)
    # We can use this to calculate a clustering threshold in terms of
    # Jaccard distance
    jaccard_dist_threshold = 1.0 - 1.0 / (2.0 * np.exp(k * threshold) - 1)

    def jaccard_dist(i, j):
        # Return estimated Jaccard dist between signatures at
        # index i and index j
        return family.estimate_jaccard_dist(signatures[i], signatures[j])

    logger.info(("Creating condensed distance matrix of %d sequences"),
                num_seqs)
    dist_matrix = create_condensed_dist_matrix(num_seqs, jaccard_dist)

    logger.info(
        ("Clustering %d sequences at Jaccard distance threshold of %f"),
        num_seqs, jaccard_dist_threshold)
    clusters = cluster_from_dist_matrix(dist_matrix, jaccard_dist_threshold)

    # Translate each cluster from sequence indices back to headers
    return [[seq_headers[i] for i in cluster_idxs]
            for cluster_idxs in clusters]