def _search(con, quiet=True, select_threshold=SELECT_THRESHOLD, blacklist=None):
    """
    Search the sequences in a file against a reference database.

    :param con: open database connection; params are read via load_params(con)
    :param quiet: suppress uclust console output
    :param select_threshold: forwarded to select_hits to choose best hits
    :param blacklist: set of cluster names whose hits are discarded
    :returns: number of rows inserted into best_hits
    """
    blacklist = blacklist or set()
    p = load_params(con)
    cursor = con.cursor()
    count = 0
    ref_name = p['ref_fasta']
    with open(p['ref_meta']) as fp:
        cluster_info = _load_cluster_info(fp, p['group_field'])

    @memoize
    def add_hit(hit_name, cluster):
        # BUG FIX: the parameter was named ``clusterj`` while the body used
        # ``cluster`` from the enclosing scope, so memoization keyed on the
        # argument but inserted the closure variable. Names now agree.
        ins = "INSERT INTO ref_seqs(name, cluster_name) VALUES (?, ?)"
        cursor.execute(ins, [hit_name, cluster])
        return cursor.lastrowid

    @memoize
    def get_seq_id(name):
        cursor.execute('SELECT sequence_id FROM sequences WHERE name = ?',
                       [name])
        return cursor.fetchone()[0]

    with _ntf(prefix='usearch') as uc_fp:
        uclust.search(ref_name, p['fasta_file'], uc_fp.name, pct_id=0.9,
                      trunclabels=True, maxaccepts=p['maxaccepts'],
                      maxrejects=p['maxrejects'], quiet=quiet)
        records = uclust.parse_uclust_out(uc_fp)
        # Keep only hit records at or above the configured search identity
        records = (i for i in records
                   if i.type == 'H' and i.pct_id >= p['search_id'] * 100.0)
        by_seq = uclust.hits_by_sequence(records)
        by_seq = select_hits(by_seq, select_threshold)
        sql = """
        INSERT INTO best_hits (sequence_id, hit_idx, ref_id, pct_id)
        VALUES (?, ?, ?, ?)
        """
        for _, hits in by_seq:
            # Drop clusters from blacklist
            hits = (h for h in hits
                    if cluster_info[h.target_label] not in blacklist)
            seen_clusters = set()
            for i, h in enumerate(hits):
                cluster = cluster_info[h.target_label]
                # Only keep one sequence per cluster
                if cluster in seen_clusters:
                    continue
                seen_clusters.add(cluster)
                # Hit id
                hit_id = add_hit(h.target_label, cluster)
                cursor.execute(sql, [get_seq_id(h.query_label), i, hit_id,
                                     h.pct_id])
                count += 1
    return count
def cluster_identify_redundant(named_sequence_file, named_ids, to_cluster,
                               threshold=0.97):
    """
    Identify redundant query sequences by searching against named sequences.

    Returns a frozenset of query labels whose best hit meets ``threshold``
    percent identity.
    """
    with util.ntf(suffix=".uc", prefix="to_cluster") as uc_file:
        # Search with uclust
        uclust.search(named_sequence_file, to_cluster, uc_file.name,
                      pct_id=0.80, maxaccepts=5, maxrejects=100)
        # Uclust.search renames to tf, need a new handle.
        parsed = uclust.parse_uclust_out(uc_file)
        min_pct = threshold * 100.0
        redundant = frozenset(rec.query_label for rec in parsed
                              if rec.type == "H" and rec.pct_id >= min_pct)
    return redundant
def identify_otus_unnamed(seq_file, cluster_similarity):
    """
    Generates sequence ids in a cluster

    Identify sequences in OTUs at the given cluster similarity;
    """
    logging.info("Running UCLUST on unnamed sequences at %f",
                 cluster_similarity)
    with util.ntf(prefix="uclust") as out_file:
        # Sort and cluster
        uclust.cluster(seq_file, out_file.name, pct_id=cluster_similarity,
                       quiet=True)
        parsed = uclust.parse_uclust_out(out_file)
        for _, members in uclust.sequences_by_cluster(parsed):
            yield [member.query_label for member in members]
def identify_otus_unnamed(seq_file, cluster_similarity):
    """
    Generates sequence ids in a cluster

    Identify sequences in OTUs at the given cluster similarity;
    """
    logging.info('Running UCLUST on unnamed sequences at %f',
                 cluster_similarity)
    with util.ntf(prefix='uclust') as cluster_out:
        # Sort and cluster
        uclust.cluster(seq_file, cluster_out.name,
                       pct_id=cluster_similarity, quiet=True)
        grouped = uclust.sequences_by_cluster(
            uclust.parse_uclust_out(cluster_out))
        for _, seqs in grouped:
            labels = []
            for record in seqs:
                labels.append(record.query_label)
            yield labels
def cluster_identify_redundant(named_sequence_file, named_ids, to_cluster,
                               threshold=0.97):
    """
    Find query sequences that are redundant with the named sequence set.

    A query is redundant when uclust reports a hit ('H' record) against
    ``named_sequence_file`` at or above ``threshold`` percent identity.
    Returns the redundant query labels as a frozenset.
    """
    cutoff = threshold * 100.0
    with util.ntf(suffix='.uc', prefix='to_cluster') as out:
        # Search with uclust
        uclust.search(named_sequence_file, to_cluster, out.name,
                      pct_id=0.80, maxaccepts=5, maxrejects=100)
        # Uclust.search renames to tf, need a new handle.
        labels = [r.query_label for r in uclust.parse_uclust_out(out)
                  if r.type == 'H' and r.pct_id >= cutoff]
        return frozenset(labels)
def uclust_search(query, db, **kwargs):
    """
    Run a uclust search of ``query`` against ``db``.

    Extra keyword arguments are passed through to ``uclust.search``.
    Yields parsed records for hit ('H') lines only.
    """
    with util.ntf(prefix='uclust') as result_file:
        uclust.search(db, query, result_file.name, **kwargs)
        # Feed only the hit lines to the parser
        hit_lines = (line for line in result_file if line.startswith('H'))
        for record in uclust.parse_uclust_out(hit_lines):
            yield record
def _search(con, quiet=True, select_threshold=SELECT_THRESHOLD,
            search_threshold=SEARCH_THRESHOLD, blacklist=None):
    """
    Search the sequences in a file against a reference database.

    :param con: open database connection; params are read via load_params(con)
    :param quiet: suppress uclust console output
    :param select_threshold: forwarded to select_hits to choose best hits
    :param search_threshold: pct_id passed to the uclust search
    :param blacklist: set of cluster names whose hits are discarded
    :returns: number of rows inserted into best_hits
    """
    blacklist = blacklist or set()
    p = load_params(con)
    cursor = con.cursor()
    count = 0
    ref_name = p['ref_fasta']
    with open(p['ref_meta']) as fp:
        cluster_info = _load_cluster_info(fp, p['group_field'])

    @memoize
    def add_hit(hit_name, cluster):
        # BUG FIX: the parameter was named ``clusterj`` while the body used
        # ``cluster`` from the enclosing scope, so memoization keyed on the
        # argument but inserted the closure variable. Names now agree.
        ins = 'INSERT INTO ref_seqs(name, cluster_name) VALUES (?, ?)'
        logging.debug(ins.replace('?', '{}').format(hit_name, cluster))
        cursor.execute(ins, [hit_name, cluster])
        return cursor.lastrowid

    @memoize
    def get_seq_id(name):
        sql = 'SELECT sequence_id FROM sequences WHERE name = ?'
        logging.debug(sql.replace('?', '{}').format(name))
        cursor.execute(sql, [name])
        return cursor.fetchone()[0]

    with _ntf(prefix='usearch') as uc_fp:
        uclust.search(ref_name, p['fasta_file'], uc_fp.name,
                      pct_id=search_threshold,
                      maxaccepts=p['maxaccepts'],
                      maxrejects=p['maxrejects'], quiet=quiet)
        records = uclust.parse_uclust_out(uc_fp)
        # Keep only hit records at or above the configured search identity
        records = (
            i for i in records
            if i.type == 'H' and i.pct_id >= p['search_identity'] * 100.0)
        by_seq = uclust.hits_by_sequence(records)
        by_seq = select_hits(by_seq, select_threshold)
        sql = """
        INSERT INTO best_hits (sequence_id, hit_idx, ref_id, pct_id)
        VALUES (?, ?, ?, ?)
        """
        for _, hits in by_seq:
            # Drop clusters from blacklist
            hits = (h for h in hits
                    if cluster_info[h.target_label] not in blacklist)
            seen_clusters = set()
            for i, h in enumerate(hits):
                cluster = cluster_info[h.target_label]
                # Only keep one sequence per cluster
                if cluster in seen_clusters:
                    continue
                seen_clusters.add(cluster)
                # Hit id
                hit_id = add_hit(h.target_label, cluster)
                seq_id = get_seq_id(h.query_label)
                logging.debug(
                    sql.replace('?', '{}').format(seq_id, i, hit_id,
                                                  h.pct_id))
                cursor.execute(sql, [seq_id, i, hit_id, h.pct_id])
                count += 1
    return count