Example #1
def _search(con, quiet=True, select_threshold=SELECT_THRESHOLD, blacklist=None):
    """
    Search the sequences in a file against a reference database
    """
    blacklist = blacklist or set()
    p = load_params(con)

    cursor = con.cursor()
    count = 0
    ref_name = p['ref_fasta']
    with open(p['ref_meta']) as fp:
        cluster_info = _load_cluster_info(fp, p['group_field'])

    @memoize
    def add_hit(hit_name, cluster):
        ins = "INSERT INTO ref_seqs(name, cluster_name) VALUES (?, ?)"
        cursor.execute(ins, [hit_name, cluster])
        return cursor.lastrowid

    @memoize
    def get_seq_id(name):
        cursor.execute('SELECT sequence_id FROM sequences WHERE name = ?', [name])
        return cursor.fetchone()[0]

    with _ntf(prefix='usearch') as uc_fp:
        uclust.search(ref_name, p['fasta_file'], uc_fp.name, pct_id=0.9,
                trunclabels=True, maxaccepts=p['maxaccepts'],
                maxrejects=p['maxrejects'], quiet=quiet)

        records = uclust.parse_uclust_out(uc_fp)
        records = (i for i in records
                   if i.type == 'H' and i.pct_id >= p['search_id'] * 100.0)
        by_seq = uclust.hits_by_sequence(records)
        by_seq = select_hits(by_seq, select_threshold)

        sql = """
INSERT INTO best_hits (sequence_id, hit_idx, ref_id, pct_id)
VALUES (?, ?, ?, ?)
"""
        for _, hits in by_seq:
            # Drop clusters from blacklist
            hits = (h for h in hits if cluster_info[h.target_label] not in blacklist)
            seen_clusters = set()
            for i, h in enumerate(hits):
                cluster = cluster_info[h.target_label]

                # Only keep one sequence per cluster
                if cluster in seen_clusters:
                    continue
                else:
                    seen_clusters.add(cluster)

                # Hit id
                hit_id = add_hit(h.target_label, cluster)
                cursor.execute(sql, [get_seq_id(h.query_label), i, hit_id, h.pct_id])
                count += 1

    return count
Example #2
def cluster_identify_redundant(named_sequence_file, named_ids, to_cluster, threshold=0.97):
    with util.ntf(suffix=".uc", prefix="to_cluster") as tf:
        # Search with uclust
        uclust.search(named_sequence_file, to_cluster, tf.name, pct_id=0.80, maxaccepts=5, maxrejects=100)

        # Uclust.search renames to tf, need a new handle.
        records = uclust.parse_uclust_out(tf)
        hits = (i.query_label for i in records if i.type == "H" and i.pct_id >= threshold * 100.0)

        return frozenset(hits)
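
A minimal usage sketch; the FASTA paths are placeholders, and note that `named_ids` is accepted but not used in the snippet shown, so an empty set suffices here:

# Hypothetical usage -- file names are placeholders, not from the example.
# Query sequences that hit the named set at >= 97% identity are returned
# as a frozenset of their query labels.
redundant = cluster_identify_redundant('named.fasta', named_ids=frozenset(),
                                       to_cluster='to_cluster.fasta')
print(len(redundant), 'redundant sequences')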
Example #3
def identify_otus_unnamed(seq_file, cluster_similarity):
    """
    Generates lists of sequence ids, one list per cluster.

    Identifies sequences in OTUs at the given cluster similarity.
    """
    logging.info("Running UCLUST on unnamed sequences at %f", cluster_similarity)
    with util.ntf(prefix="uclust") as tf:
        # Sort and cluster
        uclust.cluster(seq_file, tf.name, pct_id=cluster_similarity, quiet=True)
        clusters = uclust.sequences_by_cluster(uclust.parse_uclust_out(tf))
        for _, sequences in clusters:
            yield [i.query_label for i in sequences]
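
A short sketch of how this generator might be consumed; the FASTA path and the 0.97 cutoff are placeholder values, not from the example:

# Hypothetical usage -- each yielded item is a list of sequence ids in one OTU.
for otu_members in identify_otus_unnamed('unnamed_seqs.fasta', 0.97):
    print(len(otu_members), 'sequences in OTU, e.g.', otu_members[:3])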
Example #4
def identify_otus_unnamed(seq_file, cluster_similarity):
    """
    Generates lists of sequence ids, one list per cluster.

    Identifies sequences in OTUs at the given cluster similarity.
    """
    logging.info('Running UCLUST on unnamed sequences at %f',
                 cluster_similarity)
    with util.ntf(prefix='uclust') as tf:
        # Sort and cluster
        uclust.cluster(
            seq_file, tf.name, pct_id=cluster_similarity, quiet=True)
        clusters = uclust.sequences_by_cluster(uclust.parse_uclust_out(tf))
        for _, sequences in clusters:
            yield [i.query_label for i in sequences]
Example #5
def cluster_identify_redundant(named_sequence_file, named_ids, to_cluster,
        threshold=0.97):
    with util.ntf(suffix='.uc', prefix='to_cluster') as tf:
        # Search with uclust
        uclust.search(named_sequence_file, to_cluster, tf.name,
                pct_id=0.80,
                maxaccepts=5,
                maxrejects=100)

        # Uclust.search renames to tf, need a new handle.
        records = uclust.parse_uclust_out(tf)
        hits = (i.query_label for i in records
                if i.type == 'H' and i.pct_id >= threshold * 100.0)

        return frozenset(hits)
Example #6
def uclust_search(query, db, **kwargs):
    with util.ntf(prefix='uclust') as tf:
        uclust.search(db, query, tf.name, **kwargs)
        lines = (i for i in tf if i.startswith('H'))
        for i in uclust.parse_uclust_out(lines):
            yield i
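
A minimal consumption sketch; the file names and the pct_id keyword (forwarded via **kwargs to uclust.search, mirroring the other examples) are assumptions:

# Hypothetical usage -- stream hit records for query.fasta against ref.fasta.
for hit in uclust_search('query.fasta', 'ref.fasta', pct_id=0.97):
    print(hit.query_label, hit.target_label, hit.pct_id)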
Example #7
def _search(con,
            quiet=True,
            select_threshold=SELECT_THRESHOLD,
            search_threshold=SEARCH_THRESHOLD,
            blacklist=None):
    """
    Search the sequences in a file against a reference database
    """
    blacklist = blacklist or set()
    p = load_params(con)

    cursor = con.cursor()
    count = 0
    ref_name = p['ref_fasta']
    with open(p['ref_meta']) as fp:
        cluster_info = _load_cluster_info(fp, p['group_field'])

    @memoize
    def add_hit(hit_name, cluster):
        ins = 'INSERT INTO ref_seqs(name, cluster_name) VALUES (?, ?)'
        logging.debug(ins.replace('?', '{}').format(hit_name, cluster))
        cursor.execute(ins, [hit_name, cluster])
        return cursor.lastrowid

    @memoize
    def get_seq_id(name):
        sql = 'SELECT sequence_id FROM sequences WHERE name = ?'
        logging.debug(sql.replace('?', '{}').format(name))
        cursor.execute(sql, [name])
        return cursor.fetchone()[0]

    with _ntf(prefix='usearch') as uc_fp:
        uclust.search(ref_name,
                      p['fasta_file'],
                      uc_fp.name,
                      pct_id=search_threshold,
                      maxaccepts=p['maxaccepts'],
                      maxrejects=p['maxrejects'],
                      quiet=quiet)

        records = uclust.parse_uclust_out(uc_fp)
        records = (
            i for i in records
            if i.type == 'H' and i.pct_id >= p['search_identity'] * 100.0)
        by_seq = uclust.hits_by_sequence(records)
        by_seq = select_hits(by_seq, select_threshold)

        sql = """
INSERT INTO best_hits (sequence_id, hit_idx, ref_id, pct_id)
VALUES (?, ?, ?, ?)
"""
        for _, hits in by_seq:
            # Drop clusters from blacklist
            hits = (h for h in hits
                    if cluster_info[h.target_label] not in blacklist)
            seen_clusters = set()
            for i, h in enumerate(hits):
                cluster = cluster_info[h.target_label]

                # Only keep one sequence per cluster
                if cluster in seen_clusters:
                    continue
                else:
                    seen_clusters.add(cluster)

                # Hit id
                hit_id = add_hit(h.target_label, cluster)
                seq_id = get_seq_id(h.query_label)
                logging.debug(
                    sql.replace('?', '{}').format(seq_id, i, hit_id, h.pct_id))
                cursor.execute(sql, [seq_id, i, hit_id, h.pct_id])
                count += 1

    return count