def action(args):
    """Write the intersection rank for every tax_id of an input taxtable.

    The taxtable read from ``args.infile`` is loaded into a temporary SQLite
    database, which is attached to the reference package database under the
    schema name ``tt``.  One ``tax_id,intersection_rank`` row per result of
    the query below is written as CSV to ``args.outfile``; tax_ids without a
    matching rank get an empty string.

    ``args`` must provide:

     * refpkg - path handed to ``Refpkg``.
     * ranks - comma-delimited string of rank names to consider.
     * infile - open file containing the taxtable (CSV with a header row).
     * outfile - open file receiving the CSV output.
    """
    rp = Refpkg(args.refpkg)
    rp.load_db()
    cursor = rp.db.cursor()
    ranks = args.ranks.split(',')

    with tempfile.NamedTemporaryFile() as tmp_db:
        taxtable_db = Taxdb(sqlite3.connect(tmp_db.name))
        taxtable_db.create_tables()
        reader = csv.DictReader(args.infile)
        # Use the public ``fieldnames`` property rather than the private
        # ``_fieldnames`` attribute: it reads the header row on demand, so
        # the callable never returns None for a non-empty input.
        taxtable_db.insert_from_taxtable(lambda: reader.fieldnames, reader)
        cursor.execute('ATTACH DATABASE ? AS tt', (tmp_db.name,))

        writer = csv.writer(args.outfile)
        writer.writerow(('tax_id', 'intersection_rank'))
        # One ``?`` placeholder per requested rank; the rank values are bound
        # as parameters, never interpolated into the SQL text.
        # The empty-string default uses single quotes: double-quoted string
        # literals are only accepted by SQLite as a legacy quirk.
        # NOTE(review): the outer GROUP BY relies on SQLite's handling of
        # bare columns to pick which rank is reported per tax_id -- confirm
        # the chosen row is the intended one.
        cursor.execute("""
            SELECT tax_id,
                   COALESCE(itaxa.rank, '')
              FROM tt.taxa
                   LEFT JOIN (SELECT child AS tax_id,
                                     rank
                                FROM tt.parents
                                     JOIN taxa
                                       ON tax_id = parent
                                     JOIN ranks USING (rank)
                               WHERE rank IN (%s)
                               ORDER BY child,
                                        rank_order ASC) itaxa USING (tax_id)
             GROUP BY tax_id
        """ % ', '.join('?' * len(ranks)), ranks)
        writer.writerows(cursor)

    args.outfile.flush()
    test_output(args.infile.name, args.outfile.name, ranks)
Exemple #2
0
def align_and_merge(env, refpkg, qseqs, outdir = None,
                    options = None, nproc = 1):

    """
    Align the sequences in ``qseqs`` against the profile of a reference
    package, then merge the result with the package's reference alignment.

     * env - Environment instance.
     * refpkg - path to a reference package directory.
     * qseqs - unaligned query sequences in fasta format.
     * outdir - optional output directory; saves files to same
       directory as qseqs if unspecified.
     * options - flags for cmalign [default infernal.CMALIGN_FLAGS];
       the same flags are also handed to the merge step.
     * nproc - number of processors to use for ``cmalign``.

    Returns the flattened list built from (sto, scores, merged)

    Example::

        from bioscons.pplacer import align_and_merge
        env.AddMethod(align_and_merge, "align_and_merge")
        sto, scores, merged = env.align_and_merge(
            refpkg = 'my.refpkg', qseqs = 'myseqs.fasta'
        )
    """

    # Make sure both helper methods are attached to the environment
    if not hasattr(env, 'cmalign_method'):
        env.AddMethod(cmalign_method, 'cmalign_method')
    if not hasattr(env, 'cmmerge_method'):
        env.AddMethod(cmmerge_method, 'cmmerge_method')

    # Locate the profile and reference alignment inside the package
    package = Refpkg(refpkg, create=False)
    cm_profile = package.file_abspath('profile')
    reference_sto = package.file_abspath('aln_sto')

    flags = options or CMALIGN_FLAGS

    # Step 1: align the query sequences
    sto, scores = env.cmalign_method(
        profile = cm_profile,
        fasta = qseqs,
        nproc = nproc,
        options = flags,
        outdir = outdir
        )

    # Step 2: merge the new alignment with the reference set
    merged_name = rename(sto, '_merged.sto')
    merged = env.cmmerge_method(
        cm_profile, reference_sto, sto,
        outname = merged_name,
        options = flags,
        outdir = outdir
        )

    # Register a clean target for any output directory other than '.'
    if outdir and outdir != '.':
        Clean(merged, Dir(outdir))

    return Flatten([sto, scores, merged])
Exemple #3
0
def action(args):
    """Write intersection ranks for every tax_id of an input taxtable.

    The taxtable read from ``args.infile`` is loaded into a temporary SQLite
    database, which is attached to the reference package database under the
    schema name ``tt``.  ``tax_id,intersection_rank`` rows are written as CSV
    to ``args.out``; tax_ids without a matching rank get an empty string.
    Unless ``args.all_ranks`` is set, the query results are passed through
    ``filter_ranks`` before being written.

    ``args`` must provide:

     * refpkg - path of an existing reference package.
     * ranks - comma-delimited string of rank names to consider.
     * infile - open file containing the taxtable (CSV with a header row).
     * out - open file receiving the CSV output.
     * all_ranks - when true, write every row returned by the query.
    """
    rp = Refpkg(args.refpkg, create=False)
    rp.load_db()
    cursor = rp.db.cursor()
    ranks = args.ranks.split(',')

    with tempfile.NamedTemporaryFile() as tmp_db:
        taxtable_db = Taxdb(sqlite3.connect(tmp_db.name))
        taxtable_db.create_tables()
        reader = csv.DictReader(args.infile)
        # Use the public ``fieldnames`` property rather than the private
        # ``_fieldnames`` attribute: it reads the header row on demand, so
        # the callable never returns None for a non-empty input.
        taxtable_db.insert_from_taxtable(lambda: reader.fieldnames, reader)
        cursor.execute('ATTACH DATABASE ? AS tt', (tmp_db.name, ))

        writer = csv.writer(args.out)
        writer.writerow(('tax_id', 'intersection_rank'))
        # One ``?`` placeholder per requested rank; the rank values are bound
        # as parameters, never interpolated into the SQL text.
        # The empty-string default uses single quotes: double-quoted string
        # literals are only accepted by SQLite as a legacy quirk.
        cursor.execute(
            """
            SELECT tax_id,
                   COALESCE(itaxa.rank, '')
              FROM tt.taxa
                   LEFT JOIN (SELECT child AS tax_id,
                                     rank_order,
                                     rank
                                FROM tt.parents
                                     JOIN taxa
                                       ON tax_id = parent
                                     JOIN ranks USING (rank)
                               WHERE rank IN (%s)) itaxa USING (tax_id)
             ORDER BY tax_id,
                      rank_order DESC
        """ % ', '.join('?' * len(ranks)), ranks)
        if not args.all_ranks:
            cursor = filter_ranks(cursor)
        writer.writerows(cursor)

    args.out.flush()
    test_output(args.infile.name, args.out.name, ranks)
Exemple #4
0
def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS):
    """Yield a temporary reference package built from aligned sequences.

    Written as a single-yield generator (the shape used for context
    managers): a tree is inferred with FastTree, a reference package named
    ``name`` is created inside a temporary directory and populated, and the
    package is yielded.  The enclosing ``with`` blocks are only exited once
    the generator is resumed after the yield.

    """
    records = list(sequences)
    with ntf(prefix='fasttree-', suffix='.log') as log_fp, \
         ntf(prefix='fasttree-', suffix='.tre') as tree_fp, \
         tempdir(prefix='refpkg') as refpkg_dir:

        # Only the log file's name is needed; FastTree gets it as a path
        log_fp.close()

        fasttree(records,
                 log_path=log_fp.name,
                 output_fp=tree_fp,
                 gtr=True,
                 threads=threads)
        tree_fp.close()

        package = Refpkg(refpkg_dir(name), create=True)
        package.update_metadata('locus', '')
        package.update_phylo_model('FastTree', log_fp.name)
        package.update_file('tree', tree_fp.name)

        # Store the alignment twice: as FASTA, then as Stockholm
        alignment_specs = (
            ('aln_fasta', '.fasta', 'fasta'),
            ('aln_sto', '.sto', 'stockholm'),
        )
        for resource_key, suffix, seq_format in alignment_specs:
            with ntf(suffix=suffix) as aln_fp:
                SeqIO.write(records, aln_fp, seq_format)
                aln_fp.close()
                package.update_file(resource_key, aln_fp.name)

        logging.debug("Reference package written to %s", package.path)
        yield package
Exemple #5
0
        workdir = args.workdir
        try:
            os.makedirs(workdir)
        except OSError, e:
            if e.errno != errno.EEXIST:
                raise

    if not args.disable_cleanup:

        @atexit.register
        def cleanup_workdir():
            shutil.rmtree(workdir, ignore_errors=True)

    classif_db = os.path.join(workdir, 'classifications.sqlite')
    index_refpkg = os.path.join(args.hrefpkg, 'index.refpkg')
    index = Refpkg(index_refpkg)
    index_rank = index.metadata('index_rank')
    classif_rank = args.classification_rank or index_rank
    index_counts = os.path.join(args.hrefpkg,
                                'index-%s.counts' % (classif_rank, ))
    log.info('performing initial classification at %s', classif_rank)
    silently_unlink(classif_db)
    logging_check_call(
        [args.rppr, 'prep_db', '--sqlite', classif_db, '-c', index_refpkg])
    logging_check_call([
        args.guppy, 'classify', '--sqlite', classif_db, '-c', index_refpkg,
        '--classifier', 'nbc', '--nbc-rank', classif_rank, '--no-pre-mask',
        '--nbc-sequences', args.query_seqs, '--nbc-counts', index_counts, '-j',
        str(args.ncores)
    ])
Exemple #6
0
def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS):
    """Yield a temporary reference package built from aligned sequences.

    Written as a single-yield generator (the shape used for context
    managers): a tree is inferred with FastTree, a reference package named
    ``name`` is created inside a temporary directory and populated, and the
    package is yielded.  The enclosing ``with`` blocks are only exited once
    the generator is resumed after the yield.

    """
    records = list(sequences)
    with ntf(prefix='fasttree-', suffix='.log') as log_fp, \
         ntf(prefix='fasttree-', suffix='.tre') as tree_fp, \
         tempdir(prefix='refpkg') as refpkg_dir:

        # Only the log file's name is needed; FastTree gets it as a path
        log_fp.close()

        fasttree(records, log_path=log_fp.name, output_fp=tree_fp, gtr=True,
                 threads=threads)
        tree_fp.close()

        package = Refpkg(refpkg_dir(name), create=True)
        package.update_metadata('locus', '')
        package.update_phylo_model('FastTree', log_fp.name)
        package.update_file('tree', tree_fp.name)

        # Store the alignment twice: as FASTA, then as Stockholm
        alignment_specs = (
            ('aln_fasta', '.fasta', 'fasta'),
            ('aln_sto', '.sto', 'stockholm'),
        )
        for resource_key, suffix, seq_format in alignment_specs:
            with ntf(suffix=suffix) as aln_fp:
                SeqIO.write(records, aln_fp, seq_format)
                aln_fp.close()
                package.update_file(resource_key, aln_fp.name)

        logging.debug("Reference package written to %s", package.path)
        yield package
Exemple #7
0
def tax_id_refpkg(tax_id, full_tax, seqinfo, sequence_file,
        output_dir='.',
        index_rank='order', train_file=None, test_file=None):
    """
    Build a reference package containing all descendants of tax_id from an
    index reference package.

     * tax_id - taxonomy node to build the package for; the package is
       created as ``<tax_id>.refpkg`` inside ``output_dir``.
     * full_tax - taxonomy object providing ``get_node(tax_id)``.
     * seqinfo - non-empty sequence of dict rows carrying at least the
       ``tax_id`` and ``seqname`` keys.
     * sequence_file - sequence source handed to ``wrap.esl_sfetch``.
     * output_dir - directory in which the reference package is created.
     * index_rank - forwarded to ``choose_sequence_ids``.
     * train_file, test_file - optional destinations; when given, the
       training / test sequences are fetched into them.

    Returns the path of the new reference package, or None when fewer than
    two sequences, or only one unique sequence string, are available.
    """
    with util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp, \
         util.ntf(prefix='aln_sto', suffix='.sto') as sto_fp, \
         util.ntf(prefix='aln_fasta', suffix='.fasta') as fasta_fp, \
         util.ntf(prefix='tree', suffix='.tre') as tree_fp, \
         util.ntf(prefix='tree', suffix='.stats') as stats_fp, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp:

        # Subset taxonomy
        n = full_tax.get_node(tax_id)
        descendants = set(i.tax_id for i in n)
        assert descendants
        n.write_taxtable(tax_fp)
        tax_fp.close()

        # Subset seq_info
        w = csv.DictWriter(seq_info_fp, seqinfo[0].keys(),
                quoting=csv.QUOTE_NONNUMERIC)
        w.writeheader()
        rows = [i for i in seqinfo if i['tax_id'] in descendants]
        sinfo = {i['seqname']: i for i in rows}

        # Choose sequences, divide into train and test sets
        chosen = choose_sequence_ids(n, rows, index_rank=index_rank)
        keep_seq_ids = set()
        train_seq_ids = set()
        test_seq_ids = set()

        for keep, rest in chosen:
            keep_seq_ids |= frozenset(keep)
            n_rest = len(rest)
            if n_rest >= 2 * PER_TAXON:
                # Floor division keeps the split point an integer on
                # Python 3 as well; a float slice index raises TypeError.
                half = n_rest // 2
                train_seq_ids |= frozenset(rest[:half])
                test_seq_ids |= frozenset(rest[half:])

        # Picked
        rows = [sinfo[i] for i in keep_seq_ids]
        w.writerows(rows)
        seq_info_fp.close()

        # Fetch sequences
        with tempfile.NamedTemporaryFile() as tf:
            wrap.esl_sfetch(sequence_file,
                            keep_seq_ids, tf)
            # Rewind
            tf.seek(0)
            sequences = list(SeqIO.parse(tf, 'fasta'))
        logging.info("Tax id %s: %d sequences", tax_id, len(sequences))

        if len(set(str(i.seq) for i in sequences)) == 1:
            logging.warning("Skipping %s: only 1 unique sequence string",
                            tax_id)
            return None

        # No sense in building with one sequence
        if len(sequences) < 2:
            logging.warning("Skipping: %d sequences.", len(sequences))
            return None

        # Extract training & test seqs
        if train_file:
            logging.info("%d training sequences", len(train_seq_ids))
            wrap.esl_sfetch(sequence_file, train_seq_ids, train_file)
        if test_file:
            logging.info("%d test sequences", len(test_seq_ids))
            wrap.esl_sfetch(sequence_file, test_seq_ids, test_file)

        # Cmalign
        aligned = wrap.cmalign(sequences, output=sto_fp)
        aligned = list(aligned)
        assert aligned
        # Tree
        wrap.fasttree(aligned, log_path=stats_fp.name, output_fp=tree_fp,
                      threads=1, gtr=True)
        tree_fp.close()
        sto_fp.close()
        SeqIO.write(aligned, fasta_fp, 'fasta')
        fasta_fp.close()

        rp = Refpkg(os.path.join(output_dir, tax_id + '.refpkg'), create=True)
        rp.start_transaction()
        rp.update_file('aln_sto', sto_fp.name)
        rp.update_file('aln_fasta', fasta_fp.name)
        rp.update_file('tree', tree_fp.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        try:
            rp.update_phylo_model('FastTree', stats_fp.name)
        except Exception:
            # Dump the FastTree statistics to help diagnose the failure,
            # then let the original error propagate.
            sys.stderr.write('%s\n' % (stats_fp.read(),))
            raise
        rp.update_file('profile', wrap.CM)
        rp.commit_transaction()

        # Rerooting needs the external ``rppr`` executable
        util.require_executable('rppr')
        rp.reroot()

        return rp.path
Exemple #8
0
def build_index_refpkg(hrefpkg_paths, sequence_file, seqinfo, taxonomy,
        dest='index.refpkg', **meta):
    """
    Build an index.refpkg from a set of hrefpkgs

     * hrefpkg_paths - paths of existing reference packages whose
       ``seq_info`` resources determine which sequences are indexed.
     * sequence_file - sequence source handed to ``wrap.esl_sfetch``.
     * seqinfo - non-empty sequence of dict rows carrying at least a
       ``seqname`` key.
     * taxonomy - taxonomy object; it is deep-copied, so the caller's
       instance is left untouched.
     * dest - path of the reference package to create.
     * meta - extra key/value pairs stored as reference package metadata.

    Returns ``(rp, sequence_ids)``: the new reference package and the
    frozenset of sequence ids it contains.
    """

    # Clear taxonomy
    taxonomy = copy.deepcopy(taxonomy)
    for node in taxonomy:
        node.sequence_ids = set()

    # Add seqinfo from each hrefpkg, opening one resource at a time
    for path in hrefpkg_paths:
        with Refpkg(path, create=False).open_resource('seq_info') as f:
            taxonomy.populate_from_seqinfo(f)

    # Remove lineages without sequences
    taxonomy.prune_unrepresented()

    sequence_ids = frozenset(taxonomy.subtree_sequence_ids())

    with util.ntf(prefix='aln_fasta', suffix='.fasta') as tf, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp, \
         util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp:
        wrap.esl_sfetch(sequence_file, sequence_ids, tf)
        tf.close()

        # Seqinfo file, restricted to the sequences kept in the taxonomy
        r = (i for i in seqinfo if i['seqname'] in sequence_ids)
        w = csv.DictWriter(seq_info_fp, seqinfo[0].keys(), lineterminator='\n',
                quoting=csv.QUOTE_NONNUMERIC)
        w.writeheader()
        w.writerows(r)
        seq_info_fp.close()

        taxonomy.write_taxtable(tax_fp)
        tax_fp.close()

        rp = Refpkg(dest, create=True)
        rp.start_transaction()
        rp.update_file('aln_fasta', tf.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        rp.update_file('profile', wrap.CM)

        for k, v in meta.items():
            rp.update_metadata(k, v)

        rp.commit_transaction()

    return rp, sequence_ids
Exemple #9
0
    else:
        workdir = args.workdir
        try:
            os.makedirs(workdir)
        except OSError, e:
            if e.errno != errno.EEXIST:
                raise

    if not args.disable_cleanup:
        @atexit.register
        def cleanup_workdir():
            shutil.rmtree(workdir, ignore_errors=True)

    classif_db = os.path.join(workdir, 'classifications.sqlite')
    index_refpkg = os.path.join(args.hrefpkg, 'index.refpkg')
    index = Refpkg(index_refpkg)
    index_rank = index.metadata('index_rank')
    classif_rank = args.classification_rank or index_rank
    index_counts = os.path.join(args.hrefpkg, 'index-%s.counts' % (classif_rank,))
    log.info('performing initial classification at %s', classif_rank)
    silently_unlink(classif_db)
    logging_check_call(
        [args.rppr, 'prep_db', '--sqlite', classif_db, '-c', index_refpkg])
    logging_check_call(
        [args.guppy, 'classify', '--sqlite', classif_db, '-c', index_refpkg,
         '--classifier', 'nbc', '--nbc-rank', classif_rank, '--no-pre-mask',
         '--nbc-sequences', args.query_seqs, '--nbc-counts', index_counts,
         '-j', str(args.ncores)])

    with open(os.path.join(args.hrefpkg, 'index.csv'), 'rU') as fobj:
        refpkg_map = dict(csv.reader(fobj))
Exemple #10
0
def tax_id_refpkg(tax_id,
                  full_tax,
                  seqinfo,
                  sequence_file,
                  output_dir='.',
                  index_rank='order',
                  train_file=None,
                  test_file=None):
    """
    Build a reference package containing all descendants of tax_id from an
    index reference package.

     * tax_id - taxonomy node to build the package for; the package is
       created as ``<tax_id>.refpkg`` inside ``output_dir``.
     * full_tax - taxonomy object providing ``get_node(tax_id)``.
     * seqinfo - non-empty sequence of dict rows carrying at least the
       ``tax_id`` and ``seqname`` keys.
     * sequence_file - sequence source handed to ``wrap.esl_sfetch``.
     * output_dir - directory in which the reference package is created.
     * index_rank - forwarded to ``choose_sequence_ids``.
     * train_file, test_file - optional destinations; when given, the
       training / test sequences are fetched into them.

    Returns the path of the new reference package, or None when fewer than
    two sequences, or only one unique sequence string, are available.
    """
    with util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp, \
         util.ntf(prefix='aln_sto', suffix='.sto') as sto_fp, \
         util.ntf(prefix='aln_fasta', suffix='.fasta') as fasta_fp, \
         util.ntf(prefix='tree', suffix='.tre') as tree_fp, \
         util.ntf(prefix='tree', suffix='.stats') as stats_fp, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp:

        # Subset taxonomy
        n = full_tax.get_node(tax_id)
        descendants = set(i.tax_id for i in n)
        assert descendants
        n.write_taxtable(tax_fp)
        tax_fp.close()

        # Subset seq_info
        w = csv.DictWriter(seq_info_fp,
                           seqinfo[0].keys(),
                           quoting=csv.QUOTE_NONNUMERIC)
        w.writeheader()
        rows = [i for i in seqinfo if i['tax_id'] in descendants]
        sinfo = {i['seqname']: i for i in rows}

        # Choose sequences, divide into train and test sets
        chosen = choose_sequence_ids(n, rows, index_rank=index_rank)
        keep_seq_ids = set()
        train_seq_ids = set()
        test_seq_ids = set()

        for keep, rest in chosen:
            keep_seq_ids |= frozenset(keep)
            n_rest = len(rest)
            if n_rest >= 2 * PER_TAXON:
                # Floor division keeps the split point an integer on
                # Python 3 as well; a float slice index raises TypeError.
                half = n_rest // 2
                train_seq_ids |= frozenset(rest[:half])
                test_seq_ids |= frozenset(rest[half:])

        # Picked
        rows = [sinfo[i] for i in keep_seq_ids]
        w.writerows(rows)
        seq_info_fp.close()

        # Fetch sequences
        with tempfile.NamedTemporaryFile() as tf:
            wrap.esl_sfetch(sequence_file, keep_seq_ids, tf)
            # Rewind
            tf.seek(0)
            sequences = list(SeqIO.parse(tf, 'fasta'))
        logging.info("Tax id %s: %d sequences", tax_id, len(sequences))

        if len(set(str(i.seq) for i in sequences)) == 1:
            logging.warning("Skipping %s: only 1 unique sequence string",
                            tax_id)
            return None

        # No sense in building with one sequence
        if len(sequences) < 2:
            logging.warning("Skipping: %d sequences.", len(sequences))
            return None

        # Extract training & test seqs
        if train_file:
            logging.info("%d training sequences", len(train_seq_ids))
            wrap.esl_sfetch(sequence_file, train_seq_ids, train_file)
        if test_file:
            logging.info("%d test sequences", len(test_seq_ids))
            wrap.esl_sfetch(sequence_file, test_seq_ids, test_file)

        # Cmalign
        aligned = wrap.cmalign(sequences, output=sto_fp)
        aligned = list(aligned)
        assert aligned
        # Tree
        wrap.fasttree(aligned,
                      log_path=stats_fp.name,
                      output_fp=tree_fp,
                      threads=1,
                      gtr=True)
        tree_fp.close()
        sto_fp.close()
        SeqIO.write(aligned, fasta_fp, 'fasta')
        fasta_fp.close()

        rp = Refpkg(os.path.join(output_dir, tax_id + '.refpkg'), create=True)
        rp.start_transaction()
        rp.update_file('aln_sto', sto_fp.name)
        rp.update_file('aln_fasta', fasta_fp.name)
        rp.update_file('tree', tree_fp.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        try:
            rp.update_phylo_model('FastTree', stats_fp.name)
        except Exception:
            # Dump the FastTree statistics to help diagnose the failure,
            # then let the original error propagate.
            sys.stderr.write('%s\n' % (stats_fp.read(),))
            raise
        rp.update_file('profile', wrap.CM)
        rp.commit_transaction()

        # Rerooting needs the external ``rppr`` executable
        util.require_executable('rppr')
        rp.reroot()

        return rp.path
Exemple #11
0
def build_index_refpkg(hrefpkg_paths,
                       sequence_file,
                       seqinfo,
                       taxonomy,
                       dest='index.refpkg',
                       **meta):
    """
    Build an index.refpkg from a set of hrefpkgs

     * hrefpkg_paths - paths of existing reference packages whose
       ``seq_info`` resources determine which sequences are indexed.
     * sequence_file - sequence source handed to ``wrap.esl_sfetch``.
     * seqinfo - non-empty sequence of dict rows carrying at least a
       ``seqname`` key.
     * taxonomy - taxonomy object; it is deep-copied, so the caller's
       instance is left untouched.
     * dest - path of the reference package to create.
     * meta - extra key/value pairs stored as reference package metadata.

    Returns ``(rp, sequence_ids)``: the new reference package and the
    frozenset of sequence ids it contains.
    """

    # Clear taxonomy
    taxonomy = copy.deepcopy(taxonomy)
    for node in taxonomy:
        node.sequence_ids = set()

    # Add seqinfo from each hrefpkg, opening one resource at a time
    for path in hrefpkg_paths:
        with Refpkg(path, create=False).open_resource('seq_info') as f:
            taxonomy.populate_from_seqinfo(f)

    # Remove lineages without sequences
    taxonomy.prune_unrepresented()

    sequence_ids = frozenset(taxonomy.subtree_sequence_ids())

    with util.ntf(prefix='aln_fasta', suffix='.fasta') as tf, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp, \
         util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp:
        wrap.esl_sfetch(sequence_file, sequence_ids, tf)
        tf.close()

        # Seqinfo file, restricted to the sequences kept in the taxonomy
        r = (i for i in seqinfo if i['seqname'] in sequence_ids)
        w = csv.DictWriter(seq_info_fp,
                           seqinfo[0].keys(),
                           lineterminator='\n',
                           quoting=csv.QUOTE_NONNUMERIC)
        w.writeheader()
        w.writerows(r)
        seq_info_fp.close()

        taxonomy.write_taxtable(tax_fp)
        tax_fp.close()

        rp = Refpkg(dest, create=True)
        rp.start_transaction()
        rp.update_file('aln_fasta', tf.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        rp.update_file('profile', wrap.CM)

        for k, v in meta.items():
            rp.update_metadata(k, v)

        rp.commit_transaction()

    return rp, sequence_ids