Beispiel #1
0
def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS):
    """Yield a temporary reference package built from aligned sequences.

    Written as a generator with a single ``yield``; the context-manager
    decorator is not visible in this block (presumably applied where the
    function is defined -- confirm).

    Steps: build a tree with ``fasttree`` (called with ``gtr=True``),
    create a ``Refpkg`` called ``name`` inside a temporary directory,
    register the FastTree log and tree plus FASTA and Stockholm copies of
    the alignment, then yield the package.

    ``sequences`` is materialised into a list because it is traversed
    three times (tree building and both alignment files).  ``threads`` is
    forwarded unchanged to ``fasttree``.

    """
    records = list(sequences)

    # One entry per alignment copy: (file suffix, SeqIO format, refpkg key).
    alignment_outputs = (
        ('.fasta', 'fasta', 'aln_fasta'),
        ('.sto', 'stockholm', 'aln_sto'),
    )

    with ntf(prefix='fasttree-', suffix='.log') as log_file, \
         ntf(prefix='fasttree-', suffix='.tre') as tree_file, \
         tempdir(prefix='refpkg') as in_tempdir:

        # Only the log's path is used from here on (passed as log_path).
        log_file.close()

        fasttree(
            records,
            log_path=log_file.name,
            output_fp=tree_file,
            gtr=True,
            threads=threads,
        )
        tree_file.close()

        package = Refpkg(in_tempdir(name), create=True)
        package.update_metadata('locus', '')
        package.update_phylo_model('FastTree', log_file.name)
        package.update_file('tree', tree_file.name)

        # FASTA and Stockholm alignment
        for suffix, seq_format, file_key in alignment_outputs:
            with ntf(suffix=suffix) as aln_file:
                SeqIO.write(records, aln_file, seq_format)
                # Closed first, presumably so the data is flushed before
                # update_file picks the file up by path.
                aln_file.close()
                package.update_file(file_key, aln_file.name)

        logging.debug("Reference package written to %s", package.path)
        yield package
Beispiel #2
0
def build_index_refpkg(hrefpkg_paths, sequence_file, seqinfo, taxonomy,
        dest='index.refpkg', **meta):
    """
    Build an index.refpkg from a set of hrefpkgs

    Arguments:

    hrefpkg_paths -- iterable of paths, each opened with
        ``Refpkg(path, create=False)``; the ``seq_info`` resource of each
        is fed to the taxonomy.
    sequence_file -- passed to ``wrap.esl_sfetch`` as the source from which
        the selected sequences are fetched.
    seqinfo -- non-empty, indexable collection of dict rows, each with a
        ``'seqname'`` key.  ``seqinfo[0]`` supplies the CSV header, so an
        empty collection raises ``IndexError``.
    taxonomy -- taxonomy object; it is deep-copied, so the caller's
        instance is not modified.
    dest -- path at which the new reference package is created.
    meta -- extra keyword arguments, each stored as a metadata key/value
        pair on the new reference package.

    Returns a tuple ``(rp, sequence_ids)``: the created ``Refpkg`` and a
    frozenset of the sequence ids kept in the pruned taxonomy.
    """

    # Clear taxonomy (on a private copy, so the caller's object is untouched)
    taxonomy = copy.deepcopy(taxonomy)
    for node in taxonomy:
        node.sequence_ids = set()

    # Add seqinfo from every hrefpkg, one package at a time
    for path in hrefpkg_paths:
        with Refpkg(path, create=False).open_resource('seq_info') as fp:
            taxonomy.populate_from_seqinfo(fp)

    # Remove lineages without sequences
    taxonomy.prune_unrepresented()

    sequence_ids = frozenset(taxonomy.subtree_sequence_ids())

    with util.ntf(prefix='aln_fasta', suffix='.fasta') as tf, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp, \
         util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp:
        # Each temporary file is closed after writing and before its path
        # is handed to the refpkg.
        wrap.esl_sfetch(sequence_file, sequence_ids, tf)
        tf.close()

        # Seqinfo file: only rows whose sequence survived in the taxonomy
        rows = (i for i in seqinfo if i['seqname'] in sequence_ids)
        w = csv.DictWriter(seq_info_fp, seqinfo[0].keys(), lineterminator='\n',
                quoting=csv.QUOTE_NONNUMERIC)
        w.writeheader()
        w.writerows(rows)
        seq_info_fp.close()

        taxonomy.write_taxtable(tax_fp)
        tax_fp.close()

        # All file and metadata updates go through a single transaction.
        rp = Refpkg(dest, create=True)
        rp.start_transaction()
        rp.update_file('aln_fasta', tf.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        rp.update_file('profile', wrap.CM)

        for k, v in meta.items():
            rp.update_metadata(k, v)

        rp.commit_transaction()

    return rp, sequence_ids
Beispiel #3
0
def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS):
    """Yield a throw-away reference package for aligned sequences.

    This is a generator with exactly one ``yield``; the decorator that
    would turn it into a context manager is not part of this block
    (presumably applied at the definition site -- confirm).

    A tree is built with ``fasttree`` (``gtr=True``, ``threads`` passed
    through), a ``Refpkg`` called ``name`` is created in a temporary
    directory, and the FastTree log, the tree and FASTA/Stockholm copies
    of the alignment are registered in it before it is yielded.

    ``sequences`` is copied into a list up front since it is iterated
    once for the tree and once per alignment file.

    """
    seq_list = list(sequences)

    with ntf(prefix='fasttree-', suffix='.log') as fasttree_log, \
         ntf(prefix='fasttree-', suffix='.tre') as fasttree_tree, \
         tempdir(prefix='refpkg') as workdir:

        # The log handle itself is not written to here; fasttree gets the
        # path through log_path.
        fasttree_log.close()

        fasttree(seq_list, log_path=fasttree_log.name,
                 output_fp=fasttree_tree, gtr=True, threads=threads)
        fasttree_tree.close()

        refpkg = Refpkg(workdir(name), create=True)
        refpkg.update_metadata('locus', '')
        refpkg.update_phylo_model('FastTree', fasttree_log.name)
        refpkg.update_file('tree', fasttree_tree.name)

        # FASTA and Stockholm alignment, written in that order.
        for file_key, seq_format, suffix in (
                ('aln_fasta', 'fasta', '.fasta'),
                ('aln_sto', 'stockholm', '.sto')):
            with ntf(suffix=suffix) as handle:
                SeqIO.write(seq_list, handle, seq_format)
                # Closed before registration, presumably so the contents
                # are flushed when update_file reads the path.
                handle.close()
                refpkg.update_file(file_key, handle.name)

        logging.debug("Reference package written to %s", refpkg.path)
        yield refpkg
Beispiel #4
0
def build_index_refpkg(hrefpkg_paths,
                       sequence_file,
                       seqinfo,
                       taxonomy,
                       dest='index.refpkg',
                       **meta):
    """
    Build an index.refpkg from a set of hrefpkgs

    Arguments:

    hrefpkg_paths -- iterable of paths, each opened with
        ``Refpkg(path, create=False)``; the ``seq_info`` resource of each
        is fed to the taxonomy.
    sequence_file -- passed to ``wrap.esl_sfetch`` as the source from which
        the selected sequences are fetched.
    seqinfo -- non-empty, indexable collection of dict rows, each with a
        ``'seqname'`` key.  ``seqinfo[0]`` supplies the CSV header, so an
        empty collection raises ``IndexError``.
    taxonomy -- taxonomy object; it is deep-copied, so the caller's
        instance is not modified.
    dest -- path at which the new reference package is created.
    meta -- extra keyword arguments, each stored as a metadata key/value
        pair on the new reference package.

    Returns a tuple ``(rp, sequence_ids)``: the created ``Refpkg`` and a
    frozenset of the sequence ids kept in the pruned taxonomy.
    """

    # Clear taxonomy (on a private copy, so the caller's object is untouched)
    taxonomy = copy.deepcopy(taxonomy)
    for node in taxonomy:
        node.sequence_ids = set()

    # Add seqinfo from every hrefpkg, one package at a time
    for path in hrefpkg_paths:
        with Refpkg(path, create=False).open_resource('seq_info') as fp:
            taxonomy.populate_from_seqinfo(fp)

    # Remove lineages without sequences
    taxonomy.prune_unrepresented()

    sequence_ids = frozenset(taxonomy.subtree_sequence_ids())

    with util.ntf(prefix='aln_fasta', suffix='.fasta') as tf, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp, \
         util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp:
        # Each temporary file is closed after writing and before its path
        # is handed to the refpkg.
        wrap.esl_sfetch(sequence_file, sequence_ids, tf)
        tf.close()

        # Seqinfo file: only rows whose sequence survived in the taxonomy
        rows = (i for i in seqinfo if i['seqname'] in sequence_ids)
        w = csv.DictWriter(seq_info_fp,
                           seqinfo[0].keys(),
                           lineterminator='\n',
                           quoting=csv.QUOTE_NONNUMERIC)
        w.writeheader()
        w.writerows(rows)
        seq_info_fp.close()

        taxonomy.write_taxtable(tax_fp)
        tax_fp.close()

        # All file and metadata updates go through a single transaction.
        rp = Refpkg(dest, create=True)
        rp.start_transaction()
        rp.update_file('aln_fasta', tf.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        rp.update_file('profile', wrap.CM)

        for k, v in meta.items():
            rp.update_metadata(k, v)

        rp.commit_transaction()

    return rp, sequence_ids