Beispiel #1
0
def extensions_onto_foundation(otu_file_fh, extension_taxonomy_fh,
                               extension_seq_fh, foundation_alignment_fh,
                               ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output of
        "ghost-tree group-extensions"). This file refers to the "extension
        trees". File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.

    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an accession number (same accession numbers in
        otu_file_fh and extension_taxonomy_fh) and the 2nd column is the
        taxonomy ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina

    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.

    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.

    ghost_tree_fp : str
        Path of the output folder. It is created by this function and
        receives:
        a) The Newick formatted ghost-tree ("ghost_tree.nwk"), which is the
           final output of the ghost-tree tool. This is a phylogenetic tree
           designed for downstream diversity analyses.
        b) Accession IDs from the ghost-tree.nwk file that you can use for
           downstream analyses tools
        c) log error file (this is an optional file that you can have if you
           type '--stderr')
        The non-redundant foundation alignment
        ("nr_foundation_alignment_gt.fasta") is also written there.

    Returns
    -------
    tuple of (str, str)
        The final tree converted with ``str()`` and stripped, followed by the
        accumulated error text: whatever ``_make_foundation_tree`` returned
        plus the stderr of every per-genus ``fasttree`` run.

    Raises
    ------
    OSError
        If the scratch directory "tmp" or ``ghost_tree_fp`` already exists
        (both are created with ``os.mkdir``).
    KeyError
        If a foundation tree tip name is missing from
        ``foundation_accession_genus_dic``.

    Notes
    -----
    A scratch directory named "tmp" is created in the current working
    directory and removed again before returning, including when an error
    aborts the run.
    """
    global foundation_accession_genus_dic  # needs global assignment for flake8
    foundation_accession_genus_dic = {}

    # Probe both external tools by running them without arguments; a missing
    # binary shows up as the shell's "command not found" message on stderr.
    process = subprocess.Popen("muscle",
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    std_output, std_error = process.communicate()
    if re.search("command not found", std_error):
        print("muscle, multiple sequence aligner, is not found. Is it"
              " installed? Is it in your path?")
    process = subprocess.Popen("fasttree",
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    std_output, std_error = process.communicate()
    if re.search("command not found", std_error):
        print("fasttree, phylogenetic tree builder, is not found. Is it"
              " installed? Is it in your path?")
    # Clear the probe output only AFTER it has been inspected, so the
    # fasttree check above can actually fire while _make_foundation_tree
    # still starts from an empty error string.
    std_error = ""

    os.mkdir("tmp")
    try:
        os.mkdir(ghost_tree_fp)
        extension_genus_accession_list_dic = \
            _extension_genus_accession_dic(otu_file_fh,
                                           extension_taxonomy_fh)
        nr_alignment_fp = ghost_tree_fp + "/nr_foundation_alignment_gt.fasta"
        skbio.write(_make_nr_foundation_alignment(
            foundation_alignment_fh, extension_genus_accession_list_dic),
                    into=nr_alignment_fp,
                    format="fasta")
        foundation_tree, all_std_error = _make_foundation_tree(
            nr_alignment_fp, std_error, ghost_tree_fp)
        seqs = SequenceCollection.read(extension_seq_fh)
        for node in foundation_tree.tips():
            # str(node) is expected to look like "<name>:<rest>"; only the
            # name part is used to look up the genus.
            key_node, _ = str(node).split(":")
            key_node = foundation_accession_genus_dic[key_node]
            try:
                _make_mini_otu_files(key_node,
                                     extension_genus_accession_list_dic,
                                     seqs)
                process = subprocess.Popen(
                    "muscle -in tmp/mini_seq_gt.fasta -out"
                    " tmp/mini_alignment_gt.fasta -quiet"
                    " -maxiters 2 -diags1",
                    shell=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
                std_output, std_error = process.communicate()
                process = subprocess.Popen(
                    "fasttree -nt -quiet tmp/mini_alignment_gt.fasta >"
                    " tmp/mini_tree_gt.nwk",
                    shell=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
                std_output, std_error = process.communicate()
                all_std_error += ("FastTree warnings for genus " + key_node +
                                  " are:\n" + std_error + "\n")
                mini_tree = read("tmp/mini_tree_gt.nwk",
                                 format='newick',
                                 into=TreeNode)
                # Graft the genus-level extension tree under this tip.
                node.extend(mini_tree.root_at_midpoint().children[:])
            except Exception:
                # Best effort: a genus whose extension tree cannot be built
                # keeps its plain foundation tip. Exception (rather than a
                # bare except) lets Ctrl-C / SystemExit still stop the run.
                continue
    finally:
        # Always remove the scratch directory; a leftover "tmp" would make
        # the next run fail at os.mkdir("tmp").
        shutil.rmtree("tmp")
    with open(ghost_tree_fp + "/ghost_tree.nwk", "w") as ghost_tree_nwk:
        ghost_tree_nwk.write(str(foundation_tree))
    _make_accession_id_file(ghost_tree_fp)
    return str(foundation_tree).strip(), all_std_error
Beispiel #2
0
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output of
        "ghost-tree group-extensions"). This file refers to the "extension
        trees". File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.

    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an accession number (same accession numbers in
        otu_file_fh and extension_taxonomy_fh) and the 2nd column is the
        taxonomy ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina

    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.

    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.

    ghost_tree_fp : filehandle
        Writable file object that receives the Newick formatted ghost-tree,
        the final output of the ghost-tree tool. This is a phylogenetic tree
        designed for downstream diversity analyses. (Despite the ``_fp``
        suffix it is used via ``.write()``, not as a path.)

    Returns
    -------
    str
        The final tree converted with ``str()`` and stripped.

    Notes
    -----
    Writes "nr_foundation_alignment_gt.fasta" into the current working
    directory and uses a scratch directory "tmp" there, which is removed
    before returning, including when an error aborts the run.
    """
    global foundation_accession_genus_dic
    foundation_accession_genus_dic = {}
    # NOTE(review): seqs is published as a module global; presumably another
    # part of the module reads it -- confirm before making it local.
    global seqs
    os.system("mkdir tmp")
    try:
        extension_genus_accession_list_dic = \
            _extension_genus_accession_dic(otu_file_fh,
                                           extension_taxonomy_fh)
        skbio.write(_make_nr_foundation_alignment(
            foundation_alignment_fh, extension_genus_accession_list_dic),
                    into="nr_foundation_alignment_gt.fasta",
                    format="fasta")
        foundation_tree = _make_foundation_tree(
            "nr_foundation_alignment_gt.fasta")
        seqs = SequenceCollection.read(extension_seq_fh)
        for node in foundation_tree.tips():
            # str(node) is expected to look like "<name>:<rest>"; only the
            # name part is used to look up the genus.
            key_node, _ = str(node).split(":")
            key_node = foundation_accession_genus_dic[key_node]
            try:
                _make_mini_otu_files(key_node,
                                     extension_genus_accession_list_dic,
                                     seqs)
                os.system("muscle -in tmp/mini_seq_gt.fasta -out"
                          " tmp/mini_alignment_gt.fasta -quiet"
                          " -maxiters 2 -diags1")
                os.system("fasttree -nt -quiet tmp/mini_alignment_gt.fasta >"
                          " tmp/mini_tree_gt.nwk")
                mini_tree = read("tmp/mini_tree_gt.nwk",
                                 format='newick',
                                 into=TreeNode)
                # Graft the genus-level extension tree under this tip.
                node.extend(mini_tree.children[:])
            except Exception:
                # Best effort: a genus whose extension tree cannot be built
                # keeps its plain foundation tip. Exception (rather than a
                # bare except) lets Ctrl-C / SystemExit still stop the run.
                continue
    finally:
        # Always remove the scratch directory, even when the run aborts.
        os.system("rm -r tmp")
    ghost_tree_fp.write(str(foundation_tree))
    return str(foundation_tree).strip()
Beispiel #3
0
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output of
        "ghost-tree group-extensions"). This file refers to the "extension
        trees". File references to sequence reads or sample numbers/names are
        not valid here. This is not an OTU .biom table.

    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an accession number (same accession numbers in
        otu_file_fh and extension_taxonomy_fh) and the 2nd column is the
        taxonomy ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina

    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.

    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.

    ghost_tree_fp : filehandle
        Writable file object that receives the Newick formatted ghost-tree,
        the final output of the ghost-tree tool. This is a phylogenetic tree
        designed for downstream diversity analyses. (Despite the ``_fp``
        suffix it is used via ``.write()``, not as a path.) Its ``str()``
        form is also used to derive the log file name.

    Returns
    -------
    str
        The final tree converted with ``str()`` and stripped.

    Raises
    ------
    OSError
        If the scratch directory "tmp" already exists (``os.mkdir``).
    KeyError
        If a foundation tree tip name is missing from
        ``foundation_accession_genus_dic``.

    Notes
    -----
    Writes "nr_foundation_alignment_gt.fasta" and a
    "ghost-tree_log_<name>.txt" log into the current working directory and
    uses a scratch directory "tmp" there. The log is closed and "tmp" is
    removed even when an error aborts the run.
    """
    global foundation_accession_genus_dic  # needs global assignment for flake8
    foundation_accession_genus_dic = {}
    # NOTE(review): the log name is cut out of str(ghost_tree_fp) with fixed
    # offsets; this presumably assumes a specific file-object repr layout
    # and a 3-letter extension -- confirm against the caller.
    ghost_tree_output = str(ghost_tree_fp)
    ghost_tree_output = ghost_tree_output[16:-4]

    # Probe both external tools by running them without arguments; a missing
    # binary shows up as the shell's "command not found" message on stderr.
    process = subprocess.Popen("muscle", shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, error = process.communicate()
    if re.search("command not found", error):
        print("muscle, multiple sequence aligner, is not found. Is it"
              " installed? Is it in your path?")
    process = subprocess.Popen("fasttree", shell=True, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, error = process.communicate()
    if re.search("command not found", error):
        print("fasttree, phylogenetic tree builder, is not found. Is it"
              " installed? Is it in your path?")

    os.mkdir("tmp")
    try:
        log_fp = "ghost-tree_log_" + ghost_tree_output + ".txt"
        # The context manager guarantees the log handle is closed even if
        # one of the steps below raises.
        with open(log_fp, "w") as logfile:
            extension_genus_accession_list_dic = \
                _extension_genus_accession_dic(otu_file_fh,
                                               extension_taxonomy_fh)
            skbio.write(_make_nr_foundation_alignment(
                foundation_alignment_fh, extension_genus_accession_list_dic),
                        into="nr_foundation_alignment_gt.fasta",
                        format="fasta")
            foundation_tree = _make_foundation_tree(
                "nr_foundation_alignment_gt.fasta", logfile)
            seqs = SequenceCollection.read(extension_seq_fh)
            for node in foundation_tree.tips():
                # str(node) is expected to look like "<name>:<rest>"; only
                # the name part is used to look up the genus.
                key_node, _ = str(node).split(":")
                key_node = foundation_accession_genus_dic[key_node]
                try:
                    _make_mini_otu_files(key_node,
                                         extension_genus_accession_list_dic,
                                         seqs)
                    process = subprocess.Popen(
                        "muscle -in tmp/mini_seq_gt.fasta -out"
                        " tmp/mini_alignment_gt.fasta -quiet"
                        " -maxiters 2 -diags1",
                        shell=True,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
                    output, error = process.communicate()
                    process = subprocess.Popen(
                        "fasttree -nt -quiet tmp/mini_alignment_gt.fasta >"
                        " tmp/mini_tree_gt.nwk",
                        shell=True,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
                    output, error = process.communicate()
                    logfile.write("FastTree warnings for genus " + key_node +
                                  " are:\n" + error + "\n")
                    mini_tree = read("tmp/mini_tree_gt.nwk", format='newick',
                                     into=TreeNode)
                    # Graft the genus-level extension tree under this tip.
                    node.extend(mini_tree.root_at_midpoint().children[:])
                except Exception:
                    # Best effort: a genus whose extension tree cannot be
                    # built keeps its plain foundation tip. Exception (not a
                    # bare except) lets Ctrl-C / SystemExit stop the run.
                    continue
    finally:
        # Always remove the scratch directory; a leftover "tmp" would make
        # the next run fail at os.mkdir("tmp").
        shutil.rmtree("tmp")
    ghost_tree_fp.write(str(foundation_tree))
    return str(foundation_tree).strip()