Exemple #1
0
def create_tree(filepath='bird_phylogenic_tree.nex', num_trees=1):
    """Build one leaf-name map per tree read from a NEXUS file.

    For each processed tree, the internal nodes are visited in the order
    given by ``ageorder_node_iter(include_leaves=False, descending=True)``.
    Whenever a child of the visited node is a leaf, ``create_inner_map`` is
    called with that leaf and the list of all other leaves below the visited
    node, and the result is stored under the leaf's converted name.

    Args:
        filepath: Path of the NEXUS file to read.
        num_trees: Number of trees to process, counted from the first tree
            in the file; ``-1`` means every tree in the file.

    Returns:
        A list holding one dict per processed tree. Keys are
        ``convert_name`` applied to the string form of the leaf's taxon,
        values are whatever ``create_inner_map`` returns for that leaf.
    """
    trees = TreeList.get(path=filepath, schema="nexus")
    wanted = len(trees) if num_trees == -1 else num_trees

    result = []
    for index in range(wanted):
        tree = trees[index]
        leaf_to_relatives = {}
        internal_nodes = tree.ageorder_node_iter(include_leaves=False,
                                                 descending=True)
        for parent in internal_nodes:
            for child in parent.child_node_iter():
                if not child.is_leaf():
                    continue
                # Every leaf below the parent except the child itself.
                others = parent.leaf_nodes()
                others.remove(child)
                relatives = create_inner_map(child, others)
                leaf_to_relatives[convert_name(str(child.taxon))] = relatives
        result.append(leaf_to_relatives)
    return result
 def get_bs_trees(self, bin_name):
     """Load the RAxML bootstrap trees stored for one supergene bin.

     Reads ``<self.path>/supergenes/<bin_name>/RAxML_bootstrap.bootstrap``
     as newick (underscores preserved), passes the first tree to
     ``tree_upper`` and returns the complete tree list.
     """
     bootstrap_file = os.path.join(self.path, 'supergenes', bin_name,
                                   'RAxML_bootstrap.bootstrap')
     bootstrap_trees = TreeList.get(path=bootstrap_file,
                                    schema='newick',
                                    preserve_underscores=True)
     # Only the first tree is handed to tree_upper; its return value is ignored.
     tree_upper(bootstrap_trees[0])
     return bootstrap_trees
Exemple #3
0
import argparse
import re
import sys
from dendropy import TreeList

parser = argparse.ArgumentParser(
    description='Check which nodes have duplicated names')
parser.add_argument(
    'treefile',
    nargs='+',
    type=argparse.FileType('r'),
    help='Any number of newick-format tree files',
)

args = parser.parse_args()

# Each input file is reported on its own; only its first tree is inspected.
for f in args.treefile:
    trees = TreeList.get(file=f, schema='newick', preserve_underscores=True)
    tree = trees[0]

    # Tally the labelled internal nodes by label.
    count = {}
    for node in tree.preorder_internal_node_iter():
        if not node.label:
            continue
        count[node.label] = count.get(node.label, 0) + 1

    # Report every label used more than once and sum up their occurrences.
    tot = 0
    for name, n in count.items():
        if n <= 1:
            continue
        print("Node name '{}' duplicated {} times".format(name, n))
        tot += n
    print("Total dups for {}: {}".format(f.name, tot))
Exemple #4
0
def main():
    """Run the TreeShrink command-line workflow.

    Steps, in order:

    1. Parse the command-line options.
    2. Locate the input trees: either a single newick file, or one tree file
       per subdirectory of ``--indir`` concatenated into a temporary file.
    3. For every gene tree, run ``TreeFilter`` and record, per species, the
       largest ratio between consecutive minimum diameters.
    4. Turn those ratios into removing sets, one per quantile, using the R
       scripts under ``libdir`` according to the mode ('per-gene',
       'per-species', 'all-genes'; 'auto' picks one from the data).
    5. Write the removing sets and the pruned trees (and, when running on an
       input directory, the filtered alignments) to the output directory.

    When ``-t/--tree`` is omitted, the documented default ``input.tre`` is
    used wherever a tree file name is needed.
    """
    print("Launching " + treeshrink.PROGRAM_NAME + " version " + treeshrink.PROGRAM_VERSION)

    parser = argparse.ArgumentParser()

    parser.add_argument("-i","--indir",required=False,help="The parent input directory where the trees (and alignments) can be found")
    parser.add_argument("-t","--tree",required=False,help="The name of the input tree/trees. If the input directory is specified (see -i option), each subdirectory under it must contain a tree with this name. Otherwise, all the trees can be included in this one file. Default: input.tre")
    parser.add_argument("-a","--alignment",required=False,help="The name of the input alignment; can only be used when the input directory is specified (see -i option). Each subdirectory under it must contain an alignment with this name. Default: input.fasta")
    parser.add_argument("-c","--centroid",required=False,action='store_true',help="Do centroid reroot in preprocessing. Highly recommended for large trees. Default: NO")
    parser.add_argument("-k","--k",required=False,help="The maximum number of leaves that can be removed. Default: auto-select based on the data; see also -s")
    parser.add_argument("-s","--kscaling",required=False,help="If -k not given, we use k=min(n/a,b*sqrt(n)) by default; using this option, you can set the a,b constants; Default: '5,2'")
    parser.add_argument("-q","--quantiles",required=False,help="The quantile(s) to set threshold. Default is 0.05")
    parser.add_argument("-b","--minimpact",required=False,help="Do not remove species on the per-species test if their impact on diameter is less than MINIPACT%% where x is the given value. Default: 5")
    parser.add_argument("-m","--mode",required=False,help="Filtering mode: 'per-species', 'per-gene', 'all-genes','auto'. Default: auto")
    parser.add_argument("-o","--outdir",required=False,help="Output directory. Default: the same as input directory (if it is specified) or the same as the input trees")
    parser.add_argument("-p","--tempdir",required=False,help="Directory to keep temporary files. If specified, the temp files will be kept")
    parser.add_argument("-r","--libdir",required=False,help="Directory of the R libraries and scripts. Default: 2 layers above the treeshrink package")

    args = vars(parser.parse_args())

    # Below these sizes 'auto' mode falls back to 'all-genes'.
    MIN_OCC = 20
    MIN_TREE_NUM = 20

    libdir = args["libdir"] if args["libdir"] else dirname(dirname(realpath(treeshrink.__file__)))

    tempdir = set_tmp_dir(args["tempdir"])

    # Quantiles stay strings: they are passed to Rscript and used in file names.
    quantiles = [ q for q in args["quantiles"].split()] if args["quantiles"] else ["0.05"]

    # A percentage on the command line becomes a ratio threshold (5 -> 1.05).
    minimpact = (float(args["minimpact"])/100)+1 if args["minimpact"] else 1.05

    scaling = [int(x) for x in args["kscaling"].split(",")] if  args["kscaling"] else [5,2]

    # Resolve the tree file name once so the documented default is applied
    # consistently (previously a missing -t led to splitext(None) errors).
    tree_filename = args["tree"] if args["tree"] else "input.tre"

    if args["indir"]:
        treename = splitext(tree_filename)[0]
        # Only subdirectories that actually contain the tree file are used.
        subdirs = [d for d in listdir(args["indir"]) if exists(normpath(join(args["indir"],d,tree_filename)))]
        intrees = get_tmp_file(treename + ".trees")
        # Concatenate the per-gene tree files into one temporary file.
        with open(intrees,'w') as fout:
            for d in subdirs:
                treefile = normpath(join(args["indir"],d,tree_filename))
                if exists(treefile):
                    with open(treefile,'r') as fin:
                        fout.write(fin.read())
    else:
        intrees = tree_filename

    mode = args["mode"] if args["mode"] else 'auto'

    k = int(args["k"]) if args["k"] else None

    if args["outdir"]:
        outdir = args["outdir"]
        check_dir(outdir)
    elif args["indir"]:
        outdir = args["indir"]
    else:
        outdir = splitext(intrees)[0] + "_treeshrink"
        mkdir(outdir)

    # Check early that output can be written: create (empty) the removing-set
    # file of the first quantile in every gene directory.
    if args["indir"]:
        i = 0
        fName,ext = splitext(basename(intrees))
        for sd in subdirs:
            outfile = normpath(join(outdir,sd, fName + "_shrunk_RS_" + quantiles[i] + ".txt"))
            with open(outfile,'w') as f:
                pass

    trees = TreeList.get(path=intrees,schema='newick',preserve_underscores=True)

    if mode=='auto' and len(trees) < MIN_TREE_NUM:
        print("There are only " + str(len(trees)) + " gene trees in the dataset.")
        print("TreeShrink will run in 'All-genes' mode")
        mode='all-genes'

    # gene_list[t]: (species, ratio) pairs of tree t.
    # species_map[s]: ratios of species s across trees.
    # occ[s]: number of trees containing species s.
    # removing_sets[i][t]: species to remove from tree t for quantile i.
    gene_list = [[] for i in range(len(trees))]
    species_map = {}
    occ = {}
    removing_sets = [ [ [ ] for i in range(len(trees)) ] for j in range(len(quantiles)) ]

    for t,a_tree in enumerate(trees):
        # solve k-shrink
        a_filter = TreeFilter(ddpTree=a_tree,centroid_reroot=args["centroid"],scaling=scaling)
        a_filter.optFilter(d=k)

        # compute species feature (i.e. the max ratio associated with each species for this gene tree)
        mapping = {}
        for i in range(1,len(a_filter.min_diams)):
            if a_filter.min_diams[i] == 0:
                # A zero diameter would make the ratio below undefined.
                print("Warning: tree %d has no diameter (has only zero branch lengths) after removing %d sequences." %(t+1,i))
                break
            r = a_filter.min_diams[i-1]/a_filter.min_diams[i]
            removals = a_filter.list_removals(d=i)
            for s in removals:
                mapping[s] = r if s not in mapping else max(mapping[s],r)

        # gather per-species distributions and per-gene species features
        for s in mapping:
            if mode == 'per-species' or mode == 'auto':
                species_map.setdefault(s,[]).append(mapping[s])
            if mode == 'per-species' or mode == 'all-genes' or mode == 'auto':
                gene_list[t].append((s,mapping[s]))

        # fit kernel density to this gene's species features (per-gene mode)
        if mode == 'per-gene':
            filename = get_tmp_file("gene_%s.dat" %str(t))
            with open(filename,'w') as f:
                for s in mapping:
                    f.write(str(mapping[s]))
                    f.write("\n")
            if len(mapping) > 1:
                for i,q in enumerate(quantiles):
                    # The first 4 characters of the script output are skipped
                    # (presumably R's "[1] " print prefix -- confirm in the R script).
                    threshold = float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_loglnorm.R")),filename,q]).lstrip().rstrip()[4:])
                    for s in mapping:
                        if mapping[s] > threshold:
                            removing_sets[i][t].append(s)

        # update taxon occupancy (only for per-species mode)
        if mode == 'per-species' or mode == 'auto':
            for n in a_tree.leaf_node_iter():
                s = n.taxon.label
                occ[s] = occ.get(s,0) + 1

    if mode == 'auto' or mode == 'per-species':
        flag = False
        for s in occ:
            if occ[s] < MIN_OCC:
                print ("Species " + s + " only exists in " + str(occ[s]) + " gene trees")
                flag = True
        if flag:
            if mode == 'auto':
                mode = 'all-genes'
                print ("There are species with low occupancy in the dataset. TreeShrink will run in 'All-genes' mode")
            else:
                print ("WARNING: 'Per-species' mode was selected for a dataset having low occupancy species. Consider switching to 'All-genes' mode")
        elif mode == 'auto':
            mode = 'per-species'
            print("Finish preprocessing. TreeShrink will run in 'Per-species' mode ...    ")

    # fit kernel density to the per-species distributions and compute per-species threshold (per-species mode)
    if mode == 'per-species':
        for s in sorted(species_map):
            # Pad with ratio 1 for every tree that contains the species but
            # produced no ratio for it.
            l = len(species_map[s])
            for i in range(occ[s]-l):
                species_map[s].append(1)
            filename = get_tmp_file(s + ".dat")
            with open(filename,'w') as f:
                for v in species_map[s]:
                    f.write(str(v))
                    f.write("\n")
            thresholds = [ 0 for i in range(len(quantiles)) ]
            for i,q in enumerate(quantiles):
                # Never go below the minimum-impact ratio; the first 5
                # characters of the script output are skipped.
                thresholds[i] = max(minimpact,float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_lkernel.R")),libdir,filename,q]).lstrip().rstrip()[5:]))
                print("%s:\n\t will be cut in %d trees where its impact is above %f for quantile %s" %(s,sum(1 for x in species_map[s] if x>thresholds[i]),thresholds[i],q,))
            species_map[s] = (species_map[s],thresholds)

        for t,gene in enumerate(gene_list):
            for s,r in gene:
                for i,threshold in enumerate(species_map[s][1]):
                    if r > threshold:
                        removing_sets[i][t].append(s)

    # fit kernel density to all the species features across all genes and compute the global threshold (all-gene mode)
    if mode == 'all-genes':
        filename = get_tmp_file("all_genes" + ".dat")
        with open(filename,'w') as f:
            for gene in gene_list:
                for s,r in gene:
                    f.write(str(r))
                    f.write("\n")
        for i,q in enumerate(quantiles):
            threshold = float(check_output(["Rscript",normpath(join(libdir,"R_scripts","find_threshold_lkernel.R")),libdir,filename,q]).lstrip().rstrip()[5:])
            for t,gene in enumerate(gene_list):
                for s,r in gene:
                    if r > threshold:
                        removing_sets[i][t].append(s)

    print("Writing output ...\n")
    # Dendropy's filter_leaf_nodes() seems to have problem
    # i.e. it produces the trees that the treecmp tool cannot compute the MS distance (need further exploration)
    # use home-made code to prune the tree instead

    fName,ext = splitext(basename(intrees))

    for i,RS in enumerate(removing_sets):
        # Prune a fresh copy for every quantile so the loaded trees stay intact.
        trees_shrunk = deepcopy(trees)

        if args["indir"] is None:
            # Single-file input: one removing-set file (one line per tree)
            # and one file with all pruned trees.
            outfile = normpath(join(outdir,fName + "_RS_" + quantiles[i] + ".txt"))
            with open(outfile,'w') as f:
                for item in RS:
                    for s in item:
                        f.write(s + "\t")
                    f.write("\n")
            for tree,rs in zip(trees_shrunk,RS):
                prune_tree(tree,rs)
            trees_shrunk.write_to_path(normpath(join(outdir,fName + "_" + quantiles[i] + ext)),'newick')
        else:
            # Directory input: outputs go into the matching gene subdirectory.
            for sd,item in zip(subdirs,RS):
                outfile = normpath(join(outdir,sd, fName + "_shrunk_RS_" + quantiles[i] + ".txt"))
                with open(outfile,'w') as f:
                    for s in item:
                        f.write(s + "\t")
            for sd,tree,rs in zip(subdirs,trees_shrunk,RS):
                # Leaf labels before pruning, compared with the alignment names below.
                L = set(x.taxon.label for x in tree.leaf_node_iter())
                prune_tree(tree,rs)
                treeName,treeExt = splitext(tree_filename)
                treefile = normpath(join(outdir,sd, treeName + "_shrunk_" + quantiles[i] + treeExt))
                tree.write_to_path(treefile,'newick',unquoted_underscores=True,real_value_format_specifier=".16g")

                aln_filename = args["alignment"] if args["alignment"] else "input.fasta"
                alnName,alnExt = splitext(aln_filename)
                input_aln = normpath(join(args["indir"],sd,aln_filename))
                if isfile(input_aln):
                    output_aln = normpath(join(outdir,sd,alnName+"_shrunk"+quantiles[i]+alnExt))
                    alg = CompactAlignment()
                    alg.read_file_object(input_aln,'fasta')
                    S=set(alg.keys())
                    if L.difference(S) or S.difference(L):
                        print("ERROR: For gene %s, alignment names don't match tree names. Will skip it.\n\tonly in tree:\t%s\n\tonly in alignment:\t%s"%(sd,str(L.difference(S)),str(S.difference(L))))
                    else:
                        alg.remove_all(rs)
                        alg.mask_gapy_sites(1)
                        alg.write(output_aln,'fasta')

    # Temporary files are kept only when the user named a temp directory.
    if not args["tempdir"]:
        rmtree(tempdir)

    print("Output files written to " + outdir)
Exemple #5
0
def main():
    """Run the TreeShrink command-line workflow on one file of gene trees.

    Steps, in order:

    1. Parse the command-line options and create the output and temporary
       directories.
    2. For every gene tree, run ``TreeFilter`` and record, per species, the
       largest ratio between consecutive minimum diameters.
    3. Turn those ratios into removing sets, one per quantile, using the R
       scripts located next to this file, according to the mode
       ('per-gene', 'per-species', 'all-genes'; 'auto' picks one from the
       data).
    4. Write, for every quantile, the removing sets and the pruned trees to
       the output directory.

    A tree whose diameter becomes zero after some removals stops
    contributing ratios from that point on (with a warning) instead of
    aborting the run with a division by zero.
    """
    import treeshrink
    from treeshrink.optimal_filter_lib import TreeFilter
    from treeshrink.tree_lib import prune_tree
    from subprocess import check_output
    import argparse
    from dendropy import TreeList
    from os.path import basename, dirname, splitext, realpath, join, normpath
    from os import mkdir
    from copy import deepcopy
    from tempfile import mkdtemp
    from shutil import rmtree

    print("Launching " + treeshrink.PROGRAM_NAME + " version " +
          treeshrink.PROGRAM_VERSION)

    parser = argparse.ArgumentParser()

    parser.add_argument("-i", "--input", required=True, help="Input trees")
    parser.add_argument(
        "-d",
        "--outdir",
        required=False,
        help="Output directory. Default: inferred from the input trees")
    parser.add_argument(
        "-t",
        "--tempdir",
        required=False,
        help=
        "Directory to keep temporary files. If specified, the temp files will be kept"
    )
    parser.add_argument(
        "-o",
        "--output",
        required=False,
        help=
        "The name of the output trees. Default: inferred from the input trees")
    parser.add_argument(
        "-c",
        "--centroid",
        required=False,
        action='store_true',
        help=
        "Do centroid reroot in preprocessing. Highly recommended for large trees. Default: NO"
    )
    parser.add_argument(
        "-k",
        "--k",
        required=False,
        help=
        "The maximum number of leaves that can be removed. Default: auto-select based on the data"
    )
    parser.add_argument(
        "-q",
        "--quantiles",
        required=False,
        help="The quantile(s) to set threshold. Default is 0.05")
    parser.add_argument(
        "-m",
        "--mode",
        required=False,
        help=
        "Filtering mode: 'per-species', 'per-gene', 'all-genes','auto'. Default: auto"
    )

    # The R scripts are looked up relative to this file.
    wdir = dirname(realpath(__file__))

    args = vars(parser.parse_args())

    # Below these sizes 'auto' mode falls back to 'all-genes'.
    MIN_OCC = 20
    MIN_TREE_NUM = 20

    # Quantiles stay strings: they are passed to Rscript and used in file names.
    quantiles = [q for q in args["quantiles"].split()
                 ] if args["quantiles"] else ["0.05"]

    intrees = args["input"]
    treeName, treeExt = splitext(basename(intrees))
    outtrees = args["output"] if args[
        "output"] else treeName + "_shrunk" + treeExt

    mode = args["mode"] if args["mode"] else 'auto'

    k = int(args["k"]) if args["k"] else None

    outdir = args["outdir"] if args["outdir"] else splitext(
        intrees)[0] + "_treeshrink"
    mkdir(outdir)
    if args["tempdir"]:
        tempdir = args["tempdir"]
        mkdir(tempdir)
    else:
        tempdir = mkdtemp()

    trees = TreeList.get(path=intrees,
                         schema='newick',
                         preserve_underscores=True)
    if mode == 'auto' and len(trees) < MIN_TREE_NUM:
        print("There are only " + str(len(trees)) +
              " gene trees in the dataset.")
        print("TreeShrink will run in 'All-genes' mode")
        mode = 'all-genes'

    # gene_list[t]: (species, ratio) pairs of tree t.
    # species_map[s]: ratios of species s across trees.
    # occ[s]: number of trees containing species s.
    # removing_sets[i][t]: species to remove from tree t for quantile i.
    gene_list = [[] for i in range(len(trees))]
    species_map = {}
    occ = {}
    removing_sets = [[[] for i in range(len(trees))]
                     for j in range(len(quantiles))]

    for t, a_tree in enumerate(trees):
        # solve k-shrink
        a_filter = TreeFilter(ddpTree=a_tree, centroid_reroot=args["centroid"])
        a_filter.optFilter(d=k)

        # compute species feature (i.e. the max ratio associated with each species for this gene tree)
        mapping = {}
        for i in range(1, len(a_filter.min_diams)):
            if a_filter.min_diams[i] == 0:
                # A zero diameter would make the ratio below a division by zero.
                print("Warning: tree %d has no diameter (has only zero branch lengths) after removing %d sequences." % (t + 1, i))
                break
            r = a_filter.min_diams[i - 1] / a_filter.min_diams[i]
            removals = a_filter.list_removals(d=i)
            for s in removals:
                mapping[s] = r if s not in mapping else max(mapping[s], r)

        # gather per-species distributions and per-gene species features
        for s in mapping:
            if mode == 'per-species' or mode == 'auto':
                species_map.setdefault(s, []).append(mapping[s])
            if mode == 'per-species' or mode == 'all-genes' or mode == 'auto':
                gene_list[t].append((s, mapping[s]))

        # fit kernel density to this gene's species features (per-gene mode)
        if mode == 'per-gene':
            filename = normpath(join(tempdir, "gene_" + str(t) + ".dat"))
            with open(filename, 'w') as f:
                for s in mapping:
                    f.write(str(mapping[s]))
                    f.write("\n")
            if len(mapping) > 1:
                for i, q in enumerate(quantiles):
                    # The first 4 characters of the script output are skipped
                    # (presumably R's "[1] " print prefix -- confirm in the R script).
                    threshold = float(
                        check_output([
                            "Rscript",
                            normpath(
                                join(wdir, "R_scripts",
                                     "find_threshold_loglnorm.R")), filename, q
                        ]).lstrip().rstrip()[4:])
                    for s in mapping:
                        if mapping[s] > threshold:
                            removing_sets[i][t].append(s)

        # update taxon occupancy (only for per-species mode)
        if mode == 'per-species' or mode == 'auto':
            for n in a_tree.leaf_node_iter():
                s = n.taxon.label
                occ[s] = occ.get(s, 0) + 1

    if mode == 'auto' or mode == 'per-species':
        flag = False
        for s in occ:
            if occ[s] < MIN_OCC:
                print("Species " + s + " only exists in " + str(occ[s]) +
                      " gene trees")
                flag = True
        if flag:
            if mode == 'auto':
                mode = 'all-genes'
                print(
                    "There are species with low occupancy in the dataset. TreeShrink will run in 'All-genes' mode"
                )
            else:
                print(
                    "WARNING: 'Per-species' mode was selected for a dataset having low occupancy species. Consider switching to 'All-genes' mode"
                )
        elif mode == 'auto':
            mode = 'per-species'
            print(
                "Finish preprocessing. TreeShrink will run in 'Per-species' mode"
            )

    # fit kernel density to the per-species distributions and compute per-species threshold (per-species mode)
    if mode == 'per-species':
        for s in species_map:
            # Pad with ratio 1 for every tree that contains the species but
            # produced no ratio for it.
            l = len(species_map[s])
            for i in range(occ[s] - l):
                species_map[s].append(1)
            filename = normpath(join(tempdir, s + ".dat"))
            with open(filename, 'w') as f:
                for v in species_map[s]:
                    f.write(str(v))
                    f.write("\n")
            thresholds = [0 for i in range(len(quantiles))]
            for i, q in enumerate(quantiles):
                # The first 5 characters of the script output are skipped.
                thresholds[i] = float(
                    check_output([
                        "Rscript",
                        normpath(
                            join(wdir, "R_scripts",
                                 "find_threshold_lkernel.R")), wdir, filename,
                        q
                    ]).lstrip().rstrip()[5:])
            # Replacing the value of an existing key is safe while iterating.
            species_map[s] = (species_map[s], thresholds)

        for t, gene in enumerate(gene_list):
            for s, r in gene:
                for i, threshold in enumerate(species_map[s][1]):
                    if r > threshold:
                        removing_sets[i][t].append(s)

    # fit kernel density to all the species features across all genes and compute the global threshold (all-gene mode)
    if mode == 'all-genes':
        filename = normpath(join(tempdir, "all_genes" + ".dat"))
        with open(filename, 'w') as f:
            for gene in gene_list:
                for s, r in gene:
                    f.write(str(r))
                    f.write("\n")
        for i, q in enumerate(quantiles):
            threshold = float(
                check_output([
                    "Rscript",
                    normpath(
                        join(wdir, "R_scripts", "find_threshold_lkernel.R")),
                    wdir, filename, q
                ]).lstrip().rstrip()[5:])
            for t, gene in enumerate(gene_list):
                for s, r in gene:
                    if r > threshold:
                        removing_sets[i][t].append(s)

    # Dendropy's filter_leaf_nodes() seems to have problem
    # i.e. it produces the trees that the treecmp tool cannot compute the MS distance (need further exploration)
    # use home-made code to prune the tree instead

    # Both output file names are derived from the output tree name.
    fName, ext = splitext(outtrees)
    for i, RS in enumerate(removing_sets):
        # Prune a fresh copy for every quantile so the loaded trees stay intact.
        trees_shrunk = deepcopy(trees)
        outfile = normpath(join(outdir,
                                fName + "_RS_" + quantiles[i] + ".txt"))
        # One line per tree, species separated (and terminated) by tabs.
        with open(outfile, 'w') as f:
            for item in RS:
                for s in item:
                    f.write(s + "\t")
                f.write("\n")
        for tree, rs in zip(trees_shrunk, RS):
            prune_tree(tree, rs)
        trees_shrunk.write_to_path(
            normpath(join(outdir, fName + "_" + quantiles[i] + ext)),
            'newick')

    # Temporary files are kept only when the user named a temp directory.
    if not args["tempdir"]:
        rmtree(tempdir)

    print("Output files written to " + outdir)
Exemple #6
0
#! /usr/bin/env python

from dendropy import TreeList
from sys import argv
from tree_lib import compute_diameter

# The newick file to read is given as the first command-line argument.
infile = argv[1]

# Load every tree in the file and hand the whole list to compute_diameter.
treelist = TreeList.get(schema="newick", path=infile)
compute_diameter(treelist)
def main(args):
    """Run ARGweaver on a tree-sequence file and check the SMC -> ARG -> ts conversion.

    Writes an ARGweaver ``.sites`` file for ``args.trees_file``, runs the
    ARGweaver sampler for a fixed number of iterations, then converts the
    resulting ``.smc.gz`` file twice: directly to nexus, and via node/edge
    text files to a tree sequence that is written as nexus as well. The two
    nexus files are finally compared tree by tree.

    Args:
        args: Object with the attributes read below: ``outputdir``,
            ``trees_file``, ``ARGweaver_executable_dir``,
            ``ARGweaver_sample_executable``,
            ``ARGweaver_smc2arg_executable``, ``effective_population_size``,
            ``recombination_rate``, ``mutation_rate`` and ``random_seed``.

    Raises:
        AssertionError: if the ``.sites`` file is empty, the expected
            ``.smc.gz`` file does not exist, or the number or labels of the
            trees in the two nexus files differ.
        Exception: if a pair of corresponding trees has a non-zero
            symmetric difference.
    """
    import os
    import itertools
    import subprocess
    from dendropy import TreeList
    from dendropy.calculate import treecompare
    import ts_extras

    def ts_txts_to_trees(ts_nodes, ts_edges, trees_outname=None):
        """Load node/edge text files, simplify, and optionally dump the result.

        On a load failure the two text files are copied to 'bad.nodes' and
        'bad.edges'; on a simplify failure the loaded tree sequence is
        dumped to 'bad.trees'. In both cases the error is re-raised.
        """
        import shutil
        import msprime
        logging.info("== Converting new ts ARG to .trees ===")
        try:
            loaded = msprime.load_text(nodes=ts_nodes, edges=ts_edges)
        except:
            logging.warning(
                "Can't load the texts file properly. Saved copied to 'bad.nodes' & 'bad.edges' for inspection"
            )
            for handle, copy_name in ((ts_nodes, "bad.nodes"),
                                      (ts_edges, "bad.edges")):
                shutil.copyfile(handle.name, copy_name)
            raise
        logging.info("== loaded {}, {}===".format(ts_nodes.name,
                                                  ts_edges.name))
        try:
            simplified = loaded.simplify()
        except:
            loaded.dump("bad.trees")
            logging.warning(
                "Can't simplify. .trees file dumped to 'bad.trees'")
            raise
        if trees_outname:
            simplified.dump(trees_outname)
        return simplified

    # Make write_nexus_trees available as a method on tree sequences.
    msprime.TreeSequence.write_nexus_trees = ts_extras.write_nexus_trees
    iterations = 20
    input_stem = os.path.splitext(os.path.basename(args.trees_file))[0]
    full_prefix = os.path.join(args.outputdir, input_stem)
    with open(full_prefix + ".sites", "w+") as aw_in:
        tsfile_to_ARGweaver_in(args.trees_file, aw_in)
        sampler = os.path.join(args.ARGweaver_executable_dir,
                               args.ARGweaver_sample_executable)
        cmd = [
            sampler,
            '--sites', aw_in.name,
            '--popsize', str(args.effective_population_size),
            '--recombrate', str(args.recombination_rate),
            '--mutrate', str(args.mutation_rate),
            '--overwrite',
            '--randseed', str(int(args.random_seed)),
            '--iters', str(iterations),
            '--sample-step', str(iterations),
            '--output', full_prefix,
        ]
        assert os.stat(aw_in.name).st_size > 0, "Initial .sites file is empty"
        logging.debug("running '{}'".format(" ".join(cmd)))
        subprocess.call(cmd)

        # now check that the smc file produced can be converted to nodes
        smc = full_prefix + "." + str(iterations) + ".smc.gz"
        assert os.path.isfile(smc), "No output file names {}".format(smc)

        # Direct conversion: smc -> nexus.
        smc_nex = smc.replace(".smc.gz", ".nex")
        with open(smc_nex, "w+") as smc_nex_out:
            ARGweaver_smc_to_nexus(smc, smc_nex_out)

        # Indirect conversion: smc -> node/edge text files -> tree sequence -> nexus.
        arg_nex = smc.replace(".smc.gz", ".ts_nex")
        nodes_path = smc.replace(".smc.gz", ".TSnodes")
        edges_path = smc.replace(".smc.gz", ".TSedges")
        with open(nodes_path, "w+") as nodes, \
            open(edges_path, "w+") as edges, \
            open(arg_nex, "w+") as ts_nex:
            smc2arg = os.path.join(args.ARGweaver_executable_dir,
                                   args.ARGweaver_smc2arg_executable)
            ARGweaver_smc_to_ts_txts(smc2arg, smc.replace(".smc.gz", ""),
                                     nodes, edges)

            ts = ts_txts_to_trees(nodes, edges)
            ts.write_nexus_trees(ts_nex)

        # Both lists share one taxon namespace so the trees can be compared.
        smc_trees = TreeList.get(path=smc_nex, schema="nexus")
        arg_trees = TreeList.get(path=arg_nex,
                                 schema="nexus",
                                 taxon_namespace=smc_trees[0].taxon_namespace)
        #zero_based_tip_numbers assumed False)
        #Check the smc trees against the ts-imported equivalents
        #NB, the ARGweaver output does not specify where mutations occur on the ARG, so we cannot
        #reconstruct the sequences implied by this ARG for testing purposes, and thus cannot compare
        #the original sequences with the reconstructed ones

        assert len(smc_trees) == len(arg_trees)
        smc_labels = [int(float(t.label)) for t in smc_trees]
        arg_labels = [int(float(t.label)) for t in arg_trees]
        assert smc_labels == arg_labels

        # Topology-only newick output used in the mismatch message.
        newick_opts = dict(schema="newick",
                           suppress_edge_lengths=True,
                           suppress_internal_node_labels=True,
                           suppress_rooting=True)
        for i, (smc_tree, arg_tree) in enumerate(zip(smc_trees, arg_trees)):
            if treecompare.symmetric_difference(smc_tree, arg_tree) != 0:
                raise Exception(
                    "Tree {} differs\n".format(i + 1) +
                    smc_tree.label + " (smc) = " +
                    smc_tree.as_string(**newick_opts) +
                    arg_tree.label + " (arg) = " +
                    arg_tree.as_string(**newick_opts))
            print(
                "✓ Tree " + str(i + 1) +
                " in AW SMC file is identical to that produced by SMC->ARG->STS"
            )
Exemple #8
0
import sys
from warnings import warn
from dendropy import TreeList
from collections import OrderedDict

parser = argparse.ArgumentParser(
    description='Add genus names to nodes on the tree, for each monophyletic genus',
)
parser.add_argument(
    'treefile',
    type=argparse.FileType('r'),
    help='A newick-format tree',
)

args = parser.parse_args()

trees = TreeList.get(
    file=args.treefile,
    schema='newick',
    preserve_underscores=True,
    rooting='default-rooted',
)

# Only the first tree of the file is used.
tree = trees[0]

# Count how often each internal-node label occurs, after stripping a
# trailing "_<digits>_" suffix and lower-casing the label.
count = {}
for node in tree.preorder_internal_node_iter():
    if not node.label:
        continue
    nl = re.sub(r'_\d+_$', '', node.label).lower()
    count[nl] = count.get(nl, 0) + 1

# Every name that occurs more than once is mapped to 0.
dups = dict.fromkeys((name for name, n in count.items() if n > 1), 0)

#collect a list of genus names
#!/opt/local/bin/python

### Imports ###
import dendropy
from dendropy import TreeList,Tree
import sys
import argparse
from os import walk
import glob


### Main ###

### Argparse
parser = argparse.ArgumentParser(
    description="Reads a newick trees and reroots it with a basal trifurcation",
    prog="strictunroot.py",
)
parser.add_argument("-i", required=True, type=str,
                    help="Input newick tree name")
parser.add_argument("-o", required=True, type=str, help="Output file name")
args = parser.parse_args()

### Main
# Read all trees from the input file.
itrees = TreeList.get(
    path=args.i,
    schema="newick",
    rooting="default-rooted",
    preserve_underscores=True,
)

# Collapse the basal bifurcation of each tree and collect it for output.
otrees = TreeList()
for tree in itrees:
    tree.collapse_basal_bifurcation()
    otrees.append(tree)

otrees.write(
    path=args.o,
    schema="newick",
    unquoted_underscores=True,
    suppress_rooting=True,
)
print("Done!")
Exemple #10
0
                    metavar="input")
parser.add_argument(
    "-c",
    type=str,
    metavar="constrain",
    help="Tree to constrain the search ala RAxML's -g",
)
parser.add_argument("-o", metavar="output", type=str, help="Output file name")
#parser.add_argument("-s",type=int,help="Random number generator seed",metavar="seed")
args = parser.parse_args()

### Random number machinery initialization (currently disabled)
#if args.s:
#	seed=args.s
#else:
#	seed=random.randint(0,sys.maxint)

#random.seed(seed)
#print("Seed: %d" % seed)

### Input trees
# The gene trees are read as unrooted; the constraint tree uses default rooting.
gene_trees = TreeList.get(
    path=args.i,
    schema="newick",
    rooting="force-unrooted",
)
constrainTree = Tree.get(path=args.c, schema="newick")

# NOTE(review): constrained_consensus is not defined in this file; its
# semantics should be checked against the TreeList implementation in use.
consensus = gene_trees.constrained_consensus(
    constrainTree=constrainTree,
    summarize_splits=False,
    min_freq=0,
)

# Write the consensus tree without rooting information.
consensus.write(path=args.o, schema="newick", suppress_rooting=True)
print("Done!")
Exemple #11
0
'''Label all unnamed nodes with an underscore + number.
'''

import argparse
import re
import sys
from warnings import warn
from dendropy import TreeList
from collections import OrderedDict

parser = argparse.ArgumentParser(
    description='Add genus names to nodes on the tree, for each monophyletic genus')
parser.add_argument(
    'treefile',
    type=argparse.FileType('r'),
    help='A newick-format tree')

args = parser.parse_args()

trees = TreeList.get(
    file=args.treefile,
    schema='newick',
    preserve_underscores=True,
    rooting='default-rooted')

# Only the first tree of the file is used.
tree = trees[0]

# Count how often each internal-node label occurs, after stripping a
# trailing "_<digits>_" suffix and lower-casing the label.
count = {}
for node in tree.preorder_internal_node_iter():
    if not node.label:
        continue
    nl = re.sub(r'_\d+_$', '', node.label).lower()
    count[nl] = count.get(nl, 0) + 1

# Every name that occurs more than once is mapped to 0.
dups = dict.fromkeys((name for name, n in count.items() if n > 1), 0)

# Filled in by code that follows this section.
genera = OrderedDict()
Exemple #12
0
from os import walk
import glob

### Main ###

### Argparse
parser = argparse.ArgumentParser(
    prog="strictunroot.py",
    description="Reads a newick trees and reroots it with a basal trifurcation",
)
parser.add_argument(
    "-i", required=True, type=str, help="Input newick tree name")
parser.add_argument(
    "-o", required=True, type=str, help="Output file name")
args = parser.parse_args()

### Main
# Read all trees from the input file.
itrees = TreeList.get(
    path=args.i,
    schema="newick",
    preserve_underscores=True,
    rooting="default-rooted",
)

# Collapse the basal bifurcation of each tree and collect it for output.
otrees = TreeList()
for tree in itrees:
    tree.collapse_basal_bifurcation()
    otrees.append(tree)

otrees.write(
    path=args.o,
    schema="newick",
    suppress_rooting=True,
    unquoted_underscores=True,
)
print("Done!")