Example #1
# Imports assumed by this snippet (DendroPy 4 module paths):
import os
import sys

from dendropy import TreeList
from dendropy.dataio.nexusreader import NexusReader
from dendropy.dataio.tokenizer import Tokenizer
from dendropy.utility.error import DataParseError


def dendropy_read_treefile(treefiles, quiet=False, preserve_underscores=False, **kwargs):
    out_stream = kwargs.pop('writer', sys.stderr)
    intrees = TreeList()
    if not treefiles:
        if not quiet:
            sys.stderr.write('NOTE: reading trees from stdin\n')
        trees = sys.stdin.read()
        # try two input formats
        try:
            intrees.extend(TreeList.get_from_string(trees, "nexus", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))
        except (DataParseError, NexusReader.NotNexusFileError) as e:
            if not quiet:
                sys.stderr.write('%s\n' % e)
            try:
                intrees.extend(TreeList.get_from_string(trees, "newick", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))
            except (DataParseError, Tokenizer.UnexpectedEndOfStreamError, AttributeError) as e:
                sys.stderr.write('%s\n' % e)
                sys.exit('Could not read trees from stdin in nexus or newick format ...\n')
    else:
        for tf in treefiles:
            if not os.path.isfile(tf):
                out_stream.write('Tree file %s does not exist\n' % tf)
                sys.exit(1)

            #try two input formats
            try:
                if not quiet:
                    out_stream.write('Reading file %s in nexus format ...\n' % tf)
                intrees.extend(TreeList.get_from_path(tf, "nexus", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))

            #except (DataParseError, dendropy.dataio.nexusreader.NotNexusFileError) as e:
            except (DataParseError, NexusReader.NotNexusFileError, AttributeError) as e:
                try:
                    if not quiet:
                        out_stream.write('Reading file %s in newick format ...\n' % tf)
                    intrees.extend(TreeList.get_from_path(tf, "newick", case_sensitive_taxon_labels=True, preserve_underscores=preserve_underscores, **kwargs))
                except (DataParseError, Tokenizer.UnexpectedEndOfStreamError, AttributeError) as e:
                    if not quiet:
                        sys.stderr.write('%s\n' % e)
                    sys.exit('Could not read file %s in nexus or newick format ...\n' % tf)
    return intrees
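
A minimal usage sketch for the function above; the file names here are hypothetical:

trees = dendropy_read_treefile(["run1.trees", "run2.nex"], preserve_underscores=True)  # hypothetical input files
print("Read %d trees" % len(trees))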
Example #2
                    required=True,
                    help="Sampling time")
parser.add_argument("-r", "--rootAge", required=False, help="Root age")
parser.add_argument("-t",
                    "--timeTree",
                    required=True,
                    help="The output trees with branch lengths in time unit")
parser.add_argument("-c",
                    "--composite",
                    required=False,
                    action='store_true',
                    help="Do composite optimization. Default: NO")

args = vars(parser.parse_args())

myTrees = TreeList.get_from_path(args["input"], 'newick')
smpl_times = {}
rootAge = float(args["rootAge"]) if args["rootAge"] else None

with open(args["samplingTime"], "r") as fin:
    fin.readline()
    for line in fin:
        name, time = line.split()
        smpl_times[name] = float(time)

for tree in myTrees:
    if args["composite"]:
        s = calibrate_composite_opt(tree, smpl_times, root_age=rootAge)
    else:
        s = calibrate_log_opt(tree,
                              smpl_times,
Example #3
# Imports assumed by this snippet (standard library plus DendroPy 3-era modules):
import os
import random
import sys
import tempfile

from dendropy import TreeList, treesim, treesplit


def do_sim(birth_rate, death_rate, num_leaves, rng=None):
    temp_dir = tempfile.mkdtemp()
    model_tree = treesim.birth_death(birth_rate=birth_rate,
                            death_rate=death_rate,
                            ntax=num_leaves,
                            rng=rng)
    ################################################################################
    # Calling seq-gen
    mtf = os.path.join(temp_dir, 'simtree')
    print "temp_dir =", temp_dir
    treefile_obj = open(mtf, 'w')
    treefile_obj.write("%s;\n" % str(model_tree))
    # CLOSING THE FILE IS IMPORTANT!  This flushes buffers, assuring that the data
    #  will be written to the filesystem before seq-gen is invoked.
    treefile_obj.close() 
    
    
    import subprocess
    command_line = ['seq-gen',
                    '-mHKY',
                    '-on',
                ]
    if os.environ.get('TREE_INF_TEST_RAND_NUMBER_SEED'):
        # the original referenced an undefined `seed`; use the environment value instead
        sg_seed = int(os.environ['TREE_INF_TEST_RAND_NUMBER_SEED'])
        
    else:
        if rng is None:
            sg_seed = random.randint(0,100000)
        else:
            sg_seed = rng.randint(0,100000)
    command_line.append('-z%d' % sg_seed)
    command_line.append('simtree')
    
    # universal_newlines=True makes communicate() return text rather than bytes
    seq_gen_proc = subprocess.Popen(command_line,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    universal_newlines=True,
                                    cwd=temp_dir)
    
    dataset = seq_gen_proc.communicate()[0]
    
    
    # seq-gen does not exit with an error code when it fails.  I don't know why!!
    if seq_gen_proc.returncode != 0 or len(dataset) == 0:
        sys.exit('seq-gen failed!\n')
    sd = os.path.join(temp_dir, 'simdata.nex')
    d = open(sd, 'w')
    d.write(dataset)
    # CLOSING THE FILE IS IMPORTANT!  This flushes buffers, assuring that the data
    #  will be written to the filesystem before PAUP is invoked.
    d.close()
    
    ################################################################################
    # PAUP
    pcf = os.path.join(temp_dir, 'execute_paup.nex')
    pc = open(pcf, 'w')
    pc.write('''execute simdata.nex ; 
    hsearch nomultrees ; 
    savetree file=inferred.tre format = NEXUS;
    quit;
    ''')
    pc.close()
    paup_proc = subprocess.Popen(['paup', '-n', pcf],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True,
                                 cwd=temp_dir)
    (o, e) = paup_proc.communicate()
    
    paup_output = os.path.join(temp_dir, 'inferred.tre')
    # like seq-gen, PAUP may not exit with an error code when it fails, so also check for the output file
    if paup_proc.returncode != 0 or not os.path.exists(paup_output):
        sys.exit(e)
    
    
    # read the inferred tree, binding it to the true tree's taxon set (the file is nexus)
    inf_tree_list = TreeList.get_from_path(paup_output, 
                                           "NEXUS",
                                           taxon_set=model_tree.taxon_set)
    assert len(inf_tree_list) == 1
    inferred_tree = inf_tree_list[0]
    
    # determine which splits were missed
    treesplit.encode_splits(inferred_tree)
    treesplit.encode_splits(model_tree)
    missing = model_tree.find_missing_splits(inferred_tree)
    # sort the nodes of the true tree by depth and ask whether or not they were recovered
    node_depth_TF_list = []
    for node in model_tree.postorder_node_iter():
        children = node.child_nodes()
        if children and node.parent_node:
            first_child = children[0]
            node.depth = first_child.depth + first_child.edge.length
            if node.edge.split_bitmask in missing:
                recovered = 0
            else:
                recovered = 1
            node_depth_TF_list.append((node.depth, node.edge.length, recovered))
        else:
            node.depth = 0.0
    
    node_depth_TF_list.sort()
    
    os.remove(pcf)
    os.remove(paup_output)
    os.remove(sd)
    os.remove(mtf)
    os.rmdir(temp_dir)
    
    return node_depth_TF_list
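
A minimal, hypothetical driver for do_sim above; it assumes seq-gen and PAUP* ('paup') are available on the PATH:

import random

rng = random.Random(42)  # fixed seed so the run is repeatable
node_depth_TF_list = do_sim(birth_rate=1.0, death_rate=0.5, num_leaves=20, rng=rng)
for depth, edge_len, recovered in node_depth_TF_list:
    print(depth, edge_len, recovered)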
Example #4
mode = args["mode"] if args["mode"] else 'per-species'

print(mode)

k = int(args["k"]) if args["k"] else None

outdir = args["outdir"] if args["outdir"] else splitext(
    intrees)[0] + "_kshrink"
mkdir(outdir)
if args["tempdir"]:
    tempdir = args["tempdir"]
    mkdir(tempdir)
else:
    tempdir = check_output(["mktemp", "-d"]).rstrip()

trees = TreeList.get_from_path(intrees, 'newick', preserve_underscores=True)
gene_list = [[] for i in range(len(trees))]
species_map = {}
occ = {}
removing_sets = [[[] for i in range(len(trees))]
                 for j in range(len(quantiles))]

for t, a_tree in enumerate(trees):
    # solve k-shrink
    a_filter = TreeFilter(ddpTree=a_tree, centroid_reroot=args["centroid"])
    a_filter.optFilter(d=k)

    # compute species feature (i.e. the max ratio associated with each species for this gene tree)
    mapping = {}
    for i in range(1, len(a_filter.min_diams)):
        r = a_filter.min_diams[i - 1] / a_filter.min_diams[i]
Example #5
import dendropy
from dendropy import TreeList,Taxon,Node
import sys
import argparse

parser = argparse.ArgumentParser(description="Parses a Newick tree file, modifying the branch lengths from number of generations to years and adding an outgroup")
parser.add_argument("-gt",type=float,default=0,required=False,help="Generation time")
parser.add_argument("-od",type=float,default=0,required=False,help="Outgroup branch length")
parser.add_argument("-i",type=str,default="infile.tree",required=True,help="Input Newick tree file")
parser.add_argument("-o",type=str,default="outtree.tree",required=False,help="Output Newick tree file")
args = parser.parse_args()

trees=TreeList.get_from_path(args.i,schema="newick",rooting="force-rooted")
if args.gt != 0:
	print "Scaling branch lengths to time with generation time %d\n" % args.gt
	for tree in trees:
		for edge in tree.preorder_edge_iter():
			#print "DEBUG: %s" % edge.length
			if edge.length != None:
				edge.length=edge.length/args.gt

if args.od != 0:
	print "Adding outgroup with branch length %d\n" % args.od
	namespace=trees.taxon_namespace
	outgroup= Taxon("outgroup")
	namespace.add_taxon(outgroup)
	ntree=0
	labels=namespace.labels()
	labels.remove("outgroup")
	for tree in trees:
		outgroup_node=Node(taxon=outgroup,edge_length=args.od)
Example #6
def main():
    import treeshrink
    from treeshrink.optimal_filter_lib import TreeFilter
    from treeshrink.tree_lib import prune_tree
    from sys import argv, stdout
    from math import sqrt
    from subprocess import check_output, call
    import argparse
    from dendropy import Tree, TreeList
    from os.path import basename, dirname, splitext, realpath, join, normpath
    from os import mkdir, getcwd, rmdir
    from copy import deepcopy
    from tempfile import mkdtemp
    from shutil import rmtree
    import dendropy

    print("Launching " + treeshrink.PROGRAM_NAME + " version " +
          treeshrink.PROGRAM_VERSION)

    parser = argparse.ArgumentParser()

    parser.add_argument("-i", "--input", required=True, help="Input trees")
    parser.add_argument(
        "-d",
        "--outdir",
        required=False,
        help="Output directory. Default: inferred from the input trees")
    parser.add_argument(
        "-t",
        "--tempdir",
        required=False,
        help=
        "Directory to keep temporary files. If specified, the temp files will be kept"
    )
    parser.add_argument(
        "-o",
        "--output",
        required=False,
        help=
        "The name of the output trees. Default: inferred from the input trees")
    parser.add_argument(
        "-c",
        "--centroid",
        required=False,
        action='store_true',
        help=
        "Do centroid reroot in preprocessing. Highly recommended for large trees. Default: NO"
    )
    parser.add_argument(
        "-k",
        "--k",
        required=False,
        help=
        "The maximum number of leaves that can be removed. Default: auto-select based on the data"
    )
    parser.add_argument(
        "-q",
        "--quantiles",
        required=False,
        help="The quantile(s) to set threshold. Default is 0.05")
    parser.add_argument(
        "-m",
        "--mode",
        required=False,
        help=
        "Filtering mode: 'per-species', 'per-gene', 'all-genes','auto'. Default: auto"
    )

    wdir = dirname(realpath(__file__))

    args = vars(parser.parse_args())

    MIN_OCC = 20
    MIN_TREE_NUM = 20

    quantiles = [q for q in args["quantiles"].split()
                 ] if args["quantiles"] else ["0.05"]
    #print(quantiles)

    intrees = args["input"]
    treeName, treeExt = splitext(basename(intrees))
    outtrees = args["output"] if args[
        "output"] else treeName + "_shrunk" + treeExt

    mode = args["mode"] if args["mode"] else 'auto'

    k = int(args["k"]) if args["k"] else None

    outdir = args["outdir"] if args["outdir"] else splitext(
        intrees)[0] + "_treeshrink"
    mkdir(outdir)
    if args["tempdir"]:
        tempdir = args["tempdir"]
        mkdir(tempdir)
    else:
        tempdir = mkdtemp()  #check_output(["mktemp","-d"]).rstrip()

    trees = TreeList.get_from_path(intrees,
                                   'newick',
                                   preserve_underscores=True)
    if mode == 'auto' and len(trees) < MIN_TREE_NUM:
        print("There are only " + str(len(trees)) +
              " gene trees in the dataset.")
        print("TreeShrink will run in 'All-genes' mode")
        mode = 'all-genes'

    gene_list = [[] for i in range(len(trees))]
    species_map = {}
    occ = {}
    removing_sets = [[[] for i in range(len(trees))]
                     for j in range(len(quantiles))]

    for t, a_tree in enumerate(trees):
        # solve k-shrink
        a_filter = TreeFilter(ddpTree=a_tree, centroid_reroot=args["centroid"])
        a_filter.optFilter(d=k)

        # compute species feature (i.e. the max ratio associated with each species for this gene tree)
        mapping = {}
        for i in range(1, len(a_filter.min_diams)):
            r = a_filter.min_diams[i - 1] / a_filter.min_diams[i]
            removals = a_filter.list_removals(d=i)
            for s in removals:
                mapping[s] = r if s not in mapping else max(mapping[s], r)

        # gather per-species distributions and per-gene species features
        for s in mapping:
            if mode == 'per-species' or mode == 'auto':
                species_map[s] = [
                    mapping[s]
                ] if s not in species_map else species_map[s] + [mapping[s]]
            if mode == 'per-species' or mode == 'all-genes' or mode == 'auto':
                gene_list[t].append((s, mapping[s]))

        # fit kernel density to this gene's species features (per-gene mode)
        if mode == 'per-gene':
            filename = normpath(join(tempdir, "gene_" + str(t) + ".dat"))
            with open(filename, 'w') as f:
                for s in mapping:
                    f.write(str(mapping[s]))
                    f.write("\n")
                #n_missing = len(list(a_tree.leaf_node_iter())) - len(mapping)
                #for i in range(n_missing):
                #    f.write("1.0")
                #    f.write("\n")
            if len(mapping) > 1:
                for i, q in enumerate(quantiles):
                    threshold = float(
                        check_output([
                            "Rscript",
                            normpath(
                                join(wdir, "R_scripts",
                                     "find_threshold_loglnorm.R")), filename, q
                        ]).lstrip().rstrip()[4:])
                    #print("Threshold: ", threshold)
                    for s in mapping:
                        if mapping[s] > threshold:
                            removing_sets[i][t].append(s)
        # update taxon occupancy (only for per-species mode)
        if mode == 'per-species' or mode == 'auto':
            for n in a_tree.leaf_node_iter():
                s = n.taxon.label
                occ[s] = 1 if s not in occ else occ[s] + 1

    if mode == 'auto' or mode == 'per-species':
        flag = False
        for s in occ:
            if occ[s] < MIN_OCC:
                print("Species " + s + " only exists in " + str(occ[s]) +
                      " gene trees")
                flag = True
        if flag:
            if mode == 'auto':
                mode = 'all-genes'
                print(
                    "There are species with low occupancy in the dataset. TreeShrink will run in 'All-genes' mode"
                )
            else:
                print(
                    "WARNING: 'Per-species' mode was selected for a dataset having low occupancy species. Consider switching to 'All-genes' mode"
                )
        elif mode == 'auto':
            mode = 'per-species'
            print(
                "Finish preprocessing. TreeShrink will run in 'Per-species' mode"
            )

    # fit kernel density to the per-species distributions and compute per-species thresholds (per-species mode)
    if mode == 'per-species':
        for s in species_map:
            l = len(species_map[s])
            for i in range(occ[s] - l):
                species_map[s].append(1)
            filename = normpath(join(tempdir, s + ".dat"))
            with open(filename, 'w') as f:
                for v in species_map[s]:
                    f.write(str(v))
                    f.write("\n")
            thresholds = [0 for i in range(len(quantiles))]
            for i, q in enumerate(quantiles):
                thresholds[i] = float(
                    check_output([
                        "Rscript",
                        normpath(
                            join(wdir, "R_scripts",
                                 "find_threshold_lkernel.R")), wdir, filename,
                        q
                    ]).lstrip().rstrip()[5:])
            species_map[s] = (species_map[s], thresholds)

        for t, gene in enumerate(gene_list):
            for s, r in gene:
                for i, threshold in enumerate(species_map[s][1]):
                    if r > threshold:
                        removing_sets[i][t].append(s)

    # fit kernel density to all species features across all genes and compute the global threshold (all-genes mode)
    if mode == 'all-genes':
        filename = normpath(join(tempdir, "all_genes" + ".dat"))
        with open(filename, 'w') as f:
            for gene in gene_list:
                for s, r in gene:
                    f.write(str(r))
                    f.write("\n")
        for i, q in enumerate(quantiles):
            threshold = float(
                check_output([
                    "Rscript",
                    normpath(
                        join(wdir, "R_scripts", "find_threshold_lkernel.R")),
                    wdir, filename, q
                ]).lstrip().rstrip()[5:])
            for t, gene in enumerate(gene_list):
                for s, r in gene:
                    if r > threshold:
                        removing_sets[i][t].append(s)

    # Dendropy's filter_leaf_nodes() seems to have a problem:
    # it produces trees for which the treecmp tool cannot compute the MS distance (needs further exploration),
    # so use home-made code to prune the tree instead.

    treeName, treeExt = splitext(outtrees)
    fName, ext = splitext(outtrees)
    for i, RS in enumerate(removing_sets):
        trees_shrunk = deepcopy(trees)
        outfile = normpath(join(outdir,
                                fName + "_RS_" + quantiles[i] + ".txt"))
        with open(outfile, 'w') as f:
            for item in RS:
                for s in item:
                    f.write(s + "\t")
                f.write("\n")
        for t, tree in enumerate(trees_shrunk):
            #filt = lambda node: False if (node.taxon is not None and node.taxon.label in RS[t]) else True
            #tree.filter_leaf_nodes(filt,update_bipartitions=True)
            prune_tree(tree, RS[t])
        trees_shrunk.write_to_path(
            normpath(join(outdir, treeName + "_" + quantiles[i] + treeExt)),
            'newick')

    if not args["tempdir"]:
        rmtree(tempdir)


#    call(["rm","-r",tempdir])

    print("Output files written to " + outdir)
Example #7
parser = ArgumentParser(description='Return CP- or CPM-vectors for a set of trees\n'+
                        'The vectors are written to a separate file each,\n'+
                        'named {tree_file}.tree_{tree_number}.vector')
parser.add_argument('-t', type=str, help='Tree file in Newick format')
parser.add_argument('-u', action='store_true',
                    help='Produce unrooted (CPM) labelling')
parser.add_argument('--hash', action='store_true',
                    help='Produce hashed labelling')
parser.add_argument('--processes', type=int, default=0,
                    help='Number of processes. Defaults to processor number')
args = parser.parse_args()

start = time()
process_count = args.processes if args.processes else cpu_count()
print('Using {} processes'.format(process_count), file=stderr)
file_mask = args.t.split('.')[0]+'_tree{}.vector'
trees = TreeList.get_from_path(args.t, schema='newick')
print('Loaded {} trees'.format(len(trees)), file=stderr)
counter = 0
f = leaf_enumeration_annotation if args.u else annotate_rooted_tree
func_args = [(trees[i], f, file_mask.format(str(i)), args.hash) for i in range(len(trees))]
p = Pool(process_count)
_ = p.starmap(write_tree, func_args, chunksize=1)
print('Processed {} trees in {} seconds using {} processes'.format(
                                                                str(len(trees)),
                                                                time()-start,
                                                                process_count),
      file=stderr)