def _exception_text(exc):
    """Return a printable description of *exc*.

    Uses the legacy ``message`` attribute when it exists and is non-empty,
    otherwise falls back to ``str(exc)`` (Python 3 exceptions have no
    ``message`` attribute).
    """
    return getattr(exc, 'message', None) or str(exc)


def dendropy_read_treefile(treefiles, quiet=False, preserve_underscores=False, **kwargs):
    """Read trees from files (or stdin), trying nexus first and then newick.

    Parameters:
        treefiles: iterable of file paths. When empty/None the trees are read
            from ``sys.stdin`` instead.
        quiet: when true, suppress progress and parse-error messages.
        preserve_underscores: forwarded to the ``TreeList`` readers.
        **kwargs: forwarded to the ``TreeList`` readers. The optional key
            ``writer`` is removed first and used as the stream for progress
            messages (default ``sys.stderr``).

    Returns:
        A ``TreeList`` holding the trees of every input, in input order.

    Raises:
        SystemExit: if a listed file does not exist, or if an input cannot be
            parsed as either nexus or newick.
    """
    out_stream = kwargs.pop('writer', sys.stderr)
    intrees = TreeList()

    if not treefiles:
        if not quiet:
            sys.stderr.write('NOTE: reading trees from stdin\n')
        trees = sys.stdin.read()
        # Try two input formats. The newick fallback gets its own try block:
        # an exception raised inside an ``except`` handler is not caught by a
        # sibling handler of the same ``try``.
        try:
            intrees.extend(TreeList.get_from_string(trees,
                                                    "nexus",
                                                    case_sensitive_taxon_labels=True,
                                                    preserve_underscores=preserve_underscores,
                                                    **kwargs))
        except (DataParseError, NexusReader.NotNexusFileError,
                Tokenizer.UnexpectedEndOfStreamError, AttributeError) as nexus_err:
            if not quiet:
                sys.stderr.write('%s\n' % _exception_text(nexus_err))
            try:
                intrees.extend(TreeList.get_from_string(trees,
                                                        "newick",
                                                        case_sensitive_taxon_labels=True,
                                                        preserve_underscores=preserve_underscores,
                                                        **kwargs))
            except (DataParseError, Tokenizer.UnexpectedEndOfStreamError, AttributeError) as newick_err:
                if not quiet:
                    sys.stderr.write('%s\n' % _exception_text(newick_err))
                # There is no file name in this branch, so name stdin instead.
                sys.exit('Could not read trees from stdin in nexus or newick format ...\n')
        return intrees

    for tf in treefiles:
        if not os.path.isfile(tf):
            out_stream.write('TreeFile %s does not exist\n' % tf)
            # Non-zero status: a missing input is a failure, not a clean exit.
            sys.exit(1)
        # Try two input formats.
        try:
            if not quiet:
                out_stream.write('Reading file %s in nexus format ...\n' % tf)
            intrees.extend(TreeList.get_from_path(tf,
                                                  "nexus",
                                                  case_sensitive_taxon_labels=True,
                                                  preserve_underscores=preserve_underscores,
                                                  **kwargs))
        except (DataParseError, NexusReader.NotNexusFileError, AttributeError):
            try:
                if not quiet:
                    out_stream.write('Reading file %s in newick format ...\n' % tf)
                intrees.extend(TreeList.get_from_path(tf,
                                                      "newick",
                                                      case_sensitive_taxon_labels=True,
                                                      preserve_underscores=preserve_underscores,
                                                      **kwargs))
            except (DataParseError, Tokenizer.UnexpectedEndOfStreamError, AttributeError) as newick_err:
                if not quiet:
                    sys.stderr.write('%s\n' % _exception_text(newick_err))
                sys.exit('Could not read file %s in nexus or newick format ...\n' % tf)
    return intrees
required=True, help="Sampling time") parser.add_argument("-r", "--rootAge", required=False, help="Root age") parser.add_argument("-t", "--timeTree", required=True, help="The output trees with branch lengths in time unit") parser.add_argument("-c", "--composite", required=False, action='store_true', help="Do composite optimization. Default: NO") args = vars(parser.parse_args()) myTrees = TreeList.get_from_path(args["input"], 'newick') smpl_times = {} rootAge = float(args["rootAge"]) if args["rootAge"] else None with open(args["samplingTime"], "r") as fin: fin.readline() for line in fin: name, time = line.split() smpl_times[name] = float(time) for tree in myTrees: if args["composite"]: s = calibrate_composite_opt(tree, smpl_times, root_age=rootAge) else: s = calibrate_log_opt(tree, smpl_times,
def do_sim(birth_rate , death_rate, num_leaves, rng=None):
    """Simulate a tree and sequences, infer a tree with PAUP, and score recovery.

    A birth-death tree is simulated, written to a fresh temporary directory,
    and passed to the external ``seq-gen`` program; the resulting data set is
    analysed by the external ``paup`` program. The inferred tree is then
    compared with the simulated one.

    Parameters:
        birth_rate, death_rate, num_leaves: forwarded to
            ``treesim.birth_death`` (``num_leaves`` as ``ntax``).
        rng: optional random number generator; forwarded to the tree simulator
            and, when given, used to draw the seq-gen seed.

    Returns:
        A sorted list of ``(depth, edge_length, recovered)`` tuples, one per
        internal non-root node of the simulated tree, where ``recovered`` is 1
        if the node's split is present in the inferred tree and 0 otherwise.

    Raises:
        SystemExit: if seq-gen or PAUP fails. The temporary directory is left
            in place in that case (its path is printed at start-up).
    """
    import subprocess

    temp_dir = tempfile.mkdtemp()
    model_tree = treesim.birth_death(birth_rate=birth_rate,
                                     death_rate=death_rate,
                                     ntax=num_leaves,
                                     rng=rng)

    ################################################################################
    # Calling seq-gen
    mtf = os.path.join(temp_dir, 'simtree')
    print("temp_dir = %s" % temp_dir)
    # The ``with`` block closes (and therefore flushes) the file before seq-gen
    # is invoked, even if the write raises.
    with open(mtf, 'w') as treefile_obj:
        treefile_obj.write("%s;\n" % str(model_tree))

    command_line = ['seq-gen',
                    '-mHKY',
                    '-on',
                    ]
    if os.environ.get('TREE_INF_TEST_RAND_NUMBER_SEED'):
        # NOTE(review): ``seed`` is not defined in this function; presumably a
        # module-level value derived from the environment variable -- confirm.
        sg_seed = seed
    elif rng is None:
        sg_seed = random.randint(0,100000)
    else:
        sg_seed = rng.randint(0,100000)
    command_line.append('-z%d' % sg_seed)
    command_line.append('simtree')

    seq_gen_proc = subprocess.Popen(command_line,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    cwd=temp_dir)
    dataset = seq_gen_proc.communicate()[0]

    # seq-gen does not exit with an error code when it fails, so an empty
    # output is treated as a failure as well.
    if seq_gen_proc.returncode != 0 or len(dataset) == 0:
        sys.exit('seq-gen failed!\n')

    sd = os.path.join(temp_dir, 'simdata.nex')
    # Closed (flushed) by the ``with`` block before PAUP is invoked.
    with open(sd, 'w') as data_file:
        data_file.write(dataset)

    ################################################################################
    # PAUP
    pcf = os.path.join(temp_dir, 'execute_paup.nex')
    with open(pcf, 'w') as paup_commands:
        paup_commands.write('''execute simdata.nex ; hsearch nomultrees ; savetree file=inferred.tre format = NEXUS; quit; ''')

    paup_proc = subprocess.Popen(['paup', '-n', pcf],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 cwd=temp_dir)
    (_paup_stdout, paup_stderr) = paup_proc.communicate()

    paup_output = os.path.join(temp_dir, 'inferred.tre')
    # Besides the exit status, require that PAUP actually produced its tree file.
    if paup_proc.returncode != 0 or not os.path.exists(paup_output):
        sys.exit(paup_stderr)

    # read true tree with the inferred tree (because it is nexus)
    inf_tree_list = TreeList.get_from_path(paup_output,
                                           "NEXUS",
                                           taxon_set=model_tree.taxon_set)
    assert len(inf_tree_list) == 1
    inferred_tree = inf_tree_list[0]

    # determine which splits were missed
    treesplit.encode_splits(inferred_tree)
    treesplit.encode_splits(model_tree)
    missing = model_tree.find_missing_splits(inferred_tree)

    # sort the nodes of the true tree by depth and ask whether or not they were recovered
    node_depth_TF_list = []
    for node in model_tree.postorder_node_iter():
        children = node.child_nodes()
        if children and node.parent_node:
            # Post-order guarantees the child's depth was set before its parent's.
            first_child = children[0]
            node.depth = first_child.depth + first_child.edge.length
            recovered = 0 if node.edge.split_bitmask in missing else 1
            node_depth_TF_list.append((node.depth, node.edge.length, recovered))
        else:
            # Leaves and the root are not scored; they get depth 0.0.
            node.depth = 0.0

    node_depth_TF_list.sort()

    # Remove the files this function created, then the (now empty) directory.
    os.remove(pcf)
    os.remove(paup_output)
    os.remove(sd)
    os.remove(mtf)
    os.rmdir(temp_dir)

    return node_depth_TF_list
mode = args["mode"] if args["mode"] else 'per-species' print(mode) k = int(args["k"]) if args["k"] else None outdir = args["outdir"] if args["outdir"] else splitext( intrees)[0] + "_kshrink" mkdir(outdir) if args["tempdir"]: tempdir = args["tempdir"] mkdir(tempdir) else: tempdir = check_output(["mktemp", "-d"]).rstrip() trees = TreeList.get_from_path(intrees, 'newick', preserve_underscores=True) gene_list = [[] for i in range(len(trees))] species_map = {} occ = {} removing_sets = [[[] for i in range(len(trees))] for j in range(len(quantiles))] for t, a_tree in enumerate(trees): # solve k-shrink a_filter = TreeFilter(ddpTree=a_tree, centroid_reroot=args["centroid"]) a_filter.optFilter(d=k) # compute species feature (i.e. the max ratio associated with each species for this gene tree) mapping = {} for i in range(1, len(a_filter.min_diams)): r = a_filter.min_diams[i - 1] / a_filter.min_diams[i]
import dendropy from dendropy import TreeList,Taxon,Node import sys import argparse parser = argparse.ArgumentParser(description="Parses a Newick tree file, modifying the branch lengths from number of generations to years and adding an outgroup") parser.add_argument("-gt",type=float,default=0,required=False,help="Generation time") parser.add_argument("-od",type=float,default=0,required=False,help="Outgroup branch length") parser.add_argument("-i",type=str,default="infile.tree",required=True,help="Input Newick tree file") parser.add_argument("-o",type=str,default="outtree.tree",required=False,help="Output Newick tree file") args = parser.parse_args() trees=TreeList.get_from_path(args.i,schema="newick",rooting="force-rooted") if args.gt != 0: print "Scaling branch lengths to time with generation time %d\n" % args.gt for tree in trees: for edge in tree.preorder_edge_iter(): #print "DEBUG: %s" % edge.length if edge.length != None: edge.length=edge.length/args.gt if args.od != 0: print "Adding outgroup with branch length %d\n" % args.od namespace=trees.taxon_namespace outgroup= Taxon("outgroup") namespace.add_taxon(outgroup) ntree=0 labels=namespace.labels() labels.remove("outgroup") for tree in trees: outgroup_node=Node(taxon=outgroup,edge_length=args.od)
def main():
    """Command-line entry point: detect and remove outlier taxa from gene trees.

    Parses the command line, runs ``TreeFilter.optFilter`` on every input
    tree, derives a per-taxon "ratio" feature from consecutive minimum
    diameters, turns those features into removal thresholds with external R
    scripts (one threshold per requested quantile), prunes the selected taxa,
    and writes, for every quantile, the pruned trees plus a tab-separated
    list of removed taxa (one line per gene tree) into the output directory.

    Modes:
        per-gene:    threshold fitted per gene tree.
        per-species: threshold fitted per taxon across gene trees.
        all-genes:   a single threshold fitted over all features.
        auto:        per-species, unless there are fewer than MIN_TREE_NUM
                     trees or some taxon occurs in fewer than MIN_OCC trees,
                     in which case all-genes is used.

    The temporary directory is removed on exit (also on error) unless the user
    supplied one with ``--tempdir``.
    """
    import treeshrink
    from treeshrink.optimal_filter_lib import TreeFilter
    from treeshrink.tree_lib import prune_tree
    from subprocess import check_output
    import argparse
    from dendropy import TreeList
    from os.path import basename, dirname, splitext, realpath, join, normpath
    from os import mkdir
    from copy import deepcopy
    from tempfile import mkdtemp
    from shutil import rmtree

    print("Launching " + treeshrink.PROGRAM_NAME + " version " +
          treeshrink.PROGRAM_VERSION)

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True, help="Input trees")
    parser.add_argument(
        "-d", "--outdir", required=False,
        help="Output directory. Default: inferred from the input trees")
    parser.add_argument(
        "-t", "--tempdir", required=False,
        help="Directory to keep temporary files. If specified, the temp files will be kept")
    parser.add_argument(
        "-o", "--output", required=False,
        help="The name of the output trees. Default: inferred from the input trees")
    parser.add_argument(
        "-c", "--centroid", required=False, action='store_true',
        help="Do centroid reroot in preprocessing. Highly recommended for large trees. Default: NO")
    parser.add_argument(
        "-k", "--k", required=False,
        help="The maximum number of leaves that can be removed. Default: auto-select based on the data")
    parser.add_argument(
        "-q", "--quantiles", required=False,
        help="The quantile(s) to set threshold. Default is 0.05")
    # ``choices`` rejects unknown modes up front; previously an unrecognised
    # mode ran to completion without removing anything.
    parser.add_argument(
        "-m", "--mode", required=False,
        choices=['per-species', 'per-gene', 'all-genes', 'auto'],
        help="Filtering mode: 'per-species', 'per-gene', 'all-genes','auto'. Default: auto")

    # The R scripts are looked up relative to this file's directory.
    wdir = dirname(realpath(__file__))
    args = vars(parser.parse_args())

    MIN_OCC = 20
    MIN_TREE_NUM = 20

    # Quantiles are kept as strings: they are passed to Rscript and embedded
    # in output file names.
    quantiles = args["quantiles"].split() if args["quantiles"] else ["0.05"]

    intrees = args["input"]
    treeName, treeExt = splitext(basename(intrees))
    outtrees = args["output"] if args["output"] else treeName + "_shrunk" + treeExt

    mode = args["mode"] if args["mode"] else 'auto'
    k = int(args["k"]) if args["k"] else None

    outdir = args["outdir"] if args["outdir"] else splitext(intrees)[0] + "_treeshrink"
    mkdir(outdir)
    if args["tempdir"]:
        tempdir = args["tempdir"]
        mkdir(tempdir)
    else:
        tempdir = mkdtemp()

    def r_threshold(script_name, script_args, prefix_len):
        """Run an R script from ``R_scripts`` and parse its output as a float.

        ``prefix_len`` leading characters of the stripped output are dropped
        before conversion. NOTE(review): presumably this skips R's ``[1] ``
        print prefix -- confirm against the R scripts.
        """
        script = normpath(join(wdir, "R_scripts", script_name))
        output = check_output(["Rscript", script] + list(script_args))
        return float(output.strip()[prefix_len:])

    try:
        trees = TreeList.get_from_path(intrees, 'newick',
                                       preserve_underscores=True)
        if mode == 'auto' and len(trees) < MIN_TREE_NUM:
            print("There are only " + str(len(trees)) +
                  " gene trees in the dataset.")
            print("TreeShrink will run in 'All-genes' mode")
            mode = 'all-genes'

        gene_list = [[] for _ in range(len(trees))]  # per tree: (taxon, ratio)
        species_map = {}  # taxon -> list of ratios across gene trees
        occ = {}          # taxon -> number of gene trees containing it
        # removing_sets[quantile index][tree index] -> taxa to prune
        removing_sets = [[[] for _ in range(len(trees))]
                         for _ in range(len(quantiles))]

        for t, a_tree in enumerate(trees):
            # solve k-shrink
            a_filter = TreeFilter(ddpTree=a_tree,
                                  centroid_reroot=args["centroid"])
            a_filter.optFilter(d=k)

            # compute species feature (i.e. the max ratio associated with
            # each species for this gene tree)
            mapping = {}
            for i in range(1, len(a_filter.min_diams)):
                r = a_filter.min_diams[i - 1] / a_filter.min_diams[i]
                for s in a_filter.list_removals(d=i):
                    mapping[s] = r if s not in mapping else max(mapping[s], r)

            # gather per-species distributions and per-gene species features
            for s in mapping:
                if mode == 'per-species' or mode == 'auto':
                    species_map.setdefault(s, []).append(mapping[s])
                if mode == 'per-species' or mode == 'all-genes' or mode == 'auto':
                    gene_list[t].append((s, mapping[s]))

            # fit kernel density to this gene's species features (per-gene mode)
            if mode == 'per-gene':
                filename = normpath(join(tempdir, "gene_" + str(t) + ".dat"))
                with open(filename, 'w') as f:
                    for s in mapping:
                        f.write(str(mapping[s]))
                        f.write("\n")
                if len(mapping) > 1:
                    for i, q in enumerate(quantiles):
                        threshold = r_threshold("find_threshold_loglnorm.R",
                                                [filename, q], 4)
                        for s in mapping:
                            if mapping[s] > threshold:
                                removing_sets[i][t].append(s)

            # update taxon occupancy (only for per-species mode)
            if mode == 'per-species' or mode == 'auto':
                for n in a_tree.leaf_node_iter():
                    s = n.taxon.label
                    occ[s] = occ.get(s, 0) + 1

        if mode == 'auto' or mode == 'per-species':
            flag = False
            for s in occ:
                if occ[s] < MIN_OCC:
                    print("Species " + s + " only exists in " + str(occ[s]) +
                          " gene trees")
                    flag = True
            if flag:
                if mode == 'auto':
                    mode = 'all-genes'
                    print("There are species with low occupancy in the dataset. TreeShrink will run in 'All-genes' mode")
                else:
                    print("WARNING: 'Per-species' mode was selected for a dataset having low occupancy species. Consider switching to 'All-genes' mode")
            elif mode == 'auto':
                mode = 'per-species'
                print("Finish preprocessing. TreeShrink will run in 'Per-species' mode")

        # fit kernel density to the per-species distributions and compute
        # per-species threshold (per-species mode)
        if mode == 'per-species':
            for s in species_map:
                # Pad with 1 for every gene tree that contains the taxon but
                # produced no ratio for it.
                species_map[s].extend([1] * (occ[s] - len(species_map[s])))
                filename = normpath(join(tempdir, s + ".dat"))
                with open(filename, 'w') as f:
                    for v in species_map[s]:
                        f.write(str(v))
                        f.write("\n")
                thresholds = [
                    r_threshold("find_threshold_lkernel.R",
                                [wdir, filename, q], 5)
                    for q in quantiles
                ]
                species_map[s] = (species_map[s], thresholds)

            for t, gene in enumerate(gene_list):
                for s, r in gene:
                    for i, threshold in enumerate(species_map[s][1]):
                        if r > threshold:
                            removing_sets[i][t].append(s)

        # fit kernel density to all the species features across all genes and
        # compute the global threshold (all-gene mode)
        if mode == 'all-genes':
            filename = normpath(join(tempdir, "all_genes" + ".dat"))
            with open(filename, 'w') as f:
                for gene in gene_list:
                    for s, r in gene:
                        f.write(str(r))
                        f.write("\n")
            for i, q in enumerate(quantiles):
                threshold = r_threshold("find_threshold_lkernel.R",
                                        [wdir, filename, q], 5)
                for t, gene in enumerate(gene_list):
                    for s, r in gene:
                        if r > threshold:
                            removing_sets[i][t].append(s)

        # Dendropy's filter_leaf_nodes() seems to have problem
        # i.e. it produces the trees that the treecmp tool cannot compute the
        # MS distance (need further exploration)
        # use home-made code to prune the tree instead
        out_stem, out_ext = splitext(outtrees)
        for i, RS in enumerate(removing_sets):
            # Prune a copy so every quantile starts from the original trees.
            trees_shrunk = deepcopy(trees)
            outfile = normpath(
                join(outdir, out_stem + "_RS_" + quantiles[i] + ".txt"))
            with open(outfile, 'w') as f:
                for item in RS:
                    for s in item:
                        f.write(s + "\t")
                    f.write("\n")
            for t, tree in enumerate(trees_shrunk):
                prune_tree(tree, RS[t])
            trees_shrunk.write_to_path(
                normpath(join(outdir, out_stem + "_" + quantiles[i] + out_ext)),
                'newick')
    finally:
        # Only remove the temporary directory when we created it ourselves.
        if not args["tempdir"]:
            rmtree(tempdir)

    print("Output files written to " + outdir)
# Command-line driver: load a Newick tree file and write one label vector file
# per tree, distributing the trees over a pool of worker processes.
import os

# The text is passed as ``description``; the first positional parameter of
# ArgumentParser is ``prog`` (the program name shown in usage messages).
parser = ArgumentParser(
    description='Return CP- or CPM-vectors for a set of trees\n'+
                'The vectors are written to a separate file each,\n'+
                'named {tree_file}.tree_{tree_number}.vector')
# Required: the tree file name is used unconditionally below.
parser.add_argument('-t', type=str, required=True,
                    help='Tree file in Newick format')
parser.add_argument('-u', action='store_true',
                    help='Produce unrooted (CPM) labelling')
parser.add_argument('--hash', action='store_true',
                    help='Produce hashed labelling')
parser.add_argument('--processes', type=int, default=0,
                    help='Number of processes. Defaults to processor number')
args = parser.parse_args()

start = time()
process_count = args.processes if args.processes else cpu_count()
print('Using {} processes'.format(process_count), file=stderr)

# Output name: the input file name up to its first dot, in the same directory.
# Only the base name is split so that dots in the directory part
# (e.g. './trees.nwk') do not truncate the path.
tree_dir, tree_file = os.path.split(args.t)
file_mask = os.path.join(tree_dir, tree_file.split('.')[0])+'_tree{}.vector'

trees = TreeList.get_from_path(args.t, schema='newick')
print('Loaded {} trees'.format(len(trees)), file=stderr)

# Unrooted (CPM) or rooted (CP) labelling function, selected by -u.
f = leaf_enumeration_annotation if args.u else annotate_rooted_tree
func_args = [(tree, f, file_mask.format(str(i)), args.hash)
             for i, tree in enumerate(trees)]
# starmap blocks until every tree has been processed; leaving the ``with``
# block then shuts the worker processes down.
with Pool(process_count) as p:
    _ = p.starmap(write_tree, func_args, chunksize=1)
print('Processed {} trees in {} seconds using {} processes'.format(
    str(len(trees)), time()-start, process_count), file=stderr)