def check_mulrf_scores(sfile, gfile, mulrf):
    """
    Verifies that preprocessing a gene family tree does not change its
    MulRF score once the computed score shift is added back

    Every line of gfile is scored twice against the species tree: once as
    an unrooted MUL-tree and once after preprocessing. The process exits
    with an error message on the first line whose two scores disagree;
    otherwise the summed score is written to stdout and the process is
    terminated with os._exit(0).

    Parameters
    ----------
    sfile : string
            name of file containing species tree
    gfile : string
            name of file containing gene family trees
            (one newick string per line)
    mulrf : string
            name including full path of MulRFScorer binary
    """
    # Species tree, without internal labels or unifurcations
    species_tree = treeswift.read_tree(sfile, "newick")
    remove_internal_node_labels(species_tree)
    species_tree.suppress_unifurcations()

    # Prefix handed to the scorer: the gene tree file name minus its extension
    prefix = gfile.rsplit('.', 1)[0]

    total_rf = 0
    with open(gfile, 'r') as handle:
        for g, raw in enumerate(handle, start=1):
            # Drop every whitespace character from the newick string
            newick = "".join(raw.split())

            # Plain (unrooted) MUL-tree
            mtree = treeswift.read_tree_newick(newick)
            remove_internal_node_labels(mtree)
            unroot(mtree)

            # Pre-processed MUL-tree and the score shift it implies
            mxtree = treeswift.read_tree(newick, "newick")
            remove_internal_node_labels(mxtree)
            nEM, nLM, nR, c, nEMX, nLMX = preprocess_multree(mxtree)
            score_shift = compute_score_shift(nEM, nLM, nR, c, nEMX, nLMX)

            # Score both versions with MulRF
            mscore = score_with_MulRF(mulrf, species_tree, mtree,
                                      prefix + "-scored")
            mxscore = score_with_MulRF(mulrf, species_tree, mxtree,
                                       prefix + "-preprocessed-and-scored")

            # The two scores must agree after applying the shift
            if mxscore + score_shift != mscore:
                sys.exit("Gene tree on line %d failed!\n" % g)

            total_rf += mscore

    sys.stdout.write('%d\n' % total_rf)
    sys.stdout.flush()
    # Hard exit; the original author marks this as critical on the
    # Blue Waters login node
    os._exit(0)
def __init__(self, args, calculate_distance_matrix=False):
    """
    Load the backbone sequences (and optionally the tree distances).

    Parameters
    ----------
    args : object
        Configuration; ``backbone_tree_file`` and ``backbone_seq_file`` are
        read from it and ``sequence_length`` is written back to it (the
        length of the first backbone sequence).
    calculate_distance_matrix : bool
        When true, the leaf-to-leaf distance matrix of the backbone tree
        is stored in ``self.distance_matrix`` as a pandas DataFrame with
        zeros on the diagonal.
    """
    self.args = args

    print('Loding data...')
    tree_path = args.backbone_tree_file
    fasta_path = args.backbone_seq_file
    records = SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta"))
    tree = treeswift.read_tree(tree_path, 'newick')
    print('finish data loading!')

    # Sequence length is taken from the first record
    args.sequence_length = len(list(records.values())[0])

    if calculate_distance_matrix:
        print('Calculating distance matrix...')
        pairwise = tree.distance_matrix(leaf_labels=True)
        # Add the self-distance entries
        for label in pairwise:
            pairwise[label][label] = 0
        self.distance_matrix = pd.DataFrame.from_dict(pairwise)
        print('Finish distance matrix calculation!')

    names, encoded, masks = utils.process_seq(records, args, True, True)
    self.nodes = names
    self.mask = masks
    # Per-name lookups for the processed sequences and their masks
    self.seq = dict(zip(names, encoded))
    self.nongaps = dict(zip(names, masks))
    self.num = len(names)
def relabel_multrees_simphy(ifil, ofil):
    """
    Rewrites locus or gene trees generated by SimPhy so that each leaf
    label [sid]_[lid]_[gid] becomes [sid]; internal node labels and
    branch lengths are dropped.

    Parameters
    ----------
    ifil : string
           name of input file (one newick string per line)
    ofil : string
           name of output file (one newick string per line)
    """
    with open(ifil, 'r') as src, open(ofil, 'w') as dst:
        for raw in src:
            # Strip all whitespace before parsing
            newick = "".join(raw.split())
            tree = treeswift.read_tree(newick, "newick")

            for node in tree.traverse_postorder():
                # Leaves keep only the part before the first underscore
                node.label = (node.label.split('_')[0]
                              if node.is_leaf() else None)
                node.edge_length = None

            dst.write(tree.newick())
            dst.write('\n')
def main(args):
    """
    Collapse short branches of a newick tree, order it, resolve its
    polytomies and print the result.

    Parameters
    ----------
    args : object
        Must provide ``tree`` (the newick input passed to
        ``treeswift.read_tree``) and ``min_branch_length`` (the threshold).
    """
    threshold = args.min_branch_length
    tree = treeswift.read_tree(args.tree, 'newick')
    tree.collapse_short_branches(threshold)

    # collapse_short_branches() leaves the leaf edges alone, so zero out
    # the short ones by hand
    for leaf in tree.traverse_leaves():
        if leaf.get_edge_length() <= threshold:
            leaf.set_edge_length(0)

    # Descending order on purpose: the default ascending=True breaks
    # TreeCluster clustering
    tree.order("num_descendants_then_edge_length_then_label",
               ascending=False)
    tree.resolve_polytomies()
    print(tree)
def prepareTree(options):
    """
    Parse the backbone tree and build the structures derived from it.

    When ``options.reestimate_backbone`` is set, ``reestimate_backbone``
    is run on ``options`` before the tree at ``options.tree_fp`` is read.
    Parsing and preprocessing times are logged at INFO level.

    Parameters
    ----------
    options : object
        Must provide ``reestimate_backbone`` and ``tree_fp``.

    Returns
    -------
    tuple
        ``(tree, name_to_node_map, extended_newick_string)`` where
        ``name_to_node_map`` maps each leaf label to its leaf node.
    """
    if options.reestimate_backbone:
        # Re-estimate backbone branch lengths first
        reestimate_backbone(options)

    parse_began = time.time()
    backbone = ts.read_tree(options.tree_fp, schema='newick')
    logging.info(
        "[%s] Tree is parsed in %.3f seconds." %
        (time.strftime("%H:%M:%S"), (time.time() - parse_began)))

    prep_began = time.time()
    util.index_edges(backbone)
    util.set_levels(backbone)

    # Leaf label -> leaf node
    leaf_by_label = {
        leaf.label: leaf
        for leaf in backbone.traverse_postorder(internal=False)
    }
    annotated_newick = extended_newick(backbone)
    logging.info(
        "[%s] Tree preprocessing is completed in %.3f seconds." %
        (time.strftime("%H:%M:%S"), (time.time() - prep_began)))

    return backbone, leaf_by_label, annotated_newick
def main():
    """
    Rescale each query's distance row and write ``depp_correction.csv``.

    Configuration is the default config merged with command-line
    overrides. The tab-separated table ``<outdir>/depp.csv`` holds one
    query per row (name in the first column, one distance column per
    remaining header). For every query, the labels listed in
    ``<outdir>/depp_tmp/<name>_leaves.txt`` that are also distance columns
    are selected; if any exist, the whole row is multiplied by
    ``diameter(backbone_tree) / (median(selected distances) + 1e-7)``.
    Rows with no matching labels are copied unchanged. The result is
    written tab-separated to ``<outdir>/depp_correction.csv``.
    """
    args = OmegaConf.merge(
        OmegaConf.create(default_config.default_config),
        OmegaConf.from_cli(),
    )

    dist_path = os.path.join(args.outdir, "depp.csv")
    original_distance = pd.read_csv(dist_path, sep='\t')
    # Same table with every column read as text; only its first column
    # (the query name) is used -- presumably so that numeric-looking names
    # are not coerced to numbers.
    a_for_seq_name = pd.read_csv(dist_path, sep='\t', dtype=str)

    s = list(original_distance.keys())[1:]
    s_set = set(s)

    tree = treeswift.read_tree(args.backbone_tree, 'newick')
    true_max = tree.diameter()

    data = {}
    for i in range(len(original_distance)):
        seq_name = a_for_seq_name.iloc[i, 0]
        with open(f"{args.outdir}/depp_tmp/{seq_name}_leaves.txt", "r") as f:
            method = set(f.read().split("\n"))
        # discard() instead of remove(): a file without an empty or
        # trailing line must not raise KeyError
        method.discard('')
        method &= s_set

        b = original_distance.iloc[i].values[1:]
        if method:
            # Index with an explicit list of column labels; a set (or a
            # 0-d array wrapping one) is not a valid column indexer
            selected = sorted(method)
            query_median = np.median(original_distance[selected].iloc[i])
            ratio = true_max / (query_median + 1e-7)
            b = b * ratio
        data[seq_name] = dict(zip(s, b))

    data = pd.DataFrame.from_dict(data, orient='index', columns=s)
    data.to_csv(os.path.join(args.outdir, 'depp_correction.csv'), sep='\t')
def read_dismat(f): tags = list(re.split("\s+", f.readline().rstrip()))[1:] for line in f.readlines(): dists = list(re.split("\s+", line.strip())) query_name = dists[0] obs_dist = dict(zip(tags, map(float, dists[1:]))) yield (query_name, None, obs_dist) queries = read_dismat(f) f = open(tree_fp) tree_string = f.readline() f.close() first_read_tree = ts.read_tree(tree_string, schema='newick') util.index_edges(first_read_tree) extended_newick_string = extended_newick(first_read_tree) treecore = Core(first_read_tree) treecore.init() second_read_tree = ts.read_tree(tree_string, schema='newick') util.index_edges(second_read_tree) treecore_frag = Core(second_read_tree) pool = mp.Pool(num_thread) results = pool.starmap(runquery, queries) result = join_jplace(results) result["tree"] = extended_newick_string result["metadata"] = {"invocation": " ".join(sys.argv)} result["fields"] = [
if not tree_fp: tree_fp = tempfile.NamedTemporaryFile(delete=True, mode='w+t').name dist_phy = tempfile.NamedTemporaryFile(delete=True, mode='w+t') nldef = tempfile.NamedTemporaryFile(delete=True, mode='w+t') dist_phy.write(write_phylip_dist(obs_dist)) dist_phy.flush() s = ["fastme", "-i", dist_phy.name, "-o", tree_fp] subprocess.call(s, stdout=nldef, stderr=nldef) if treeout_fp: copyfile(tree_fp, treeout_fp + "/tree.nwk") treestr = open(tree_fp).readline().strip() tree = tw.read_tree(treestr, "newick") pdc = tree.distance_matrix(leaf_labels=True) try: errs = dict() glob = 0 glob_fm = 0 errs_fm = dict() for l1 in tree.labels(leaves=True, internal=False): tot = 0 tot_fm = 0 for l2 in tree.labels(leaves=True, internal=False): if not l1 == l2: cont = (pdc[l1][l2] - obs_dist[l1][l2])**2 if cont > 0 and obs_dist[l1][l2] > 0: tot += cont
def reestimate_backbone(options):
    """
    Re-estimate the backbone tree's branch lengths with FastTree.

    The tree at ``options.tree_fp`` is read, unifurcations are suppressed
    and polytomies are resolved, and the result is passed to the bundled
    FastTree binary as a fixed topology (``-intree``) together with the
    reference alignment ``options.ref_fp`` on stdin. FastTree's output
    tree is written to a new temporary file and ``options.tree_fp`` is
    updated to point at it.

    If the input tree's root has at most two children and every non-root
    edge has a length, the FastTree result is re-rooted on the same edge
    and the two root edges are given lengths in the same proportion as in
    the input tree.

    Parameters
    ----------
    options : object
        Must provide ``ref_fp``, ``tree_fp`` and ``protein_seqs``;
        ``tree_fp`` is overwritten.

    Raises
    ------
    AssertionError
        If ``options.ref_fp`` is empty.
    ValueError
        If no FastTree binary is bundled for the current platform.
    """
    assert options.ref_fp
    start = time.time()
    orig_branch_tree = ts.read_tree(options.tree_fp, schema='newick')

    # Decided before any restructuring: a root with at most two children
    # is treated as a rooted tree
    rooted = len(orig_branch_tree.root.children) <= 2

    orig_branch_tree.suppress_unifurcations()
    if len(orig_branch_tree.root.children) > 3:
        # polytomy at the root
        orig_branch_tree.resolve_polytomies()
    else:
        # root node is ok, resolve the other nodes
        for i in orig_branch_tree.root.children:
            i.resolve_polytomies()

    all_branches_have_length = all(
        n.is_root() or n.edge_length is not None
        for n in orig_branch_tree.traverse_postorder(internal=True,
                                                     leaves=True))

    if rooted and all_branches_have_length:
        # Remember the root position: one leaf from one side of the root
        # ("theone"), one leaf under each child of the other side
        # ("thetwo"), and the lengths of the two root edges.
        left, right = orig_branch_tree.root.children
        if left.children:
            thetwo = [next(c.traverse_postorder(internal=False))
                      for c in left.children]
            theone = [next(right.traverse_postorder(internal=False))]
            lengthtwoside = left.edge_length
            lengthoneside = right.edge_length
        else:
            thetwo = [next(c.traverse_postorder(internal=False))
                      for c in right.children]
            theone = [next(left.traverse_postorder(internal=False))]
            lengthtwoside = right.edge_length
            lengthoneside = left.edge_length

    # Only the generated name is kept; the tree is written to that path
    orig_branch_resolved_fp = tempfile.NamedTemporaryFile(
        delete=True, mode='w+t').name
    orig_branch_tree.write_tree_newick(orig_branch_resolved_fp)

    if _platform == "darwin":
        fasttree_exec = pkg_resources.resource_filename(
            'apples', "tools/FastTree-darwin")
    elif _platform == "linux" or _platform == "linux2":
        fasttree_exec = pkg_resources.resource_filename(
            'apples', "tools/FastTree-linux")
    elif _platform == "win32" or _platform == "win64" or _platform == "msys":
        fasttree_exec = pkg_resources.resource_filename(
            'apples', "tools/FastTree.exe")
    else:
        # Unrecognised system. The message uses str.format: applying the
        # % operator to a '{}' template raises TypeError instead of the
        # intended ValueError.
        raise ValueError(
            'Your system {} is not supported yet.'.format(_platform))

    bb_fp = tempfile.NamedTemporaryFile(delete=False, mode='w+t')
    fasttree_log = tempfile.NamedTemporaryFile(delete=False, mode='w+t').name
    logging.info("FastTree log file is located here: %s" % fasttree_log)

    s = [fasttree_exec, "-nosupport", "-nome", "-noml", "-log", fasttree_log,
         "-intree", orig_branch_resolved_fp]
    if not options.protein_seqs:
        s.append("-nt")

    # The reference alignment is fed on stdin; the tree comes back on stdout
    with open(options.ref_fp, "r") as rf:
        with Popen(s, stdout=PIPE, stdin=rf, stderr=sys.stderr) as p:
            tree_string = p.stdout.read().decode('utf-8')

    if rooted and all_branches_have_length:
        ft = ts.read_tree_newick(tree_string)
        for n in ft.traverse_postorder(internal=False):
            if n.label == theone[0].label:
                theone_inft = n
                break
        # Root at the remembered leaf first, then on the edge above the
        # MRCA of the other side's leaves
        ft.reroot(theone_inft)
        mrca = ft.mrca([n.label for n in thetwo])
        mrca_edge_length = mrca.edge_length
        ft.reroot(mrca, length=mrca_edge_length / 2)
        total_root_length = lengthtwoside + lengthoneside
        if total_root_length > 0:
            # Split the root edge in the same proportion as the input tree
            for i in range(2):
                if ft.root.children[i] == mrca:
                    ft.root.children[i].edge_length = (
                        mrca_edge_length * lengthtwoside / total_root_length)
                    ft.root.children[1 - i].edge_length = (
                        mrca_edge_length * lengthoneside / total_root_length)
        ft.is_rooted = False
        tree_string = str(ft)

    with open(bb_fp.name, "w") as ntree:
        ntree.write(tree_string.strip())
        ntree.write("\n")
    options.tree_fp = bb_fp.name

    logging.info(
        "[%s] Reestimated branch lengths in %.3f seconds." %
        (time.strftime("%H:%M:%S"), (time.time() - start)))
bb_fp = tempfile.NamedTemporaryFile(delete=True, mode='w+t') fasttree_log = tempfile.NamedTemporaryFile(delete=True, mode='w+t').name s = [ fasttree_exec, "-nosupport", "-nome", "-noml", "-log", fasttree_log, "-intree", orig_branch_resolved_fp ] if not options.protein_seqs: s.append("-nt") with open(uniq_ref, "r") as rf: with Popen(s, stdout=PIPE, stdin=rf, stderr=sys.stderr) as p: tree_string = p.stdout.read().decode('utf-8') print(tree_string) uniqs_tree_nj = ts.read_tree(tree_string, schema="newick") leaf_map = dict() for n in uniqs_tree_nj.traverse_postorder(internal=False): leaf_map[n.label] = n for k, v in uniqtags.items(): seq, mrca = v if len(seqdict[seq]) == 1: continue existing = leaf_map[k] o_node_parent = existing.parent o_node_parent.remove_child(existing) mrca_copy = nodecopy(mrca) mrca_copy.edge_length = existing.edge_length o_node_parent.add_child(mrca_copy)