def calculate_robinson_foulds(self, species_tree, gene_tree, weighted):
    """
    Compute Robinson Foulds distance(s) between a species tree and a gene tree.

    Input:
    species_tree -- newick file path or newick string holding the species tree
    gene_tree    -- newick file path or newick string holding the tree that is
                    compared against the species tree
    weighted     -- boolean; when true the weighted distance is computed too

    Returns:
    A (weighted, unweighted) tuple of distances when ``weighted`` is true,
    otherwise only the unweighted distance.
    """

    # One taxon namespace is handed to both tree loads
    shared_taxa = dendropy.TaxonNamespace()

    def load_newick(source):
        # An existing file path is read from disk; anything else is parsed
        # as a newick string
        if os.path.isfile(source):
            return Tree.get_from_path(source, 'newick',
                                      taxon_namespace=shared_taxa)
        return Tree.get_from_string(source, 'newick',
                                    taxon_namespace=shared_taxa)

    species = load_newick(species_tree)
    gene = load_newick(gene_tree)

    # only unweighted foulds distance
    if not weighted:
        return treecompare.unweighted_robinson_foulds_distance(species, gene)

    # both weighted and unweighted foulds distance
    weighted_distance = treecompare.weighted_robinson_foulds_distance(
        species, gene)
    unweighted_distance = treecompare.unweighted_robinson_foulds_distance(
        species, gene)
    return weighted_distance, unweighted_distance
def collapse_short_analyses(dirstub, num):
    """
    Compare simulated and inferred trees after collapsing very short edges.

    For every run directory ``<dirstub>1`` .. ``<dirstub><num>`` the tree in
    ``scaledtree.tre`` and the inferred tree are loaded, edges shorter than
    1e-10 are collapsed in both, and the Euclidean and unweighted
    Robinson-Foulds distances between the two trees are recorded. The mean of
    each metric is printed.

    :param dirstub: prefix of the run directories; the run number is appended
    :param num: number of run directories to process (numbered from 1)
    :return: dict with keys 'Euclidean' and 'RF', each a list with one
        distance per run
    """
    min_length = 0.0000000001
    inf_dict = {'Euclidean': [], 'RF': []}
    for i in range(1, num + 1):
        diri = "{}{}".format(dirstub, i)
        # NOTE(review): the original body used an undefined ``trestr``. It is
        # read here the same way perform_analyses reads it (first line of
        # RAxML_bestTree.val with the 'sim_' prefix removed) -- confirm this
        # is the intended inferred tree.
        with open("{}/RAxML_bestTree.val".format(diri)) as handle:
            trestr = handle.readline().replace('sim_', '')
        tns = dendropy.TaxonNamespace()
        inputtree = dendropy.Tree.get_from_path(
            "{}/scaledtree.tre".format(diri),
            schema="newick",
            taxon_namespace=tns)
        inferred = dendropy.Tree.get_from_string(trestr,
                                                 schema="newick",
                                                 taxon_namespace=tns)
        for tree in (inputtree, inferred):
            for edge in tree.postorder_edge_iter():
                # A missing length is handled explicitly: on Python 2
                # ``None < x`` was true, on Python 3 the comparison raises.
                if edge.length is None or edge.length < min_length:
                    edge.collapse()
        inputtree.encode_bipartitions()
        inferred.encode_bipartitions()
        inf_dict['Euclidean'].append(
            treecompare.euclidean_distance(inputtree, inferred))
        inf_dict['RF'].append(
            treecompare.unweighted_robinson_foulds_distance(
                inputtree, inferred))
    for key, values in inf_dict.items():
        if not values:
            # nothing was analysed (num == 0): no mean to report
            continue
        mean = sum(values) / len(values)
        print(key)
        print(mean)
    return (inf_dict)


#perform_sims(dirstub = "validation/short_fix/run", refloc = "example/short_ref.fasta")
#perform_analyses("validation/short_fix/run")
def compute_distance(trees, true_tree):
    """
    Compute the unweighted Robinson-Foulds distance of each input tree to the
    "true" tree.

    :param trees: dict mapping a file name to the dendropy tree to be compared
        with the "true" tree
    :param true_tree: dendropy tree of the "true" tree
    :return: dict mapping each file name to the distance of its tree
    """
    distance_dict = {}
    for file_name in trees:
        candidate = trees[file_name]
        distance_dict[file_name] = tc.unweighted_robinson_foulds_distance(
            candidate, true_tree)
    return distance_dict
def calc_rfd_distribution(src_path):
    """
    Print summary statistics of the pairwise unweighted Robinson-Foulds
    distances between all trees in a nexus file.

    Every unordered pair of trees read from ``src_path`` is compared once.
    The mean, the sample variance and the 5/95% quantiles of the distances
    are printed; nothing is returned.

    :param src_path: path of the nexus file holding the trees
    """
    # local import: the block cannot rely on a module-level itertools import
    import itertools

    trees = dendropy.TreeList.get(
            path=src_path,
            schema="nexus")
    # combinations() yields (t_i, t_j) with i < j in the same order as the
    # former nested index loops, without re-slicing the tree list each time
    rf_dists = [
        treecompare.unweighted_robinson_foulds_distance(t1, t2)
        for t1, t2 in itertools.combinations(trees, 2)
    ]
    mean, var = statistics.mean_and_sample_variance(rf_dists)
    print("mean = {}, var = {}, 5/95% quantile = {}".format(
        mean,
        var,
        statistics.quantile_5_95(rf_dists)))
def test_special_case1(self):
    """
    Extract, for each taxon-label prefix a-e, the subtree of the source tree
    whose leaves carry that prefix, and check it against the expected induced
    tree: unweighted RF distance 0 and weighted RF distance ~0.
    """
    full_newick = """\
    [&R] ((((e1:4.25978504749,a0:4.25978504749):9.75100657322,(e5:11.2557415909,c9:11.2557415909):2.75505002977):5.25672273638,(c5:17.0225375511,e6:17.0225375511):2.24497680601):20.9755404109,(((c7:0.0433876754663,e4:0.0433876754663):16.2031718648,(b1:14.1628944123,d7:14.1628944123):2.08366512802):14.3825543479,((((d1:13.4235384066,(d4:7.64533761739,c3:7.64533761739):5.77820078917):2.00948796838,((d8:3.10025757397,b5:3.10025757397):5.07496414931,a4:8.17522172328):7.25780465166):4.52823355379,((((((a7:8.94718577977,(((a1:2.04048640276,c2:2.04048640276):1.45629935083,(e0:0.408302025932,b6:0.408302025932):3.08848372766):3.77714533326,(((c6:2.1238494561,(e8:2.03255428077,d6:2.03255428077):0.0912951753249):2.91822700988,a5:5.04207646598):1.92173681425,((a2:3.43218264885,(b8:0.515232535857,a9:0.515232535857):2.91695011299):1.6832785054,b4:5.11546115425):1.84835212598):0.310117806629):1.67325469292):0.613875266884,(d9:8.93428444448,(c1:5.91732320427,c8:5.91732320427):3.0169612402):0.626776602178):3.65721021136,((c0:3.99662328128,d2:3.99662328128):1.90572648225,(e9:1.84550535315,(b9:0.803660457957,e3:0.803660457957):1.0418448952):4.05684441038):7.31592149449):1.63573163655,d5:14.8540028946):2.25255068893,d3:17.1065535835):2.27795405888,(((a3:4.85356559967,(c4:3.08209866724,d0:3.08209866724):1.77146693244):2.74425153816,e7:7.59781713783):4.22596432824,(b2:2.86856170856,e2:2.86856170856):8.9552197575):7.56072617631):0.576752286338):1.05878660087,((b0:1.6464852541,b7:1.6464852541):10.0630186678,(a8:7.31781944487,(a6:7.13495568605,b3:7.13495568605):0.182863758824):4.39168447703):9.31054260769):9.60906735859):9.61394087983):6.65318140005;
    """
    induced_newicks = """\
    [&R] (a0:40.243054768,((a4:19.9612599287,((a7:8.94718577977,(a1:7.27393108685,(a5:6.96381328023,(a2:3.43218264885,a9:3.43218264885):3.53163063138):0.310117806629):1.67325469292):10.4373218626,a3:19.3845076424):0.576752286338):1.05878660087,(a8:7.31781944487,a6:7.31781944487):13.7022270847):19.2230082384):6.65318140005;
    [&R] (b1:30.6291138882,((b5:19.9612599287,(((b6:7.27393108685,(b8:5.11546115425,b4:5.11546115425):2.15846993261):5.94434017116,b9:13.218271258):6.16623638436,b2:19.3845076424):0.576752286338):1.05878660087,((b0:1.6464852541,b7:1.6464852541):10.0630186678,b3:11.7095039219):9.31054260769):9.60906735859):16.2671222799;
    [&R] ((c9:19.2675143571,c5:19.2675143571):20.9755404109,(c7:30.6291138882,(c3:19.9612599287,((((c2:7.27393108685,c6:7.27393108685):2.2871299598,(c1:5.91732320427,c8:5.91732320427):3.64373784238):3.65721021136,c0:13.218271258):6.16623638436,c4:19.3845076424):0.576752286338):10.6678539595):9.61394087983):6.65318140005;
    [&R] (d7:30.6291138882,(((d1:13.4235384066,d4:13.4235384066):2.00948796838,d8:15.4330263749):4.52823355379,(((((d6:9.56106104665,d9:9.56106104665):3.65721021136,d2:13.218271258):1.63573163655,d5:14.8540028946):2.25255068893,d3:17.1065535835):2.27795405888,d0:19.3845076424):0.576752286338):10.6678539595):16.2671222799;
    [&R] (((e1:14.0107916207,e5:14.0107916207):5.25672273638,e6:19.2675143571):20.9755404109,(e4:30.6291138882,(((e0:7.27393108685,e8:7.27393108685):5.94434017116,(e9:1.84550535315,e3:1.84550535315):11.3727659049):6.16623638436,(e7:11.8237814661,e2:11.8237814661):7.56072617631):11.2446062458):9.61394087983):6.65318140005;
    """
    taxa = dendropy.TaxonNamespace()
    tree_a = dendropy.Tree.get(
            data=full_newick,
            schema="newick",
            taxon_namespace=taxa)
    tree_b = dendropy.Tree.get(
            data=full_newick,
            schema="newick",
            taxon_namespace=taxa)
    # two parses of the same newick string must be at distance zero
    self.assertEqual(
        treecompare.weighted_robinson_foulds_distance(tree_a, tree_b), 0.0)

    prefixes = ("a", "b", "c", "d", "e")
    induced_trees = dendropy.TreeList.get(
            data=induced_newicks,
            schema="newick",
            taxon_namespace=taxa)
    assert len(induced_trees) == len(prefixes)

    for prefix, induced in zip(prefixes, induced_trees):

        def has_prefix(node):
            # used only within this iteration, so the loop variable it
            # closes over still holds the current prefix
            return node.taxon.label.startswith(prefix)

        pruned = tree_a.extract_tree(
                node_filter_fn=has_prefix,
                is_apply_filter_to_leaf_nodes=True,
                is_apply_filter_to_internal_nodes=False)
        for leaf in pruned.leaf_node_iter():
            self.assertTrue(leaf.taxon.label.startswith(prefix))
        for leaf in induced.leaf_node_iter():
            assert leaf.taxon.label.startswith(prefix)
        self.assertEqual(
            treecompare.unweighted_robinson_foulds_distance(pruned, induced),
            0)
        self.assertAlmostEqual(
            treecompare.weighted_robinson_foulds_distance(pruned, induced),
            0.0)
def test_special_case2(self):
    """
    Extract the subtree whose leaves are a3 and a4 from a tree with a
    labelled internal node and check that it is at unweighted RF distance 0
    from the expected tree.
    """
    full_newick = """\
    [&R] ((a1,(a2,(a3,a4)a0)),(b1,(b2,(b3,b4))));
    """
    pruned_newick = """\
    [&R] ((a3,a4)a0);
    """
    wanted_labels = set(["a3", "a4"])

    def keep_leaf(node):
        # the filter is applied to leaf nodes only (see the flags below)
        return node.taxon.label in wanted_labels

    full_tree = dendropy.Tree.get(
            data=full_newick,
            schema="newick",
            )
    pruned = full_tree.extract_tree(
            node_filter_fn=keep_leaf,
            is_apply_filter_to_leaf_nodes=True,
            is_apply_filter_to_internal_nodes=False,
            )
    reference = dendropy.Tree.get(
            data=pruned_newick,
            schema="newick",
            taxon_namespace=full_tree.taxon_namespace)
    self.assertEqual(
        treecompare.unweighted_robinson_foulds_distance(pruned, reference),
        0.0)
import itertools
import os

import dendropy
from dendropy import Tree
from dendropy.calculate import treecompare

# Prints the unweighted Robinson-Foulds distance for every pair of nexus
# tree files found in the 'output' directory.

# Names in 'output' that have "nex" as one of their dot-separated parts
protein_dir_set = [
    str(name) for name in os.listdir('output') if "nex" in name.split(".")
]

tns = dendropy.TaxonNamespace()

# Parse every file once, rather than once per pair, into the shared namespace
trees = []
for name in protein_dir_set:
    tree = Tree.get_from_path("output/" + name,
                              "nexus",
                              taxon_namespace=tns)
    tree.encode_bipartitions()
    trees.append(tree)

# combinations() yields each unordered pair once, in the order of the former
# nested index loops (i < j)
for (name1, tree1), (name2, tree2) in itertools.combinations(
        zip(protein_dir_set, trees), 2):
    print(name1, name2,
          treecompare.unweighted_robinson_foulds_distance(tree1, tree2))
def topology_counter(self, rooted=False, outgroup=None):
    """
    Count how often each distinct topology occurs among the best-tree files
    in the "Topologies" folder.

    Two trees count as the same topology when their unweighted Robinson
    Foulds distance is 0. The newick string of the first tree seen with a
    topology is used as the key for that topology.

    Input:
    rooted   -- when true, each tree is rerooted on ``outgroup`` before it is
                compared
    outgroup -- taxon label of the outgroup, used only when ``rooted`` is true

    Output:
    topologies_to_counts --- a dictionary mapping topologies to the number of
                             times they appear
    unique_topologies_to_newicks --- a dictionary mapping each topology to the
                                     set of newick strings that share it
    """

    source_dir = "Topologies"

    # taxon names shared by every tree that is parsed below
    taxa = dendropy.TaxonNamespace()

    seen_topologies = set([])
    topologies_to_counts = defaultdict(int)
    unique_topologies_to_newicks = {}

    for filename in os.listdir(source_dir):

        # Only the files holding a best tree newick string are of interest
        if os.path.splitext(filename)[0] != "Topology_bestTree":
            continue

        new_tree = Tree.get_from_path(os.path.join(source_dir, filename),
                                      'newick',
                                      taxon_namespace=taxa)

        if rooted:
            outgroup_node = new_tree.find_node_with_taxon_label(outgroup)
            new_tree.to_outgroup_position(outgroup_node,
                                          update_bipartitions=False)

        for known in seen_topologies:
            known_tree = Tree.get_from_string(known,
                                              'newick',
                                              taxon_namespace=taxa)
            distance = treecompare.unweighted_robinson_foulds_distance(
                known_tree, new_tree)

            # Distance 0: the new tree repeats an already known topology
            if distance == 0:
                topologies_to_counts[known] += 1
                unique_topologies_to_newicks[known].add(
                    new_tree.as_string("newick").replace("\n", ""))
                break
        else:
            # No known topology matched, so register the tree as a new one
            newick = new_tree.as_string("newick").replace("\n", "")
            seen_topologies.add(newick)
            topologies_to_counts[newick] += 1
            unique_topologies_to_newicks[newick] = set([newick])

    return topologies_to_counts, unique_topologies_to_newicks
def assert_equal_trees(self, t0, t1):
    """
    Assert that ``t0`` and ``t1`` have an unweighted Robinson-Foulds distance
    of 0 and a weighted Robinson-Foulds distance of 0 to 8 places.
    """
    topology_distance = treecompare.unweighted_robinson_foulds_distance(
        t0, t1)
    self.assertEqual(topology_distance, 0)
    length_distance = treecompare.weighted_robinson_foulds_distance(t0, t1)
    self.assertAlmostEqual(length_distance, 0, 8)
def assert_equal_trees(self, t0, t1):
    """
    Check two trees for equality: the unweighted Robinson-Foulds distance
    must be exactly 0 and the weighted one 0 to 8 places.
    """
    unweighted = treecompare.unweighted_robinson_foulds_distance(t0, t1)
    self.assertEqual(unweighted, 0)
    weighted = treecompare.weighted_robinson_foulds_distance(t0, t1)
    self.assertAlmostEqual(weighted, 0, 8)
def get_figure(software1, software2, output_path):
    """
    This function generates comparison heatmaps of trees generated from
    software 1 and software 2.

    For every pair of the nine segments the tree of software 1 (segment 1) is
    compared with the tree of software 2 (segment 2) using the weighted
    Robinson Foulds, unweighted Robinson Foulds and Euclidean distances. Each
    metric gives a 9x9 matrix that is drawn as an annotated heatmap.

    :param software1: name of the software
    :param software2: name of the software
    :param output_path: path to where the output figure will be saved
    :return: NA
    """
    # named segment_names so the builtin ``map`` is not shadowed
    segment_names = {
        1: "PB2",
        2: "PB1",
        3: "PA",
        4: "HA",
        5: "NP",
        6: "NA",
        7: "MP",
        8: "NS",
        9: "concatenated"
    }
    wRF = []
    uwRF = []
    eD = []
    for num1 in range(1, 10):
        w = []
        u = []
        e = []
        for num2 in range(1, 10):
            segment1 = segment_names[num1]
            segment2 = segment_names[num2]
            groundTruthFile = get_file(software1, segment1)
            estimationFile = get_file(software2, segment2)
            tns = dendropy.TaxonNamespace()
            # context managers close the handles, which were left open before
            with open(groundTruthFile, 'r') as gt_handle:
                gtTree = dendropy.Tree.get(file=gt_handle,
                                           schema='newick',
                                           taxon_namespace=tns)
            with open(estimationFile, 'r') as est_handle:
                estimateTree = dendropy.Tree.get(file=est_handle,
                                                 schema='newick',
                                                 taxon_namespace=tns)

            # metrics
            weightedRF = treecompare.weighted_robinson_foulds_distance(
                gtTree, estimateTree)
            unweightedRF = treecompare.unweighted_robinson_foulds_distance(
                gtTree, estimateTree)
            euclideanDist = treecompare.euclidean_distance(
                gtTree, estimateTree)
            w.append(weightedRF)
            u.append(unweightedRF)
            e.append(euclideanDist)
        wRF.append(w)
        uwRF.append(u)
        eD.append(e)
    wRF = np.array(wRF)
    uwRF = np.array(uwRF)
    eD = np.array(eD)

    metric_map = {
        "Weighted Robinson Foulds": wRF,
        "Unweighted Robinson Foulds": uwRF,
        "Euclidean Distances": eD
    }

    for metric in [
            "Weighted Robinson Foulds", "Unweighted Robinson Foulds",
            "Euclidean Distances"
    ]:
        fig, ax = plt.subplots()

        im, cbar = heatmap(metric_map[metric],
                           software1,
                           software2,
                           ax=ax,
                           cmap="YlGn",
                           cbarlabel="Distance")
        annotate_heatmap(im, valfmt="{x:.2f}")
        title = "%s on %s and %s Tree" % (metric, software1.capitalize(),
                                          software2.capitalize())
        ax.set_title(title, pad=-330)
        fig.tight_layout()

        # save figure to output path
        # NOTE(review): all three metrics are written to the same
        # output_path, so the file ends up holding only the last figure
        # ("Euclidean Distances") -- confirm whether one file per metric
        # was intended.
        plt.savefig(output_path)
import os
import sys

import dendropy
from dendropy.calculate.treecompare import \
    unweighted_robinson_foulds_distance, euclidean_distance, false_positives_and_negatives

# Command-line script: prints the unweighted Robinson-Foulds distance between
# two newick tree files given as arguments.
if __name__ == "__main__":
    usage = (
        "Compute Robinson-Foulds distance between two trees in NEWICK format.\n"
        "Usage: python bin/compute_tree_dist.sh [First tree in newick format] [Second tree in newick format]\n"
        "Example: python bin/compute_tree_dist.sh 1.tree 2.tree")

    # Exactly two tree files are required; -h/--help also prints the usage
    if len(sys.argv) != 3 or sys.argv[1] == "-h" or sys.argv[1] == "--help":
        print(usage, file=sys.stderr)
        sys.exit(1)

    # Parse arguments
    tree_files = [os.path.abspath(x) for x in sys.argv[1:]]

    # Read trees into one shared taxon namespace
    tns = dendropy.TaxonNamespace()
    trees = tuple(
        dendropy.Tree.get(path=x, schema="newick", taxon_namespace=tns)
        for x in tree_files)

    print(unweighted_robinson_foulds_distance(*trees))

    sys.exit(0)
def perform_analyses(dirstub, num):
    """
    Run RAxML in every run directory and collect summary values per run.

    For each directory ``<dirstub>1`` .. ``<dirstub><num>``:
      * ``raxmlHPC`` is run on ``snpma.fasta`` inside the directory;
      * the first line of ``RAxML_bestTree.val`` (with 'sim_' removed) is
        compared with ``scaledtree.tre`` using the Euclidean and the
        unweighted Robinson-Foulds distance;
      * base frequencies and the ac/ag/at/cg/ct/gt values are taken from the
        matching lines of ``RAxML_info.val``;
      * the first field of the ``wc`` output for ``snplist.txt`` and
        ``mutsites.txt`` is stored as 'MutsCalled' and 'MutsSim'.
    The mean of every collected list is printed.

    :param dirstub: prefix of the run directories; the run number is appended
    :param num: number of run directories to process (numbered from 1)
    :return: dict mapping each value name to a list with one entry per run
    """
    inf_dict = {
        'Euclidean': [],
        'RF': [],
        'MutsSim': [],
        'MutsCalled': [],
        'freqA': [],
        'freqC': [],
        'freqG': [],
        'freqT': [],
        'ac': [],
        'ag': [],
        'at': [],
        'cg': [],
        'ct': [],
        'gt': []
    }
    for i in range(1, num + 1):
        diri = "{}{}".format(dirstub, i)
        cwd = os.getcwd()
        os.chdir(diri)
        try:
            os.system(
                "raxmlHPC -m ASC_GTRGAMMA --asc-corr=lewis -s snpma.fasta -p 1 -n val"
            )
        finally:
            # always return to the starting directory, even if the run step
            # raises, so later relative paths stay valid
            os.chdir(cwd)
        # call form of print works on both Python 2 and Python 3
        print(diri)
        with open("{}/RAxML_bestTree.val".format(diri)) as handle:
            trestr = handle.readline().replace('sim_', '')
        tns = dendropy.TaxonNamespace()
        inputtree = dendropy.Tree.get_from_path(
            "{}/scaledtree.tre".format(diri),
            schema="newick",
            taxon_namespace=tns)
        inferred = dendropy.Tree.get_from_string(trestr,
                                                 schema="newick",
                                                 taxon_namespace=tns)
        inputtree.encode_bipartitions()
        inferred.encode_bipartitions()
        inf_dict['Euclidean'].append(
            treecompare.euclidean_distance(inputtree, inferred))
        inf_dict['RF'].append(
            treecompare.unweighted_robinson_foulds_distance(
                inputtree, inferred))

        info_path = "{}/RAxML_info.val".format(diri)

        freqs = subprocess.check_output(
            ["grep", "Base frequencies:", info_path]).split()
        print(freqs)
        # the first two fields are the "Base frequencies:" label itself
        freqs = [float(val) for val in freqs[2:]]
        for name, value in zip(('freqA', 'freqC', 'freqG', 'freqT'), freqs):
            inf_dict[name].append(value)

        trans = subprocess.check_output(
            ["grep", "ac ag at cg ct gt", info_path]).split()
        # the values start at field 9 of the matching line
        trans = [float(val) for val in trans[9:]]
        for name, value in zip(('ac', 'ag', 'at', 'cg', 'ct', 'gt'), trans):
            inf_dict[name].append(value)

        inf_dict['MutsCalled'].append(
            float(
                subprocess.check_output(
                    ["wc", "{}/snplist.txt".format(diri)]).split()[0]))
        inf_dict['MutsSim'].append(
            float(
                subprocess.check_output(
                    ["wc", "{}/mutsites.txt".format(diri)]).split()[0]))
    for key, values in inf_dict.items():
        if not values:
            # nothing was analysed (num == 0): no mean to report
            continue
        mean = sum(values) / len(values)
        print(key)
        print(mean)
    return (inf_dict)