Esempio n. 1
0
    def calculate_robinson_foulds(self, species_tree, gene_tree, weighted):
        """
        Compute the Robinson Foulds distance(s) between a species tree and a
        gene tree.

        Input:
        species_tree -- path to a newick file, or a newick string, holding the
                        species tree
        gene_tree    -- path to a newick file, or a newick string, holding the
                        tree compared against the species tree
        weighted     -- when truthy the weighted distance is computed as well

        Returns:
        If weighted is truthy, the tuple
        (weighted distance, unweighted distance); otherwise only the
        unweighted distance.
        """

        # Both trees are parsed into one shared taxon namespace
        shared_taxa = dendropy.TaxonNamespace()

        def load_newick(source):
            # An existing file is read from disk; any other value is parsed
            # as a newick string
            if os.path.isfile(source):
                reader = Tree.get_from_path
            else:
                reader = Tree.get_from_string
            return reader(source, 'newick', taxon_namespace=shared_taxa)

        species = load_newick(species_tree)
        gene = load_newick(gene_tree)

        # only the unweighted distance was requested
        if not weighted:
            return treecompare.unweighted_robinson_foulds_distance(
                species, gene)

        # weighted first, unweighted second
        weighted_distance = treecompare.weighted_robinson_foulds_distance(
            species, gene)
        unweighted_distance = treecompare.unweighted_robinson_foulds_distance(
            species, gene)
        return weighted_distance, unweighted_distance
Esempio n. 2
0
def collapse_short_analyses(dirstub, num):
    """
    Compare simulated and inferred trees after collapsing near-zero edges.

    For every run directory ``<dirstub>1`` .. ``<dirstub><num>`` the input tree
    (``scaledtree.tre``) and the inferred tree are loaded, every edge shorter
    than 1e-10 is collapsed in both, and the Euclidean and unweighted
    Robinson-Foulds distances between them are recorded. The mean of each
    metric is printed.

    NOTE(review): the inferred tree is read from ``RAxML_bestTree.val`` with
    the ``sim_`` prefix stripped from taxon labels, mirroring
    ``perform_analyses``. The previous code referenced an undefined ``trestr``
    (and an undefined ``inferredtree``), so this source is an assumption to
    confirm.

    :param dirstub: path prefix of the run directories
    :param num: number of run directories, numbered from 1
    :return: dict with keys 'Euclidean' and 'RF', each a list of one distance
             per run
    """
    threshold = 0.0000000001
    inf_dict = {'Euclidean': [], 'RF': []}
    for i in range(1, num + 1):
        diri = "{}{}".format(dirstub, i)
        with open("{}/RAxML_bestTree.val".format(diri)) as handle:
            trestr = handle.readline().replace('sim_', '')
        tns = dendropy.TaxonNamespace()
        inputtree = dendropy.Tree.get_from_path(
            "{}/scaledtree.tre".format(diri),
            schema="newick",
            taxon_namespace=tns)
        inferred = dendropy.Tree.get_from_string(trestr,
                                                 schema="newick",
                                                 taxon_namespace=tns)
        for tree in (inputtree, inferred):
            for edge in tree.postorder_edge_iter():
                # An edge without a length is treated as short. This keeps the
                # Python 2 ordering (None < number) without raising TypeError
                # on Python 3.
                if edge.length is None or edge.length < threshold:
                    edge.collapse()
        inputtree.encode_bipartitions()
        inferred.encode_bipartitions()
        inf_dict['Euclidean'].append(
            treecompare.euclidean_distance(inputtree, inferred))
        inf_dict['RF'].append(
            treecompare.unweighted_robinson_foulds_distance(
                inputtree, inferred))
    for key in inf_dict:
        values = inf_dict[key]
        if not values:
            # No runs were analysed, so there is no mean to report
            continue
        # float() avoids truncating integer RF distances under Python 2
        mean = sum(values) / float(len(values))
        print(key)
        print(mean)
    return (inf_dict)


#perform_sims(dirstub = "validation/short_fix/run", refloc = "example/short_ref.fasta")
#perform_analyses("validation/short_fix/run")
Esempio n. 3
0
def compute_distance(trees, true_tree):
    """
    Compute the unweighted Robinson-Foulds distance of each tree to a
    reference tree.

    :param trees: dict whose values are the dendropy trees to compare with
                  the reference tree
    :param true_tree: dendropy tree used as the reference ("true") tree
    :return: dict with the same keys as ``trees``, each mapped to the
             distance between that tree and ``true_tree``
    """
    distances = {}
    for name, candidate in trees.items():
        distances[name] = tc.unweighted_robinson_foulds_distance(
            candidate, true_tree)
    return distances
Esempio n. 4
0
def calc_rfd_distribution(src_path):
    """
    Print summary statistics of all pairwise unweighted RF distances.

    Reads every tree from the NEXUS file at ``src_path`` into one TreeList,
    computes the unweighted Robinson-Foulds distance for each unordered pair
    of distinct trees, and prints the mean, the sample variance and the 5/95%
    quantiles of those distances.

    NOTE(review): ``statistics`` must provide ``mean_and_sample_variance`` and
    ``quantile_5_95``; presumably it is ``dendropy.calculate.statistics``, not
    the standard-library module. Confirm against the file's imports.

    :param src_path: path to a NEXUS file containing the trees
    :return: None; the summary is printed
    """
    from itertools import combinations

    trees = dendropy.TreeList.get(
            path=src_path,
            schema="nexus")
    # combinations() yields each pair once, in the same order as the nested
    # slice loops it replaces: (0,1), (0,2), ..., (1,2), ...
    rf_dists = [
        treecompare.unweighted_robinson_foulds_distance(t1, t2)
        for t1, t2 in combinations(trees, 2)
    ]
    mean, var = statistics.mean_and_sample_variance(rf_dists)
    print("mean = {}, var = {}, 5/95% quantile = {}".format(
        mean,
        var,
        statistics.quantile_5_95(rf_dists)))
    def test_special_case1(self):
        """
        Check that extracting the leaves of each label group from a rooted
        tree yields the expected induced tree.

        The source tree has leaves labelled a0..e9. For each prefix in
        ("a", "b", "c", "d", "e") the tree induced by the leaves whose label
        starts with that prefix is extracted and compared with the matching
        expected newick string: the unweighted RF distance must be 0 and the
        weighted RF distance must be approximately 0.
        """
        # Full source tree (rooted, with branch lengths)
        original_tree_str = """\
        [&R] ((((e1:4.25978504749,a0:4.25978504749):9.75100657322,(e5:11.2557415909,c9:11.2557415909):2.75505002977):5.25672273638,(c5:17.0225375511,e6:17.0225375511):2.24497680601):20.9755404109,(((c7:0.0433876754663,e4:0.0433876754663):16.2031718648,(b1:14.1628944123,d7:14.1628944123):2.08366512802):14.3825543479,((((d1:13.4235384066,(d4:7.64533761739,c3:7.64533761739):5.77820078917):2.00948796838,((d8:3.10025757397,b5:3.10025757397):5.07496414931,a4:8.17522172328):7.25780465166):4.52823355379,((((((a7:8.94718577977,(((a1:2.04048640276,c2:2.04048640276):1.45629935083,(e0:0.408302025932,b6:0.408302025932):3.08848372766):3.77714533326,(((c6:2.1238494561,(e8:2.03255428077,d6:2.03255428077):0.0912951753249):2.91822700988,a5:5.04207646598):1.92173681425,((a2:3.43218264885,(b8:0.515232535857,a9:0.515232535857):2.91695011299):1.6832785054,b4:5.11546115425):1.84835212598):0.310117806629):1.67325469292):0.613875266884,(d9:8.93428444448,(c1:5.91732320427,c8:5.91732320427):3.0169612402):0.626776602178):3.65721021136,((c0:3.99662328128,d2:3.99662328128):1.90572648225,(e9:1.84550535315,(b9:0.803660457957,e3:0.803660457957):1.0418448952):4.05684441038):7.31592149449):1.63573163655,d5:14.8540028946):2.25255068893,d3:17.1065535835):2.27795405888,(((a3:4.85356559967,(c4:3.08209866724,d0:3.08209866724):1.77146693244):2.74425153816,e7:7.59781713783):4.22596432824,(b2:2.86856170856,e2:2.86856170856):8.9552197575):7.56072617631):0.576752286338):1.05878660087,((b0:1.6464852541,b7:1.6464852541):10.0630186678,(a8:7.31781944487,(a6:7.13495568605,b3:7.13495568605):0.182863758824):4.39168447703):9.31054260769):9.60906735859):9.61394087983):6.65318140005;
        """
        # One expected induced tree per label group, in the order a, b, c, d, e
        expected_tree_strs = """\
        [&R] (a0:40.243054768,((a4:19.9612599287,((a7:8.94718577977,(a1:7.27393108685,(a5:6.96381328023,(a2:3.43218264885,a9:3.43218264885):3.53163063138):0.310117806629):1.67325469292):10.4373218626,a3:19.3845076424):0.576752286338):1.05878660087,(a8:7.31781944487,a6:7.31781944487):13.7022270847):19.2230082384):6.65318140005;
        [&R] (b1:30.6291138882,((b5:19.9612599287,(((b6:7.27393108685,(b8:5.11546115425,b4:5.11546115425):2.15846993261):5.94434017116,b9:13.218271258):6.16623638436,b2:19.3845076424):0.576752286338):1.05878660087,((b0:1.6464852541,b7:1.6464852541):10.0630186678,b3:11.7095039219):9.31054260769):9.60906735859):16.2671222799;
        [&R] ((c9:19.2675143571,c5:19.2675143571):20.9755404109,(c7:30.6291138882,(c3:19.9612599287,((((c2:7.27393108685,c6:7.27393108685):2.2871299598,(c1:5.91732320427,c8:5.91732320427):3.64373784238):3.65721021136,c0:13.218271258):6.16623638436,c4:19.3845076424):0.576752286338):10.6678539595):9.61394087983):6.65318140005;
        [&R] (d7:30.6291138882,(((d1:13.4235384066,d4:13.4235384066):2.00948796838,d8:15.4330263749):4.52823355379,(((((d6:9.56106104665,d9:9.56106104665):3.65721021136,d2:13.218271258):1.63573163655,d5:14.8540028946):2.25255068893,d3:17.1065535835):2.27795405888,d0:19.3845076424):0.576752286338):10.6678539595):16.2671222799;
        [&R] (((e1:14.0107916207,e5:14.0107916207):5.25672273638,e6:19.2675143571):20.9755404109,(e4:30.6291138882,(((e0:7.27393108685,e8:7.27393108685):5.94434017116,(e9:1.84550535315,e3:1.84550535315):11.3727659049):6.16623638436,(e7:11.8237814661,e2:11.8237814661):7.56072617631):11.2446062458):9.61394087983):6.65318140005;
        """

        # All trees in this test share one taxon namespace
        tns = dendropy.TaxonNamespace()
        source_tree1 = dendropy.Tree.get(
                data=original_tree_str,
                schema="newick",
                taxon_namespace=tns)
        source_tree2 = dendropy.Tree.get(
                data=original_tree_str,
                schema="newick",
                taxon_namespace=tns)
        # Sanity check: two parses of the same string are at distance zero
        self.assertEqual(treecompare.weighted_robinson_foulds_distance(source_tree1, source_tree2), 0.0)
        group_ids = ("a", "b", "c", "d", "e")
        expected_induced_trees = dendropy.TreeList.get(
                data=expected_tree_strs,
                schema="newick",
                taxon_namespace=tns)
        # Fixture check: exactly one expected tree per label group
        assert len(expected_induced_trees) == len(group_ids)
        for group_id, expected_induced_tree in zip(group_ids, expected_induced_trees):
            # The lambda reads group_id when called; extract_tree is invoked
            # within the same iteration, so it sees the current group
            extracted_tree = source_tree1.extract_tree(
                    node_filter_fn=lambda node: node.taxon.label.startswith(group_id),
                    is_apply_filter_to_leaf_nodes=True,
                    is_apply_filter_to_internal_nodes=False)
            # Every extracted leaf must belong to the current group
            for leaf_nd in extracted_tree.leaf_node_iter():
                self.assertTrue(leaf_nd.taxon.label.startswith(group_id))
            # Fixture check: the expected tree only holds leaves of this group
            for leaf_nd in expected_induced_tree.leaf_node_iter():
                assert leaf_nd.taxon.label.startswith(group_id)

            # self.assertEqual(treecompare.weighted_robinson_foulds_distance(source_tree1, source_tree2), 0.0)
            self.assertEqual(treecompare.unweighted_robinson_foulds_distance(extracted_tree, expected_induced_tree), 0)
            self.assertAlmostEqual(treecompare.weighted_robinson_foulds_distance(extracted_tree, expected_induced_tree), 0.0)
 def test_special_case2(self):
     """
     Check that extracting leaves a3 and a4 from a small rooted tree gives
     a tree at unweighted RF distance 0 from the expected ``((a3,a4)a0)``.
     """
     full_newick = """\
     [&R] ((a1,(a2,(a3,a4)a0)),(b1,(b2,(b3,b4))));
     """
     induced_newick = """\
     [&R] ((a3,a4)a0);
     """
     wanted_labels = {"a3", "a4"}

     def keep_leaf(node):
         # The filter is applied to leaf nodes only (see the flags below)
         return node.taxon.label in wanted_labels

     full_tree = dendropy.Tree.get(data=full_newick, schema="newick")
     pruned_tree = full_tree.extract_tree(
             node_filter_fn=keep_leaf,
             is_apply_filter_to_leaf_nodes=True,
             is_apply_filter_to_internal_nodes=False)
     # The expected tree reuses the source tree's taxon namespace
     reference_tree = dendropy.Tree.get(
             data=induced_newick,
             schema="newick",
             taxon_namespace=full_tree.taxon_namespace)
     distance = treecompare.unweighted_robinson_foulds_distance(
             pruned_tree, reference_tree)
     self.assertEqual(distance, 0.0)
import dendropy
from dendropy.calculate import treecompare
from dendropy import Tree
import os

# Collect every entry of the 'output' directory that has "nex" as one of its
# dot-separated name components (e.g. "foo.nex", "foo.nex.tre").
protein_dir_set = [
    str(entry) for entry in os.listdir('output')
    if "nex" in entry.split(".")
]

# One namespace shared by all trees
tns = dendropy.TaxonNamespace()

# Parse each NEXUS file exactly once instead of re-reading it for every pair
# it takes part in.
loaded_trees = []
for file_name in protein_dir_set:
    tree = Tree.get_from_path(os.path.join('output', file_name),
                              "nexus",
                              taxon_namespace=tns)
    tree.encode_bipartitions()
    loaded_trees.append((file_name, tree))

# Print the unweighted Robinson-Foulds distance for every unordered pair of
# files, in the same (i < j) order as the directory listing.
for first_idx, (name1, tree1) in enumerate(loaded_trees):
    for name2, tree2 in loaded_trees[first_idx + 1:]:
        print(name1, name2,
              treecompare.unweighted_robinson_foulds_distance(tree1, tree2))
Esempio n. 8
0
    def topology_counter(self, rooted=False, outgroup=None):
        """
        Tally how often each distinct topology occurs among the best-tree
        files found in the "Topologies" folder.

        Input:
        rooted   --- when truthy, each tree is moved to outgroup position on
                     the node labelled by outgroup before being compared
        outgroup --- taxon label of the outgroup; only read when rooted is
                     truthy

        Output:
        topologies_to_counts --- a defaultdict mapping the newick string of the
                                 first tree seen with each topology to the
                                 number of trees that share it
        unique_topologies_to_newicks --- a dictionary mapping that same newick
                                 string to the set of newick strings of all
                                 trees with that topology
        """

        topology_dir = "Topologies"

        # Every tree is parsed into the same taxon namespace
        shared_taxa = dendropy.TaxonNamespace()

        # Newick strings of the topologies seen so far, one per topology
        representatives = set([])

        topologies_to_counts = defaultdict(int)
        unique_topologies_to_newicks = {}

        for entry in os.listdir(topology_dir):

            # Only files named "Topology_bestTree" (any extension) are trees
            if os.path.splitext(entry)[0] != "Topology_bestTree":
                continue

            candidate = Tree.get_from_path(os.path.join(topology_dir, entry),
                                           'newick',
                                           taxon_namespace=shared_taxa)

            if rooted:
                root_node = candidate.find_node_with_taxon_label(outgroup)
                candidate.to_outgroup_position(root_node,
                                               update_bipartitions=False)

            for known_newick in representatives:
                known_tree = Tree.get_from_string(known_newick,
                                                  'newick',
                                                  taxon_namespace=shared_taxa)
                distance = treecompare.unweighted_robinson_foulds_distance(
                    known_tree, candidate)

                # A distance of 0 means the topology was seen before
                if distance == 0:
                    topologies_to_counts[known_newick] += 1
                    candidate_newick = candidate.as_string("newick").replace(
                        "\n", "")
                    unique_topologies_to_newicks[known_newick].add(
                        candidate_newick)
                    break
            else:
                # No known topology matched: register a new one
                candidate_newick = candidate.as_string("newick").replace(
                    "\n", "")
                representatives.add(candidate_newick)
                topologies_to_counts[candidate_newick] += 1
                unique_topologies_to_newicks[candidate_newick] = set(
                    [candidate_newick])

        return topologies_to_counts, unique_topologies_to_newicks
Esempio n. 9
0
 def assert_equal_trees(self, t0, t1):
     """
     Assert that t0 and t1 are equal under both Robinson-Foulds metrics:
     unweighted distance exactly 0, weighted distance 0 to 8 decimal places.
     """
     topology_gap = treecompare.unweighted_robinson_foulds_distance(t0, t1)
     self.assertEqual(topology_gap, 0)
     # The weighted distance is only computed once the topology check passed
     length_gap = treecompare.weighted_robinson_foulds_distance(t0, t1)
     self.assertAlmostEqual(length_gap, 0, 8)
 def assert_equal_trees(self, t0, t1):
     """
     Fail unless the two trees have an unweighted RF distance of 0 and a
     weighted RF distance that rounds to 0 at 8 decimal places.
     """
     unweighted = treecompare.unweighted_robinson_foulds_distance
     self.assertEqual(unweighted(t0, t1), 0)
     weighted = treecompare.weighted_robinson_foulds_distance
     self.assertAlmostEqual(weighted(t0, t1), 0, places=8)
Esempio n. 11
0
def get_figure(software1, software2, output_path):
    """
    Generate heatmaps comparing the trees produced by two programs.

    For each of the nine segments, the tree from ``software1`` is compared
    with the tree from ``software2`` for every segment, giving a 9x9 matrix
    each for the weighted Robinson-Foulds, unweighted Robinson-Foulds and
    Euclidean distance. One annotated heatmap is drawn per metric.

    NOTE(review): every figure is saved to the same ``output_path``, so each
    save overwrites the previous one and only the last metric ("Euclidean
    Distances") remains on disk. Confirm whether one file per metric was
    intended.

    :param software1: name of the first software (rows of the heatmap)
    :param software2: name of the second software (columns of the heatmap)
    :param output_path: path to where the output figure will be saved
    :return: NA
    """
    # Named segment_names rather than `map` to avoid shadowing the builtin
    segment_names = {
        1: "PB2",
        2: "PB1",
        3: "PA",
        4: "HA",
        5: "NP",
        6: "NA",
        7: "MP",
        8: "NS",
        9: "concatenated"
    }

    wRF = []
    uwRF = []
    eD = []

    for num1 in range(1, 10):
        w = []
        u = []
        e = []
        for num2 in range(1, 10):
            groundTruthFile = get_file(software1, segment_names[num1])
            estimationFile = get_file(software2, segment_names[num2])

            # A fresh namespace per pair, shared by the two trees compared
            tns = dendropy.TaxonNamespace()
            # Context managers close the tree files once they are parsed
            with open(groundTruthFile, 'r') as gt_handle:
                gtTree = dendropy.Tree.get(file=gt_handle,
                                           schema='newick',
                                           taxon_namespace=tns)
            with open(estimationFile, 'r') as est_handle:
                estimateTree = dendropy.Tree.get(file=est_handle,
                                                 schema='newick',
                                                 taxon_namespace=tns)

            w.append(
                treecompare.weighted_robinson_foulds_distance(
                    gtTree, estimateTree))
            u.append(
                treecompare.unweighted_robinson_foulds_distance(
                    gtTree, estimateTree))
            e.append(treecompare.euclidean_distance(gtTree, estimateTree))

        wRF.append(w)
        uwRF.append(u)
        eD.append(e)

    # (title prefix, distance matrix) in plotting order
    metrics = [
        ("Weighted Robinson Foulds", np.array(wRF)),
        ("Unweighted Robinson Foulds", np.array(uwRF)),
        ("Euclidean Distances", np.array(eD)),
    ]
    for metric, matrix in metrics:
        fig, ax = plt.subplots()

        im, _ = heatmap(matrix,
                        software1,
                        software2,
                        ax=ax,
                        cmap="YlGn",
                        cbarlabel="Distance")

        annotate_heatmap(im, valfmt="{x:.2f}")

        title = "%s on %s and %s Tree" % (metric, software1.capitalize(),
                                          software2.capitalize())
        ax.set_title(title, pad=-330)

        fig.tight_layout()

        # save figure to output path (see NOTE in the docstring)
        plt.savefig(output_path)
import dendropy
from dendropy.calculate.treecompare import \
 unweighted_robinson_foulds_distance, euclidean_distance, false_positives_and_negatives

if __name__ == "__main__":
    # Command-line entry point: print the unweighted Robinson-Foulds distance
    # between the two newick tree files given as arguments.

    # Imported here so the entry point does not depend on the surrounding
    # module having imported them.
    import os
    import sys

    usage = (
        "Compute Robinson-Foulds distance between two trees in NEWICK format.\n"
        "\t\tUsage: python bin/compute_tree_dist.sh [First tree in newick format] [Second tree in newick format]\n"
        "\t\tExample: python bin/compute_tree_dist.sh 1.tree 2.tree")

    # Exactly two positional arguments are required; -h/--help and any other
    # argument count print the usage text to stderr and exit with status 1.
    if len(sys.argv) != 3 or sys.argv[1] == "-h" or sys.argv[1] == "--help":
        print(usage, file=sys.stderr)
        sys.exit(1)

    # Parse arguments
    tree_files = [os.path.abspath(x) for x in sys.argv[1:]]

    # Read both trees into one shared taxon namespace
    tns = dendropy.TaxonNamespace()
    trees = tuple(
        dendropy.Tree.get(path=x, schema="newick", taxon_namespace=tns)
        for x in tree_files
    )

    print(unweighted_robinson_foulds_distance(*trees))
    #print(euclidean_distance(*trees))
    # d = false_positives_and_negatives(*trees)
    # print(np.sqrt(np.float(d[0])*np.float(d[1])))

    sys.exit(0)
Esempio n. 13
0
def perform_analyses(dirstub, num):
    """
    Run RAxML on each simulation directory and collect accuracy statistics.

    For every run directory ``<dirstub>1`` .. ``<dirstub><num>`` this:
      1. runs ``raxmlHPC`` (ASC_GTRGAMMA, Lewis correction) on ``snpma.fasta``
         inside that directory;
      2. compares the inferred tree (``RAxML_bestTree.val``, with the ``sim_``
         label prefix removed) with ``scaledtree.tre`` using the Euclidean and
         unweighted Robinson-Foulds distances;
      3. greps the base frequencies and the six rate parameters out of
         ``RAxML_info.val``;
      4. records the first field of ``wc`` output (the line count) for
         ``snplist.txt`` and ``mutsites.txt``.
    The mean of every collected statistic is printed.

    To analyse without ascertainment correction, use ``-m GTRGAMMA`` with
    ``-n val_noasc`` and read the ``RAxML_*.val_noasc`` files instead.

    :param dirstub: path prefix of the run directories
    :param num: number of run directories, numbered from 1
    :return: dict mapping each statistic name to a list with one value per run
    """
    inf_dict = {
        'Euclidean': [],
        'RF': [],
        'MutsSim': [],
        'MutsCalled': [],
        'freqA': [],
        'freqC': [],
        'freqG': [],
        'freqT': [],
        'ac': [],
        'ag': [],
        'at': [],
        'cg': [],
        'ct': [],
        'gt': []
    }
    freq_keys = ('freqA', 'freqC', 'freqG', 'freqT')
    rate_keys = ('ac', 'ag', 'at', 'cg', 'ct', 'gt')
    for i in range(1, num + 1):
        diri = "{}{}".format(dirstub, i)
        # cwd= runs RAxML inside the run directory without changing this
        # process's working directory, so nothing has to be restored if a
        # later step fails. As before, the exit status is not checked; a
        # failed run surfaces when the output files are read below.
        subprocess.call(
            ["raxmlHPC", "-m", "ASC_GTRGAMMA", "--asc-corr=lewis",
             "-s", "snpma.fasta", "-p", "1", "-n", "val"],
            cwd=diri)
        print(diri)
        with open("{}/RAxML_bestTree.val".format(diri)) as handle:
            trestr = handle.readline().replace('sim_', '')
        tns = dendropy.TaxonNamespace()
        inputtree = dendropy.Tree.get_from_path(
            "{}/scaledtree.tre".format(diri),
            schema="newick",
            taxon_namespace=tns)
        inferred = dendropy.Tree.get_from_string(trestr,
                                                 schema="newick",
                                                 taxon_namespace=tns)
        inputtree.encode_bipartitions()
        inferred.encode_bipartitions()
        inf_dict['Euclidean'].append(
            treecompare.euclidean_distance(inputtree, inferred))
        inf_dict['RF'].append(
            treecompare.unweighted_robinson_foulds_distance(
                inputtree, inferred))

        info_path = "{}/RAxML_info.val".format(diri)
        freqs = subprocess.check_output(
            ["grep", "Base frequencies:", info_path]).split()
        print(freqs)
        # The first two tokens are the words "Base" and "frequencies:"
        freqs = [float(val) for val in freqs[2:]]
        for idx, key in enumerate(freq_keys):
            inf_dict[key].append(freqs[idx])

        trans = subprocess.check_output(
            ["grep", "ac ag at cg ct gt", info_path]).split()
        # The six rate values start at token index 9 of the matched line
        trans = [float(val) for val in trans[9:]]
        for idx, key in enumerate(rate_keys):
            inf_dict[key].append(trans[idx])

        inf_dict['MutsCalled'].append(
            float(
                subprocess.check_output(["wc", "{}/snplist.txt".format(diri)
                                         ]).split()[0]))
        inf_dict['MutsSim'].append(
            float(
                subprocess.check_output(["wc", "{}/mutsites.txt".format(diri)
                                         ]).split()[0]))
    for key in inf_dict:
        values = inf_dict[key]
        if not values:
            # No runs were analysed, so there is no mean to report
            continue
        # float() avoids truncating integer RF distances under Python 2
        mean = sum(values) / float(len(values))
        print(key)
        print(mean)
    return (inf_dict)