Ejemplo n.º 1
0
def bootstrap_consensus_tree(corpus, trees=None, consensus_level=0.5):
    """Build a consensus tree from a collection of cluster trees.

    Every input tree is converted to an ETE tree through its ``dendrogram``
    attribute, written to a scratch directory as a newick file, read back
    with dendropy under a single taxon namespace built from
    ``corpus.titles``, and finally summarised into one consensus tree.

    Parameters
    ----------
    corpus : object
        Must expose a ``titles`` attribute; it is used both as the leaf
        labels of the exported trees and as the taxon namespace labels.
    trees : iterable, optional
        Objects exposing ``dendrogram.to_ete(labels=...)``. ``None`` (the
        default) is treated as an empty collection.
    consensus_level : float
        Passed as ``min_freq`` to ``TreeSummarizer.tree_from_splits``.

    Returns
    -------
    EteTree
        The consensus tree, rebuilt from dendropy's newick serialisation.

    Raises
    ------
    IndexError
        If no trees were supplied (there is no first tree to take the
        taxon namespace from).
    """
    import os
    import shutil

    if trees is None:
        trees = []

    tmp_dir = mkdtemp()
    try:
        for idx, tree in enumerate(trees):
            t = tree.dendrogram.to_ete(labels=corpus.titles)
            t.write(outfile=os.path.join(tmp_dir,
                                         'tree_' + str(idx) + '.newick'))
        tns = dendropy.TaxonNamespace(corpus.titles, label="label")
        # Re-read everything under one shared namespace so that the split
        # counting below sees identical taxon objects across all trees.
        parsed_trees = [
            dendropy.Tree.get(path=filename,
                              schema='newick',
                              preserve_underscores=True,
                              taxon_namespace=tns)
            for filename in glob.glob(os.path.join(tmp_dir, '*.newick'))
        ]
    finally:
        # The newick files are only an exchange format between ETE and
        # dendropy; remove them whether or not the conversion succeeded.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    tsum = TreeSummarizer(support_as_labels=True,
                          support_as_edge_lengths=False,
                          support_as_percentages=True,
                          add_node_metadata=True,
                          weighted_splits=True)
    taxon_namespace = parsed_trees[0].taxon_namespace
    split_distribution = dendropy.SplitDistribution(
        taxon_namespace=taxon_namespace)
    tsum.count_splits_on_trees(parsed_trees,
                               split_distribution=split_distribution,
                               is_bipartitions_updated=False)
    tree = tsum.tree_from_splits(
        split_distribution,
        min_freq=consensus_level,
        rooted=False,
        include_edge_lengths=False)  # this param is crucial
    # Strip dendropy's unrooted marker before handing the string to ETE.
    ete_tree = EteTree(tree.as_string("newick").replace('[&U] ', '') + ';')
    return ete_tree
Ejemplo n.º 2
0
    def get_split_distribution_from_files(self,
                                          tree_filepaths=None,
                                          is_rooted=None,
                                          use_tree_weights=None,
                                          burnin=None,
                                          taxa_definition_filepath=None,
                                          taxon_namespace=None,
                                          split_distribution=None):
        """
        Populates and returns a SplitDistribution from the splits found in
        the given tree files.

        The counting itself is delegated to ``count_splits_from_files``;
        this method transfers the resulting counts into the distribution.

        tree_filepaths : iterable of strings
            A list or some other iterable of file paths containing trees in
            NEXUS format.
        is_rooted : bool
            If |True| then trees will be treated as rooted. If |False|, then
            rooting follows that specified in the tree statements, defaulting
            to unrooted if not specified. When this is falsy, every split
            bitmask is normalized before being added to the distribution.
        use_tree_weights : bool
            If |False| then tree weighting statements are disregarded.
            Otherwise, they will be regarded.
        burnin : integer
            Skip these many trees (from beginning of each source).
        taxa_definition_filepath : str
            Path of file containing TAXA block to execute. This is crucial to
            getting the taxon order (and hence, indexes, and hence, split
            bitmasks) correct. If not given, will use the first file
            given in ``tree_filepaths``.
        taxon_namespace : |TaxonNamespace|
            |TaxonNamespace| object to use. If a ``split_distribution`` is
            also given, the two must share the same namespace object.
        split_distribution : `SplitDistribution`
            `SplitDistribution` object to use. A new one is created if not
            given.

        Returns the (possibly newly created) `SplitDistribution`, with its
        ``total_trees_counted`` set to the number of trees reported by the
        count.
        """
        if split_distribution is None:
            split_distribution = dendropy.SplitDistribution(
                taxon_namespace=taxon_namespace)
            taxon_namespace = split_distribution.taxon_namespace
        elif taxon_namespace is None:
            taxon_namespace = split_distribution.taxon_namespace
        else:
            assert split_distribution.taxon_namespace is taxon_namespace

        tally = self.count_splits_from_files(
            tree_filepaths=tree_filepaths,
            is_rooted=is_rooted,
            use_tree_weights=use_tree_weights,
            burnin=burnin,
            taxa_definition_filepath=taxa_definition_filepath,
            taxon_namespace=taxon_namespace)

        counts = tally["bipartition_counts"]
        for bitmask in counts:
            # Rooted splits are keyed as-is; otherwise use the normalized form.
            if is_rooted:
                key = bitmask
            else:
                key = split_distribution.normalize_bitmask(bitmask)
            split_distribution.add_split_count(key, counts[bitmask])

        split_distribution.total_trees_counted = tally["num_trees"]
        return split_distribution
Ejemplo n.º 3
0
 def setUp(self):
     """Load the reference tree list and count its splits.

     Stores the parsed trees on ``self.trees`` and a ``SplitDistribution``
     (sharing the trees' taxon namespace) on ``self.split_distribution``
     after counting the splits of every tree into it.
     """
     source = pathmap.tree_source_path("issue_mth_2009-02-03.rooted.nexus")
     trees = dendropy.TreeList.get_from_path(source, "nexus")
     self.trees = trees
     distribution = dendropy.SplitDistribution(
             taxon_namespace=trees.taxon_namespace)
     self.split_distribution = distribution
     for current in trees:
         distribution.count_splits_on_tree(
                 current,
                 is_bipartitions_updated=False)
Ejemplo n.º 4
0
 def check_split_counting(
     self,
     tree_filename,
     test_as_rooted,
     parser_rooting_interpretation,
     test_ignore_tree_weights=False,
     dp_ignore_tree_weights=False,
 ):
     """Compare split counts obtained through PAUP with dendropy's own.

     The tree file is first summarised through ``PaupService``; the same
     file is then parsed by dendropy under the taxon namespace of the PAUP
     result and counted into a fresh ``SplitDistribution``. Both must agree
     on the number of trees and on the count of every non-trivial split,
     and PAUP must not report any non-trivial split dendropy did not see.

     tree_filename : name of the tree file, resolved through ``pathmap``.
     test_as_rooted : passed to PAUP as ``is_rooted``.
     parser_rooting_interpretation : passed to dendropy as ``rooting``.
     test_ignore_tree_weights : if true, PAUP is told not to use weights.
     dp_ignore_tree_weights : value assigned to the dendropy
         distribution's ``ignore_tree_weights``.
     """
     source_path = pathmap.tree_source_path(tree_filename)
     service = paup.PaupService()
     paup_sd = service.get_split_distribution_from_files(
         tree_filepaths=[source_path],
         is_rooted=test_as_rooted,
         use_tree_weights=not test_ignore_tree_weights,
         burnin=0,
         taxa_definition_filepath=source_path)
     namespace = paup_sd.taxon_namespace

     dp_sd = dendropy.SplitDistribution(taxon_namespace=namespace)
     dp_sd.ignore_edge_lengths = True
     dp_sd.ignore_node_ages = True
     dp_sd.ignore_tree_weights = dp_ignore_tree_weights
     # NOTE(review): this value is recomputed below before it is ever read.
     taxa_mask = namespace.all_taxa_bitmask()
     # Freeze the namespace before dendropy parses the same file into it.
     namespace.is_mutable = False
     parsed = dendropy.TreeList.get_from_path(
         source_path,
         "nexus",
         rooting=parser_rooting_interpretation,
         taxon_namespace=namespace)
     for current in parsed:
         self.assertIs(current.taxon_namespace, namespace)
         self.assertIs(current.taxon_namespace, dp_sd.taxon_namespace)
         dp_sd.count_splits_on_tree(current, is_bipartitions_updated=False)
     self.assertEqual(dp_sd.total_trees_counted,
                      paup_sd.total_trees_counted)

     taxa_mask = namespace.all_taxa_bitmask()
     for bitmask in dp_sd.split_counts:
         if dendropy.Bipartition.is_trivial_bitmask(bitmask, taxa_mask):
             continue
         self.assertIn(bitmask, paup_sd.split_counts,
                       "split not found")
         self.assertEqual(dp_sd.split_counts[bitmask],
                          paup_sd.split_counts[bitmask],
                          "incorrect split frequency")
         # Tick off every matched split so leftovers can be checked below.
         del paup_sd.split_counts[bitmask]

     # Whatever PAUP still holds may only be trivial splits.
     leftovers = list(paup_sd.split_counts.keys())
     for bitmask in leftovers:
         if dendropy.Bipartition.is_trivial_bitmask(bitmask, taxa_mask):
             del paup_sd.split_counts[bitmask]
     self.assertEqual(len(paup_sd.split_counts), 0)
Ejemplo n.º 5
0
 def consensus_tree(self,
                    trees,
                    min_freq=0.5,
                    is_bipartitions_updated=False):
     """
     Builds a consensus tree from ``trees``.

     The splits of all trees are counted into a new split distribution that
     uses the taxon namespace of the first tree; the consensus tree is then
     assembled from that distribution, with ``min_freq`` as the minimum
     frequency a split needs in order to be included.

     ``trees`` must support indexing (the first element is read directly).
     ``is_bipartitions_updated`` is forwarded to the split counting step.
     """
     distribution = dendropy.SplitDistribution(
         taxon_namespace=trees[0].taxon_namespace)
     self.count_splits_on_trees(
         trees,
         split_distribution=distribution,
         is_bipartitions_updated=is_bipartitions_updated)
     return self.tree_from_splits(distribution, min_freq=min_freq)
Ejemplo n.º 6
0
 def count_splits_on_trees(self,
                           tree_iterator,
                           split_distribution=None,
                           is_bipartitions_updated=False):
     """
     Counts the splits of every tree yielded by ``tree_iterator`` into a
     split distribution and returns that distribution.

     If ``split_distribution`` is not given, a new one is created. If the
     distribution has no taxon namespace yet, it adopts the namespace of the
     first tree; every tree must then share that same namespace object.
     ``is_bipartitions_updated`` is forwarded to each per-tree count.
     """
     if split_distribution is None:
         split_distribution = dendropy.SplitDistribution()
     namespace = split_distribution.taxon_namespace
     for tree in tree_iterator:
         if namespace is not None:
             assert (namespace is tree.taxon_namespace)
         else:
             # First tree seen: the distribution inherits its namespace.
             assert (split_distribution.taxon_namespace is None)
             split_distribution.taxon_namespace = tree.taxon_namespace
             namespace = tree.taxon_namespace
         split_distribution.count_splits_on_tree(
             tree, is_bipartitions_updated=is_bipartitions_updated)
     return split_distribution
Ejemplo n.º 7
0
def consensus(outdir, min_freq=0.5, is_rooted=True,
              trees_splits_encoded=False):
    """Generate a rooted consensus tree from ``<outdir>/distribution.tre``.

    Tips that do not occur in every tree of the distribution are pruned so
    that all trees cover the same taxa. The pruned trees are handed to
    dendropy through a private temporary newick file, and the consensus is
    written to ``<outdir>/consensus.tre``.

    Parameters
    ----------
    outdir : str
        Directory holding ``distribution.tre``; also receives the output.
    min_freq : float
        Minimum split frequency for a split to enter the consensus.
    is_rooted, trees_splits_encoded
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    bool
        ``False`` if fewer than three tips would remain after pruning (no
        file is written), ``True`` once the consensus has been written.
    """
    import tempfile
    from itertools import chain

    # read in from distribution.tre
    phylogenies = list(
        Phylo.parse(os.path.join(outdir, 'distribution.tre'), 'newick'))
    all_tip_names = [[tip.name for tip in phylogeny.get_terminals()]
                     for phylogeny in phylogenies]

    # first ensure that all trees in the distribution have same number
    # of taxa, otherwise, make it so by dropping taxa not present in
    # all trees
    counted = Counter(chain.from_iterable(all_tip_names))
    n_trees = len(phylogenies)
    to_drop = {name for name, seen in counted.items() if seen < n_trees}
    if len(counted) - len(to_drop) < 3:
        return False
    for tip_names, phylogeny in zip(all_tip_names, phylogenies):
        for tip_name in tip_names:
            if tip_name in to_drop:
                phylogeny.prune(tip_name)

    # Exchange the trees through a uniquely named scratch file instead of a
    # fixed name in the working directory, and always clean it up.
    handle_fd, scratch_path = tempfile.mkstemp(suffix='.tre')
    try:
        with os.fdopen(handle_fd, "w") as file:
            Phylo.write(phylogenies, file, 'newick')
        # create dendropy list
        trees = dp.TreeList()
        trees.read_from_path(scratch_path, "newick", rooting='force-rooted')
    finally:
        os.remove(scratch_path)

    # https://groups.google.com/forum/#!topic/dendropy-users/iJ32ibnS5Bc
    sd = dp.SplitDistribution(taxon_namespace=trees.taxon_namespace)
    tsum = dp.calculate.treesum.TreeSummarizer()
    tsum.count_splits_on_trees(trees, split_distribution=sd)
    consensus = tsum.tree_from_splits(sd, min_freq=min_freq)
    consensus.write_to_path(os.path.join(outdir, 'consensus.tre'), "newick")
    return True
Ejemplo n.º 8
0
    def check_splits_distribution(self,
            tree_filename,
            splits_filename,
            use_tree_weights,
            is_rooted,
            expected_num_trees,
            ):
        """Check a dendropy split distribution against a PAUP splits reference.

        The trees in ``tree_filename`` are counted into a
        ``SplitDistribution``; its bookkeeping (trees counted, rooting
        state) and the count and frequency of every split listed in
        ``splits_filename`` are then asserted. Any split dendropy observed
        that the reference does not list must be a trivial one.

        tree_filename : name of the NEXUS tree file, resolved via ``pathmap``.
        splits_filename : name of the reference splits file.
        use_tree_weights : forwarded to both the parser
            (``store_tree_weights``) and the distribution.
        is_rooted : selects the reference key column and the rooting
            assertions; ``None`` is handled like ``False``.
        expected_num_trees : not referenced by the body of this method.
        """
        # Column 1: leafset_bitmask / unnormalized split bitmask (rooted).
        # Column 2: normalized split bitmask (unrooted, also the default
        # when the rooting is unspecified).
        key_column_index = 1 if is_rooted else 2
        splits_ref = paupsplitsreference.get_splits_reference(
                splits_filename=splits_filename,
                key_column_index=key_column_index,
                )
        source_path = pathmap.tree_source_path(tree_filename)
        trees = dendropy.TreeList.get_from_path(
                source_path,
                "nexus",
                store_tree_weights=use_tree_weights)
        sd = dendropy.SplitDistribution(
                taxon_namespace=trees.taxon_namespace,
                use_tree_weights=use_tree_weights)
        for current in trees:
            sd.count_splits_on_tree(current)

        # trees counted ...
        self.assertEqual(sd.total_trees_counted, len(trees))
        # frequencies have not yet been calculated
        self.assertEqual(sd._trees_counted_for_freqs, 0)
        self.assertFalse(sd.is_mixed_rootings_counted())
        if is_rooted:
            self.assertTrue(sd.is_all_counted_trees_rooted())
        else:
            self.assertFalse(sd.is_all_counted_trees_rooted())
            self.assertTrue(sd.is_all_counted_trees_treated_as_unrooted() or sd.is_all_counted_trees_strictly_unrooted())

        # The distribution also counts trivial splits, so its size cannot be
        # compared directly with the size of the reference.
        expected_splits = list(splits_ref.keys())
        unmatched = set(sd.split_counts.keys())
        visited = []
        all_taxa_bitmask = sd.taxon_namespace.all_taxa_bitmask()
        for split in expected_splits:
            label = "{} (using '{}'): {}".format(tree_filename, splits_filename, split)
            self.assertAlmostEqual(sd.split_counts[split], splits_ref[split]["count"], 2,
                    label)
            self.assertAlmostEqual(sd[split], splits_ref[split]["frequency"], 2,
                    label)
            self.assertAlmostEqual(sd.split_frequencies[split], splits_ref[split]["frequency"], 2,
                    label)
            unmatched.discard(split)
            visited.append(split)
        self.assertEqual(len(visited), len(expected_splits))

        # ensure remaining splits (not given in PAUP splits file) are trivial ones (which are not tracked by PAUP)
        for split in unmatched:
            self.assertTrue(dendropy.Bipartition.is_trivial_bitmask(split, all_taxa_bitmask))