Beispiel #1
0
    def summarize_edge_lengths_on_tree(self,
            tree,
            split_distribution,
            summarization_fn=None,
            is_bipartitions_updated=False):
        """
        Sets the lengths of edges on ``tree`` (a |Tree| object) to the mean
        lengths of the corresponding edges on the input trees (in
        ``split_distribution``, a `SplitDistribution` object) being
        summarized.

        ``summarization_fn`` should take an iterable of floats, and return a
        float. If |None|, it defaults to calculating the mean
        (``lambda x: float(sum(x))/len(x)``).

        Returns ``tree``, with edge lengths modified in place.
        """
        # NOTE(review): the original body carried a block of unreachable
        # code after the return statement (referencing undefined names such
        # as ``leaves`` and ``con_tree``); it has been removed.
        if summarization_fn is None:
            summarization_fn = lambda x: float(sum(x)) / len(x)
        if not is_bipartitions_updated:
            tree.encode_bipartitions()
        edge_length_samples = split_distribution.split_edge_lengths
        for edge in tree.postorder_edge_iter():
            split = edge.bipartition.split_bitmask
            if split not in edge_length_samples:
                # split on target tree that was not found in any of the
                # input trees
                edge.length = 0.0
            elif edge_length_samples[split]:
                edge.length = summarization_fn(edge_length_samples[split])
            else:
                # no input trees had any edge lengths for this split
                edge.length = None
        return tree
Beispiel #2
0
    def summarize_edge_lengths_on_tree(self,
                                       tree,
                                       split_distribution,
                                       summarization_fn=None,
                                       is_bipartitions_updated=False):
        """
        Sets the lengths of edges on ``tree`` (a |Tree| object) to the mean
        lengths of the corresponding edges on the input trees (in
        ``split_distribution``, a `SplitDistribution` object) being
        summarized.

        ``summarization_fn`` should take an iterable of floats, and return a
        float. If |None|, it defaults to calculating the mean
        (``lambda x: float(sum(x))/len(x)``).

        Returns ``tree``, with edge lengths modified in place.
        """
        # NOTE(review): unreachable code that followed the return statement
        # in the original (referencing undefined names such as ``leaves``
        # and ``con_tree``) has been removed.
        if summarization_fn is None:
            summarization_fn = lambda x: float(sum(x)) / len(x)
        if not is_bipartitions_updated:
            tree.encode_bipartitions()
        for edge in tree.postorder_edge_iter():
            split = edge.bipartition.split_bitmask
            lengths = split_distribution.split_edge_lengths.get(split)
            if lengths:
                edge.length = summarization_fn(lengths)
            elif lengths is not None:
                # no input trees had any edge lengths for this split
                edge.length = None
            else:
                # split on target tree that was not found in any of the
                # input trees
                edge.length = 0.0
        return tree
Beispiel #3
0
 def _calculate_standardized_effect_size(self,
         statisticf_name,
         statisticf_kwargs=None,
         assemblage_memberships=None,
         null_model_type="taxa.label",
         num_randomization_replicates=1000,
         rng=None):
     """
     Compute the standardized effect size (SES) of the statistic named by
     ``statisticf_name`` for each assemblage, using a taxon-shuffling null
     model with ``num_randomization_replicates`` replicates.

     Returns a list of named-tuple results, one per assemblage, with the
     fields: obs, null_model_mean, null_model_sd, z, rank, p.
     """
     result_type = collections.namedtuple("PhylogeneticCommunityStandardizedEffectSizeStatisticCalculationResult",
             ["obs", "null_model_mean", "null_model_sd", "z", "rank", "p",])
     if assemblage_memberships is None:
         assemblage_memberships = [ set(self._mapped_taxa) ]
     if statisticf_kwargs is None:
         statisticf_kwargs = {}
     obs_values = {}
     null_samples = {}
     null_matrix = self.clone()
     assert null_matrix == self
     for rep_idx in range(num_randomization_replicates):
         null_matrix.shuffle_taxa(rng=rng)
         for comm_idx, membership in enumerate(assemblage_memberships):
             # Restrict the statistic to this assemblage's taxa; the
             # filter is consumed within this iteration.
             statisticf_kwargs["filter_fn"] = lambda taxon: taxon in membership
             if rep_idx == 0:
                 # The observed value is computed once, on the unshuffled
                 # (original) matrix.
                 obs_values[comm_idx] = getattr(self, statisticf_name)(**statisticf_kwargs)
                 null_samples[comm_idx] = []
             null_samples[comm_idx].append(
                     getattr(null_matrix, statisticf_name)(**statisticf_kwargs))
     results = []
     for comm_idx in range(len(assemblage_memberships)):
         obs = obs_values[comm_idx]
         samples = null_samples[comm_idx]
         null_mean, null_var = statistics.mean_and_sample_variance(samples)
         obs_rank = statistics.rank(
                 value_to_be_ranked=obs,
                 value_providing_rank=samples)
         if null_var > 0:
             null_sd = math.sqrt(null_var)
             z_score = (obs - null_mean) / null_sd
         else:
             # Degenerate null distribution: no spread, z undefined.
             null_sd = 0.0
             z_score = None
         results.append(result_type(
                 obs=obs,
                 null_model_mean=null_mean,
                 null_model_sd=null_sd,
                 z=z_score,
                 rank=obs_rank,
                 p=float(obs_rank) / len(samples)))
     return results
Beispiel #4
0
def calc_rfd_distribution(src_path):
    """
    Summarize the distribution of pairwise unweighted Robinson-Foulds
    distances among all trees in the NEXUS-format file at ``src_path``.

    Prints the mean, sample variance, and 5%/95% quantiles of the
    distances over all distinct (unordered) pairs of trees.
    """
    import itertools
    trees = dendropy.TreeList.get(
            path=src_path,
            schema="nexus")
    # All distinct unordered pairs; replaces the original hand-rolled
    # nested index loops (which also left an unused TaxonNamespace local).
    rf_dists = [
            treecompare.unweighted_robinson_foulds_distance(t1, t2)
            for t1, t2 in itertools.combinations(trees, 2)]
    mean, var = statistics.mean_and_sample_variance(rf_dists)
    print("mean = {}, var = {}, 5/95% quantile = {}".format(
        mean,
        var,
        statistics.quantile_5_95(rf_dists)))
Beispiel #5
0
    def _calc_community_ecology_stats(self,
            phylogenetic_distance_matrix,
            assemblage_memberships,
            assemblage_descriptions,
            report_character_state_specific_results=True,
            report_character_class_wide_results=True,
            ):
        """
        Harvest standardized-effect-size community-ecology statistics (MPD
        and MNTD, under both unweighted and weighted edge distances) for
        each assemblage, returning a dict mapping statistic name to value.
        """
        assert len(assemblage_descriptions) == len(assemblage_memberships)

        summary_statistics_suite = {}
        results_by_character_class = {}
        # z = score, p = p-value (turns out this is quite informative)
        stat_scores_to_be_harvested = ("obs", "z", "p",)
        for sstbh in stat_scores_to_be_harvested:
            results_by_character_class[sstbh] = collections.defaultdict(list)

        for edge_weighted_desc in ("unweighted", "weighted"):
            # BUG FIX: the original tested the truthiness of the (always
            # non-empty) description string, so weighted edge distances were
            # used on both passes; compare against "weighted" instead.
            is_weighted_edge_distances = (edge_weighted_desc == "weighted")
            for underlying_statistic_type_desc in ("mpd", "mntd"):
                if underlying_statistic_type_desc == "mpd":
                    stat_fn_name = "standardized_effect_size_mean_pairwise_distance"
                else:
                    stat_fn_name = "standardized_effect_size_mean_nearest_taxon_distance"
                stat_fn = getattr(phylogenetic_distance_matrix, stat_fn_name)
                results_group = stat_fn(
                    assemblage_memberships=assemblage_memberships,
                    is_weighted_edge_distances=is_weighted_edge_distances,
                    is_normalize_by_tree_size=True,
                    num_randomization_replicates=self.num_randomization_replicates,
                    )
                assert len(results_group) == len(assemblage_memberships)
                for result, assemblage_desc in zip(results_group, assemblage_descriptions):
                    # These name components are invariant over the harvested
                    # scores, so build them once per result.
                    character_class_statistic_prefix = self.stat_name_delimiter.join([
                        self.stat_name_prefix,
                        "community",
                        "by",
                        assemblage_desc["assemblage_basis_class_id"],
                        ])
                    statistic_subtype_desc = self.stat_name_delimiter.join([
                        edge_weighted_desc,
                        underlying_statistic_type_desc,
                        ])
                    character_class_statistic_key = (character_class_statistic_prefix, statistic_subtype_desc)
                    for ses_result_statistic in stat_scores_to_be_harvested:
                        ses_result_statistic_value = getattr(result, ses_result_statistic)
                        if ses_result_statistic_value is None:
                            continue
                        if report_character_state_specific_results:
                            character_state_statistic_name = self.stat_name_delimiter.join([
                                character_class_statistic_prefix,
                                assemblage_desc["assemblage_basis_state_id"],
                                statistic_subtype_desc,
                                ses_result_statistic,
                                ])
                            assert character_state_statistic_name not in summary_statistics_suite
                            summary_statistics_suite[character_state_statistic_name] = ses_result_statistic_value
                        if report_character_class_wide_results:
                            results_by_character_class[ses_result_statistic][character_class_statistic_key].append(ses_result_statistic_value)
        if report_character_class_wide_results:
            for ses_result_statistic in results_by_character_class:
                if not results_by_character_class[ses_result_statistic]:
                    continue
                for key, svalues in results_by_character_class[ses_result_statistic].items():
                    character_class_statistic_prefix, statistic_subtype_desc = key
                    mean_var = statistics.mean_and_sample_variance(svalues)
                    for s, sdesc in zip(mean_var, ("mean", "var")):
                        sn_title = self.stat_name_delimiter.join([
                            character_class_statistic_prefix,
                            sdesc,
                            statistic_subtype_desc,
                            ses_result_statistic,
                            ])
                        assert sn_title not in summary_statistics_suite
                        summary_statistics_suite[sn_title] = s
        return summary_statistics_suite
Beispiel #6
0
import collections
import dendropy
from dendropy.calculate import treemeasure
from dendropy.calculate import statistics

# Since we do not want to waste memory by keeping the actual trees around
# after we are done calculating the statistics, we use the tree yielder
# instead of:
#       dendropy.TreeList.get(
#           path="pythonidae.beast-mcmc.trees",
#           schema="nexus",
#           tree_offset=200)

# Accumulate per-tree shape statistics, discarding the first 200 trees
# of the sample as burn-in.
tree_stats = collections.defaultdict(list)
tree_source = dendropy.Tree.yield_from_files(
            files=["pythonidae.beast-mcmc.trees"],
            schema="nexus")
for tree_idx, tree in enumerate(tree_source):
    if tree_idx < 200:
        continue # burnin
    # Append in a fixed order so the report order is stable.
    for stat_key, stat_fn in (
            ("B1", treemeasure.B1),
            ("colless", treemeasure.colless_tree_imbalance),
            ("PBH", treemeasure.pybus_harvey_gamma),
            ("sackin", treemeasure.sackin_index),
            ("treeness", treemeasure.treeness),
            ):
        tree_stats[stat_key].append(stat_fn(tree))

# Summarize each statistic's posterior sample.
for key in tree_stats:
    values = tree_stats[key]
    mean, var = statistics.mean_and_sample_variance(values)
    hpd = statistics.empirical_hpd(values)
    print("{:15}: mean = {}, variance = {}, hpd = ({}, {})".format(key, mean, var, hpd[0], hpd[1]))
Beispiel #7
0
    def _calc_community_ecology_stats(
        self,
        phylogenetic_distance_matrix,
        assemblage_memberships,
        assemblage_descriptions,
        report_character_state_specific_results=True,
        report_character_class_wide_results=True,
    ):
        """
        Harvest standardized-effect-size community-ecology statistics (MPD
        and MNTD, under both unweighted and weighted edge distances) for
        each assemblage, returning an ordered dict mapping statistic name
        to value.

        Raises ``error.IncompleteStateSpaceOccupancyException`` when a
        statistic computation yields no results.
        """
        assert len(assemblage_descriptions) == len(assemblage_memberships)

        summary_statistics_suite = collections.OrderedDict()
        results_by_character_class = collections.OrderedDict()
        # z = score, p = p-value (turns out this is quite informative)
        stat_scores_to_be_harvested = (
            "obs",
            "z",
            "p",
        )
        for sstbh in stat_scores_to_be_harvested:
            results_by_character_class[sstbh] = collections.defaultdict(list)

        for edge_weighted_desc in ("unweighted", "weighted"):
            # BUG FIX: the original tested the truthiness of the (always
            # non-empty) description string, so weighted edge distances were
            # used on both passes; compare against "weighted" instead.
            is_weighted_edge_distances = (edge_weighted_desc == "weighted")
            for underlying_statistic_type_desc in ("mpd", "mntd"):
                if underlying_statistic_type_desc == "mpd":
                    stat_fn_name = "standardized_effect_size_mean_pairwise_distance"
                else:
                    stat_fn_name = "standardized_effect_size_mean_nearest_taxon_distance"
                stat_fn = getattr(phylogenetic_distance_matrix, stat_fn_name)
                try:
                    results_group = stat_fn(
                        assemblage_memberships=assemblage_memberships,
                        is_weighted_edge_distances=is_weighted_edge_distances,
                        is_normalize_by_tree_size=True,
                        num_randomization_replicates=self.num_randomization_replicates,
                    )
                except dendropy.utility.error.SingleTaxonAssemblageException:
                    # Single-taxon assemblages are only tolerable when
                    # state-specific results were not requested.
                    if not report_character_state_specific_results:
                        continue
                    raise
                if not report_character_state_specific_results:
                    assert len(results_group) == len(assemblage_memberships)
                if len(results_group) == 0:
                    raise error.IncompleteStateSpaceOccupancyException
                for result, assemblage_desc in zip(results_group,
                                                   assemblage_descriptions):
                    # These name components are invariant over the harvested
                    # scores, so build them once per result.
                    character_class_statistic_prefix = self.stat_name_delimiter.join([
                        self.stat_name_prefix,
                        "community",
                        "by",
                        assemblage_desc["assemblage_basis_class_id"],
                    ])
                    statistic_subtype_desc = self.stat_name_delimiter.join([
                        edge_weighted_desc,
                        underlying_statistic_type_desc,
                    ])
                    character_class_statistic_key = (character_class_statistic_prefix,
                                                     statistic_subtype_desc)
                    for ses_result_statistic in stat_scores_to_be_harvested:
                        ses_result_statistic_value = getattr(result, ses_result_statistic)
                        if ses_result_statistic_value is None:
                            continue
                        if report_character_state_specific_results:
                            character_state_statistic_name = self.stat_name_delimiter.join([
                                character_class_statistic_prefix,
                                assemblage_desc["assemblage_basis_state_id"],
                                statistic_subtype_desc,
                                ses_result_statistic,
                            ])
                            assert character_state_statistic_name not in summary_statistics_suite
                            summary_statistics_suite[character_state_statistic_name] = ses_result_statistic_value
                        if report_character_class_wide_results:
                            results_by_character_class[ses_result_statistic][
                                character_class_statistic_key].append(ses_result_statistic_value)
        if report_character_class_wide_results:
            for ses_result_statistic in results_by_character_class:
                if not results_by_character_class[ses_result_statistic]:
                    continue
                for key, svalues in results_by_character_class[ses_result_statistic].items():
                    character_class_statistic_prefix, statistic_subtype_desc = key
                    mean_var = statistics.mean_and_sample_variance(svalues)
                    for s, sdesc in zip(mean_var, ("mean", "var")):
                        sn_title = self.stat_name_delimiter.join([
                            character_class_statistic_prefix,
                            sdesc,
                            statistic_subtype_desc,
                            ses_result_statistic,
                        ])
                        assert sn_title not in summary_statistics_suite
                        summary_statistics_suite[sn_title] = s
        return summary_statistics_suite
Beispiel #8
0
    def tree_from_splits(self,
            split_distribution,
            min_freq=0.5,
            rooted=None,
            include_edge_lengths=True):
        """Returns a consensus tree from splits in ``split_distribution``.

        Splits with frequency >= ``min_freq`` (or all splits, if
        ``min_freq`` is None) are added to the consensus tree, most
        frequent first. Split support is mapped onto each node of the
        resulting tree; if ``include_edge_lengths`` is True, each edge's
        length is set to the mean of the corresponding edge lengths
        observed in the input trees.

        Raises an exception if support is to be mapped as edge lengths
        while edge lengths are also to be summarized onto the tree.
        """
        taxon_namespace = split_distribution.taxon_namespace
        if self.weighted_splits:
            split_freqs = split_distribution.weighted_split_frequencies
        else:
            split_freqs = split_distribution.split_frequencies
        if rooted is None:
            if split_distribution.is_all_counted_trees_rooted():
                rooted = True
            # BUG FIX: the method must be *called*; the bare bound-method
            # object is always truthy, so the original forced rooted=False
            # whenever the counted trees were not all rooted.
            elif split_distribution.is_all_counted_trees_strictly_unrooted():
                rooted = False
        if self.support_as_edge_lengths and include_edge_lengths:
            raise Exception("Cannot map support as edge lengths if edge lengths are to be set on consensus tree")
        to_try_to_add = []
        _almost_one = lambda x: abs(x - 1.0) <= 0.0000001
        for s in split_freqs:
            freq = split_freqs[s]
            if (min_freq is None) or (freq >= min_freq) or (_almost_one(min_freq) and _almost_one(freq)):
                to_try_to_add.append((freq, s))
        # Most frequent splits are attempted first.
        to_try_to_add.sort(reverse=True)
        splits_for_tree = [i[1] for i in to_try_to_add]
        con_tree = dendropy.Tree.from_split_bitmasks(
                split_bitmasks=splits_for_tree,
                taxon_namespace=taxon_namespace,
                is_rooted=rooted)
        con_tree.encode_bipartitions()

        # NOTE(review): the original also precomputed a ``split_edge_lengths``
        # dict here that was never read afterwards; that dead work has been
        # removed.
        for node in con_tree.postorder_node_iter():
            split = node.edge.bipartition.split_bitmask
            if split in split_freqs:
                self.map_split_support_to_node(node=node, split_support=split_freqs[split])
            if include_edge_lengths and split in split_distribution.split_edge_lengths:
                edges = split_distribution.split_edge_lengths[split]
                if len(edges) > 0:
                    # only the mean is used; the sample variance is discarded
                    node.edge.length = mean_and_sample_variance(edges)[0]
                else:
                    node.edge.length = None

        return con_tree
Beispiel #9
0
    def tree_from_splits(self,
                         split_distribution,
                         min_freq=0.5,
                         rooted=None,
                         include_edge_lengths=True):
        """Returns a consensus tree from splits in ``split_distribution``.

        Splits with frequency >= ``min_freq`` (or all splits, if
        ``min_freq`` is None) are added to the consensus tree, most
        frequent first. Split support is mapped onto each node of the
        resulting tree; if ``include_edge_lengths`` is True, each edge's
        length is set to the mean of the corresponding edge lengths
        observed in the input trees.

        Raises an exception if support is to be mapped as edge lengths
        while edge lengths are also to be summarized onto the tree.
        """
        taxon_namespace = split_distribution.taxon_namespace
        if self.weighted_splits:
            split_freqs = split_distribution.weighted_split_frequencies
        else:
            split_freqs = split_distribution.split_frequencies
        if rooted is None:
            if split_distribution.is_all_counted_trees_rooted():
                rooted = True
            # BUG FIX: the method must be *called*; the bare bound-method
            # object is always truthy, so the original forced rooted=False
            # whenever the counted trees were not all rooted.
            elif split_distribution.is_all_counted_trees_strictly_unrooted():
                rooted = False
        if self.support_as_edge_lengths and include_edge_lengths:
            raise Exception(
                "Cannot map support as edge lengths if edge lengths are to be set on consensus tree"
            )
        to_try_to_add = []
        _almost_one = lambda x: abs(x - 1.0) <= 0.0000001
        for s in split_freqs:
            freq = split_freqs[s]
            if (min_freq is None) or (freq >= min_freq) or (
                    _almost_one(min_freq) and _almost_one(freq)):
                to_try_to_add.append((freq, s))
        # Most frequent splits are attempted first.
        to_try_to_add.sort(reverse=True)
        splits_for_tree = [i[1] for i in to_try_to_add]
        con_tree = dendropy.Tree.from_split_bitmasks(
            split_bitmasks=splits_for_tree,
            taxon_namespace=taxon_namespace,
            is_rooted=rooted)
        con_tree.encode_bipartitions()

        # NOTE(review): the original also precomputed a ``split_edge_lengths``
        # dict here that was never read afterwards; that dead work has been
        # removed.
        for node in con_tree.postorder_node_iter():
            split = node.edge.bipartition.split_bitmask
            if split in split_freqs:
                self.map_split_support_to_node(
                    node=node, split_support=split_freqs[split])
            if include_edge_lengths and split in split_distribution.split_edge_lengths:
                edges = split_distribution.split_edge_lengths[split]
                if len(edges) > 0:
                    # only the mean is used; the sample variance is discarded
                    node.edge.length = mean_and_sample_variance(edges)[0]
                else:
                    node.edge.length = None

        return con_tree
Beispiel #10
0
import dendropy
from dendropy.calculate import treemeasure
from dendropy.calculate import statistics

# Since we do not want to waste memory by keeping the actual trees around
# after we are done calculating the statistics, we use the tree yielder
# instead of:
#       dendropy.TreeList.get(
#           path="pythonidae.beast-mcmc.trees",
#           schema="nexus",
#           tree_offset=200)

# BUG FIX: ``collections`` is used below but was never imported in this
# example's header; import it here so the script runs.
import collections

# Accumulate per-tree shape statistics, discarding the first 200 trees
# of the sample as burn-in.
tree_stats = collections.defaultdict(list)
for tree_idx, tree in enumerate(
        dendropy.Tree.yield_from_files(files=["pythonidae.beast-mcmc.trees"],
                                       schema="nexus")):
    if tree_idx < 200:
        continue  # burnin
    tree_stats["B1"].append(treemeasure.B1(tree))
    tree_stats["colless"].append(treemeasure.colless_tree_imbalance(tree))
    tree_stats["PBH"].append(treemeasure.pybus_harvey_gamma(tree))
    tree_stats["sackin"].append(treemeasure.sackin_index(tree))
    tree_stats["treeness"].append(treemeasure.treeness(tree))

# Summarize each statistic's posterior sample.
for key in tree_stats:
    values = tree_stats[key]
    mean, var = statistics.mean_and_sample_variance(values)
    hpd = statistics.empirical_hpd(values)
    print("{:15}: mean = {}, variance = {}, hpd = ({}, {})".format(
        key, mean, var, hpd[0], hpd[1]))