def summarize_edge_lengths_on_tree(self,
        tree,
        split_distribution,
        summarization_fn=None,
        is_bipartitions_updated=False):
    """
    Sets the lengths of edges on ``tree`` (a |Tree| object) to the
    summarized lengths of the corresponding edges on the input trees (in
    ``split_distribution``, a `SplitDistribution` object). By default the
    summary is the mean.

    ``summarization_fn`` should take an iterable of floats and return a
    float. If |None|, it defaults to calculating the mean
    (``lambda x: float(sum(x))/len(x)``).
    """
    if summarization_fn is None:
        summarization_fn = lambda x: float(sum(x)) / len(x)
    if not is_bipartitions_updated:
        tree.encode_bipartitions()
    for edge in tree.postorder_edge_iter():
        split = edge.bipartition.split_bitmask
        if split in split_distribution.split_edge_lengths:
            lengths = split_distribution.split_edge_lengths[split]
            if lengths:
                edge.length = summarization_fn(lengths)
            else:
                # no input trees had any edge lengths for this split
                edge.length = None
        else:
            # split on target tree that was not found in any of the input
            # trees
            edge.length = 0.0
    return tree
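# A minimal sketch of the ``summarization_fn`` contract assumed above: any
# callable taking an iterable of floats and returning a float can be passed
# in place of the default mean. The values below are illustrative only.
import statistics as pystats  # Python standard library, not dendropy.calculate.statistics

lengths = [0.010, 0.012, 0.080]
default_summary = float(sum(lengths)) / len(lengths)  # what the method computes when summarization_fn is None
robust_summary = pystats.median(lengths)              # e.g., pass pystats.median as summarization_fn for a
                                                      # summary less sensitive to outlying edge lengths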
def _calculate_standardized_effect_size(self,
        statisticf_name,
        statisticf_kwargs=None,
        assemblage_memberships=None,
        null_model_type="taxa.label",
        num_randomization_replicates=1000,
        rng=None):
    result_type = collections.namedtuple(
            "PhylogeneticCommunityStandardizedEffectSizeStatisticCalculationResult",
            ["obs", "null_model_mean", "null_model_sd", "z", "rank", "p"])
    if assemblage_memberships is None:
        # default: a single assemblage consisting of all mapped taxa
        assemblage_memberships = [set(self._mapped_taxa)]
    if statisticf_kwargs is None:
        statisticf_kwargs = {}
    observed_stat_values = {}
    null_model_stat_values = {}
    # shuffle a clone so that the observed matrix is left untouched
    null_model_matrix = self.clone()
    assert null_model_matrix == self
    for rep_idx in range(num_randomization_replicates):
        null_model_matrix.shuffle_taxa(rng=rng)
        for community_idx, assemblage_membership in enumerate(assemblage_memberships):
            filter_fn = lambda taxon: taxon in assemblage_membership
            statisticf_kwargs["filter_fn"] = filter_fn
            if rep_idx == 0:
                # the observed value only needs to be calculated once
                observed_stat_values[community_idx] = getattr(self, statisticf_name)(**statisticf_kwargs)
                null_model_stat_values[community_idx] = []
            stat_value = getattr(null_model_matrix, statisticf_name)(**statisticf_kwargs)
            null_model_stat_values[community_idx].append(stat_value)
    results = []
    for community_idx, assemblage_membership in enumerate(assemblage_memberships):
        obs_value = observed_stat_values[community_idx]
        stat_values = null_model_stat_values[community_idx]
        null_model_mean, null_model_var = statistics.mean_and_sample_variance(stat_values)
        rank = statistics.rank(
                value_to_be_ranked=obs_value,
                value_providing_rank=stat_values)
        if null_model_var > 0:
            null_model_sd = math.sqrt(null_model_var)
            z = (obs_value - null_model_mean) / null_model_sd
        else:
            # degenerate null distribution: z is undefined
            null_model_sd = 0.0
            z = None
        p = float(rank) / len(stat_values)
        result = result_type(
                obs=obs_value,
                null_model_mean=null_model_mean,
                null_model_sd=null_model_sd,
                z=z,
                rank=rank,
                p=p)
        results.append(result)
    return results
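# A self-contained sketch of the standardized-effect-size arithmetic used
# above, with toy numbers standing in for the randomization replicates:
# z standardizes the observed statistic against the null distribution, and
# p is the rank-based tail proportion.
import math
from dendropy.calculate import statistics

null_values = [2.0, 2.5, 3.0, 3.5, 4.0]  # stand-ins for null-model replicate values
obs = 3.8
null_mean, null_var = statistics.mean_and_sample_variance(null_values)
if null_var > 0:
    z = (obs - null_mean) / math.sqrt(null_var)
else:
    z = None  # degenerate null distribution
rank = statistics.rank(value_to_be_ranked=obs, value_providing_rank=null_values)
p = float(rank) / len(null_values)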
import dendropy
from dendropy.calculate import statistics
from dendropy.calculate import treecompare

def calc_rfd_distribution(src_path):
    # trees being compared must share a single taxon namespace
    tns = dendropy.TaxonNamespace()
    trees = dendropy.TreeList.get(
            path=src_path,
            schema="nexus",
            taxon_namespace=tns)
    rf_dists = []
    # all unordered pairs of distinct trees
    for idx1, t1 in enumerate(trees[:-1]):
        for t2 in trees[idx1+1:]:
            rfd = treecompare.unweighted_robinson_foulds_distance(t1, t2)
            rf_dists.append(rfd)
    mean, var = statistics.mean_and_sample_variance(rf_dists)
    print("mean = {}, var = {}, 5/95% quantile = {}".format(
        mean, var, statistics.quantile_5_95(rf_dists)))
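# The Robinson-Foulds comparison above relies on all trees sharing one
# TaxonNamespace; treecompare raises an error for trees with different
# namespaces. A minimal runnable check with illustrative newick strings:
import dendropy
from dendropy.calculate import treecompare

tns = dendropy.TaxonNamespace()
t1 = dendropy.Tree.get(data="((A,B),(C,D));", schema="newick", taxon_namespace=tns)
t2 = dendropy.Tree.get(data="((A,C),(B,D));", schema="newick", taxon_namespace=tns)
print(treecompare.unweighted_robinson_foulds_distance(t1, t2))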
def _calc_community_ecology_stats(self,
        phylogenetic_distance_matrix,
        assemblage_memberships,
        assemblage_descriptions,
        report_character_state_specific_results=True,
        report_character_class_wide_results=True,
        ):
    assert len(assemblage_descriptions) == len(assemblage_memberships)
    summary_statistics_suite = {}
    results_by_character_class = {}
    # z = score, p = p-value (turns out this is quite informative)
    stat_scores_to_be_harvested = ("obs", "z", "p")
    for sstbh in stat_scores_to_be_harvested:
        results_by_character_class[sstbh] = collections.defaultdict(list)
    for edge_weighted_desc in ("unweighted", "weighted"):
        is_weighted_edge_distances = (edge_weighted_desc == "weighted")
        for underlying_statistic_type_desc in ("mpd", "mntd"):
            if underlying_statistic_type_desc == "mpd":
                stat_fn_name = "standardized_effect_size_mean_pairwise_distance"
            else:
                stat_fn_name = "standardized_effect_size_mean_nearest_taxon_distance"
            stat_fn = getattr(phylogenetic_distance_matrix, stat_fn_name)
            results_group = stat_fn(
                    assemblage_memberships=assemblage_memberships,
                    is_weighted_edge_distances=is_weighted_edge_distances,
                    is_normalize_by_tree_size=True,
                    num_randomization_replicates=self.num_randomization_replicates,
                    )
            assert len(results_group) == len(assemblage_memberships)
            for result, assemblage_desc in zip(results_group, assemblage_descriptions):
                for ses_result_statistic in stat_scores_to_be_harvested:
                    character_class_statistic_prefix = self.stat_name_delimiter.join([
                            self.stat_name_prefix,
                            "community",
                            "by",
                            assemblage_desc["assemblage_basis_class_id"],
                            ])
                    statistic_subtype_desc = self.stat_name_delimiter.join([
                            edge_weighted_desc,
                            underlying_statistic_type_desc,
                            # assemblage_desc["assemblage_basis_state_id"],
                            ])
                    character_class_statistic_key = (character_class_statistic_prefix, statistic_subtype_desc)
                    ses_result_statistic_value = getattr(result, ses_result_statistic)
                    if ses_result_statistic_value is None:
                        continue
                    if report_character_state_specific_results:
                        character_state_statistic_name = self.stat_name_delimiter.join([
                                character_class_statistic_prefix,
                                assemblage_desc["assemblage_basis_state_id"],
                                statistic_subtype_desc,
                                ses_result_statistic,
                                ])
                        assert character_state_statistic_name not in summary_statistics_suite
                        summary_statistics_suite[character_state_statistic_name] = ses_result_statistic_value
                    if report_character_class_wide_results:
                        results_by_character_class[ses_result_statistic][character_class_statistic_key].append(ses_result_statistic_value)
    if report_character_class_wide_results:
        for ses_result_statistic in results_by_character_class:
            if len(results_by_character_class[ses_result_statistic]) == 0:
                continue
            for key in results_by_character_class[ses_result_statistic]:
                character_class_statistic_prefix, statistic_subtype_desc = key
                svalues = results_by_character_class[ses_result_statistic][key]
                mean_var = statistics.mean_and_sample_variance(svalues)
                for s, sdesc in zip(mean_var, ("mean", "var")):
                    sn_title = self.stat_name_delimiter.join([
                            character_class_statistic_prefix,
                            sdesc,
                            statistic_subtype_desc,
                            ses_result_statistic,
                            ])
                    assert sn_title not in summary_statistics_suite
                    summary_statistics_suite[sn_title] = s
    return summary_statistics_suite
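# How the summary-statistic keys assembled above look in practice, shown
# with hypothetical values for the instance attributes and assemblage ids:
stat_name_prefix = "predictor"      # hypothetical self.stat_name_prefix
stat_name_delimiter = "."           # hypothetical self.stat_name_delimiter
character_class_statistic_prefix = stat_name_delimiter.join([
        stat_name_prefix, "community", "by", "habitat"])  # "habitat" = assemblage_basis_class_id
character_state_statistic_name = stat_name_delimiter.join([
        character_class_statistic_prefix,
        "forest",                   # assemblage_basis_state_id
        "weighted.mpd",             # statistic_subtype_desc
        "z",                        # harvested score
        ])
# -> "predictor.community.by.habitat.forest.weighted.mpd.z"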
import collections

import dendropy
from dendropy.calculate import treemeasure
from dendropy.calculate import statistics

# Since we do not want to waste memory by keeping the actual trees around
# after we are done calculating the statistics, we use the tree yielder
# instead of:
#   dendropy.TreeList.get(
#           path="pythonidae.beast-mcmc.trees",
#           schema="nexus",
#           tree_offset=200)
tree_stats = collections.defaultdict(list)
for tree_idx, tree in enumerate(dendropy.Tree.yield_from_files(
        files=["pythonidae.beast-mcmc.trees"],
        schema="nexus")):
    if tree_idx < 200:
        continue  # burn-in
    tree_stats["B1"].append(treemeasure.B1(tree))
    tree_stats["colless"].append(treemeasure.colless_tree_imbalance(tree))
    tree_stats["PBH"].append(treemeasure.pybus_harvey_gamma(tree))
    tree_stats["sackin"].append(treemeasure.sackin_index(tree))
    tree_stats["treeness"].append(treemeasure.treeness(tree))
for key in tree_stats:
    values = tree_stats[key]
    mean, var = statistics.mean_and_sample_variance(values)
    hpd = statistics.empirical_hpd(values)
    print("{:15}: mean = {}, variance = {}, hpd = ({}, {})".format(
        key, mean, var, hpd[0], hpd[1]))
def _calc_community_ecology_stats(self,
        phylogenetic_distance_matrix,
        assemblage_memberships,
        assemblage_descriptions,
        report_character_state_specific_results=True,
        report_character_class_wide_results=True,
        ):
    assert len(assemblage_descriptions) == len(assemblage_memberships)
    summary_statistics_suite = collections.OrderedDict()
    results_by_character_class = collections.OrderedDict()
    # z = score, p = p-value (turns out this is quite informative)
    stat_scores_to_be_harvested = ("obs", "z", "p")
    for sstbh in stat_scores_to_be_harvested:
        results_by_character_class[sstbh] = collections.defaultdict(list)
    for edge_weighted_desc in ("unweighted", "weighted"):
        is_weighted_edge_distances = (edge_weighted_desc == "weighted")
        for underlying_statistic_type_desc in ("mpd", "mntd"):
            if underlying_statistic_type_desc == "mpd":
                stat_fn_name = "standardized_effect_size_mean_pairwise_distance"
            else:
                stat_fn_name = "standardized_effect_size_mean_nearest_taxon_distance"
            stat_fn = getattr(phylogenetic_distance_matrix, stat_fn_name)
            try:
                results_group = stat_fn(
                        assemblage_memberships=assemblage_memberships,
                        is_weighted_edge_distances=is_weighted_edge_distances,
                        is_normalize_by_tree_size=True,
                        num_randomization_replicates=self.num_randomization_replicates,
                        )
            except dendropy.utility.error.SingleTaxonAssemblageException:
                if not report_character_state_specific_results:
                    continue
                else:
                    raise
            if not report_character_state_specific_results:
                assert len(results_group) == len(assemblage_memberships)
            if len(results_group) == 0:
                raise error.IncompleteStateSpaceOccupancyException
            for result, assemblage_desc in zip(results_group, assemblage_descriptions):
                for ses_result_statistic in stat_scores_to_be_harvested:
                    character_class_statistic_prefix = self.stat_name_delimiter.join([
                            self.stat_name_prefix,
                            "community",
                            "by",
                            assemblage_desc["assemblage_basis_class_id"],
                            ])
                    statistic_subtype_desc = self.stat_name_delimiter.join([
                            edge_weighted_desc,
                            underlying_statistic_type_desc,
                            # assemblage_desc["assemblage_basis_state_id"],
                            ])
                    character_class_statistic_key = (character_class_statistic_prefix, statistic_subtype_desc)
                    ses_result_statistic_value = getattr(result, ses_result_statistic)
                    if ses_result_statistic_value is None:
                        continue
                    if report_character_state_specific_results:
                        character_state_statistic_name = self.stat_name_delimiter.join([
                                character_class_statistic_prefix,
                                assemblage_desc["assemblage_basis_state_id"],
                                statistic_subtype_desc,
                                ses_result_statistic,
                                ])
                        assert character_state_statistic_name not in summary_statistics_suite
                        summary_statistics_suite[character_state_statistic_name] = ses_result_statistic_value
                    if report_character_class_wide_results:
                        results_by_character_class[ses_result_statistic][character_class_statistic_key].append(ses_result_statistic_value)
    if report_character_class_wide_results:
        for ses_result_statistic in results_by_character_class:
            if len(results_by_character_class[ses_result_statistic]) == 0:
                continue
            for key in results_by_character_class[ses_result_statistic]:
                character_class_statistic_prefix, statistic_subtype_desc = key
                svalues = results_by_character_class[ses_result_statistic][key]
                mean_var = statistics.mean_and_sample_variance(svalues)
                for s, sdesc in zip(mean_var, ("mean", "var")):
                    sn_title = self.stat_name_delimiter.join([
                            character_class_statistic_prefix,
                            sdesc,
                            statistic_subtype_desc,
                            ses_result_statistic,
                            ])
                    assert sn_title not in summary_statistics_suite
                    summary_statistics_suite[sn_title] = s
    return summary_statistics_suite
def tree_from_splits(self,
        split_distribution,
        min_freq=0.5,
        rooted=None,
        include_edge_lengths=True):
    """
    Returns a consensus tree built from the splits in ``split_distribution``.
    Splits are added in order of decreasing frequency, subject to
    ``min_freq``. If ``include_edge_lengths`` is True, each edge on the
    consensus tree is assigned the mean of the corresponding edge lengths
    on the input trees.
    """
    taxon_namespace = split_distribution.taxon_namespace
    if self.weighted_splits:
        split_freqs = split_distribution.weighted_split_frequencies
    else:
        split_freqs = split_distribution.split_frequencies
    if rooted is None:
        if split_distribution.is_all_counted_trees_rooted():
            rooted = True
        elif split_distribution.is_all_counted_trees_strictly_unrooted():
            rooted = False
    if self.support_as_edge_lengths and include_edge_lengths:
        raise Exception("Cannot map support as edge lengths if edge lengths are to be set on consensus tree")
    to_try_to_add = []
    _almost_one = lambda x: abs(x - 1.0) <= 1e-7
    for s in split_freqs:
        freq = split_freqs[s]
        if ((min_freq is None)
                or (freq >= min_freq)
                or (_almost_one(min_freq) and _almost_one(freq))):
            to_try_to_add.append((freq, s))
    # add higher-frequency splits first
    to_try_to_add.sort(reverse=True)
    splits_for_tree = [i[1] for i in to_try_to_add]
    con_tree = dendropy.Tree.from_split_bitmasks(
            split_bitmasks=splits_for_tree,
            taxon_namespace=taxon_namespace,
            is_rooted=rooted)
    con_tree.encode_bipartitions()
    for node in con_tree.postorder_node_iter():
        split = node.edge.bipartition.split_bitmask
        if split in split_freqs:
            self.map_split_support_to_node(node=node, split_support=split_freqs[split])
        if include_edge_lengths and split in split_distribution.split_edge_lengths:
            edges = split_distribution.split_edge_lengths[split]
            if len(edges) > 0:
                mean, var = mean_and_sample_variance(edges)
                node.edge.length = mean
            else:
                node.edge.length = None
    return con_tree
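# For the common case, the same consensus can be obtained through the
# higher-level dendropy API, which builds the split distribution
# internally. A minimal sketch; the input file name is illustrative.
import dendropy

trees = dendropy.TreeList.get(path="input-trees.nex", schema="nexus")
con_tree = trees.consensus(min_freq=0.5)
print(con_tree.as_string(schema="newick"))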