def testChangeTranslate(self):
    """Two trees blocks with different TRANSLATE tables must yield the same tree.

    Both blocks describe the same topology but number the taxa
    differently; after parsing, the trees must share a taxa block and have
    a symmetric difference of zero.
    """
    nexus_text = """#NEXUS Begin taxa ; dimensions ntax = 4; taxlabels a b c d ; end; begin trees; translate 1 a, 2 b, 3 c, 4 d; tree t = (1,2,(3,4)); end; begin trees; translate 1 d, 2 b, 3 c, 4 a; tree t = (4,2,(3,1)); end; """
    dataset = Dataset()
    dataset.read(StringIO(nexus_text), format="NEXUS")
    first_tree = dataset.trees_blocks[0][0]
    second_tree = dataset.trees_blocks[1][0]
    self.assertEqual(first_tree.taxa_block, second_tree.taxa_block)
    # Splits are encoded on the second tree first, as in the original test.
    encode_splits(second_tree)
    encode_splits(first_tree)
    distance = treedists.symmetric_difference(first_tree, second_tree)
    self.assertEqual(distance, 0)
def store_chars(char_block, format, dest=None):
    """Deprecated: write a single character block through `store_dataset`.

    Emits a deprecation notice, wraps `char_block` in a new Dataset and
    returns whatever `store_dataset` returns for the given `format` and
    `dest`.
    """
    deprecation("'dataio.store_chars()' is deprecated: use 'write()' method of a Dataset object instead", logger_obj=_LOG)
    wrapper = Dataset()
    wrapper.add_char_block(char_block=char_block)
    result = store_dataset(dataset=wrapper, format=format, dest=dest)
    return result
def testPostOrderAgain(self):
    """Post-order iteration must visit every child before its parent."""
    newick = '((((t49:0.0299,t41:0.0299):0.42017,(((t39:0.231767,(t7:0.213631,(t36:0.099739,((t12:0.015337,t26:0.015337):0.036523,t8:0.05186):0.047879):0.113892):0.018137):0.062275,((t43:0.240402,(((t28:0.157872,((t1:0.132296,t46:0.132296):0.016383,(t14:0.12343,t3:0.12343):0.025249):0.009193):0.01427,((t24:0.151816,(t25:0.106007,(t32:0.035143,(t47:0.025662,t6:0.025662):0.00948):0.070864):0.045809):0.013127,t40:0.164943):0.007199):0.010417,(t34:0.159553,(((t21:0.061001,t16:0.061001):0.001043,t44:0.062044):0.011229,(t18:0.001174,t5:0.001174):0.072098):0.086281):0.023006):0.057844):0.027253,(t27:0.066806,t50:0.066806):0.20085):0.026387):0.117517,(t23:0.259881,(t4:0.072245,((t45,t30):0.029839,t31:0.029839):0.042406):0.187635):0.151679):0.038511):0.069304,(t19:0.25004,t17:0.25004):0.269335):0.215312,((((t37:0.036909,t13:0.036909):0.244651,t2:0.28156):0.262824,((t15:0.11244,(t33:0.076665,t10:0.076665):0.035775):0.124232,t9:0.236671):0.307712):0.057112,(((t22:0.065248,t42:0.065248):0.037237,t38:0.102485):0.182409,(t48:0.187654,((t20:0.03615,(t29:0.01626,t11:0.01626):0.01989):0.039894,t35:0.076044):0.11161):0.09724):0.316601):0.133191);'
    dataset = Dataset()
    tree = dataset.trees_from_string(string=newick, format="NEWICK")[0]
    tree.debug_check_tree()
    visited = set()
    for node in tree.postorder_node_iter():
        # Every child must already have been yielded by the iterator.
        for child in node.child_nodes():
            self.assertTrue(child in visited)
        visited.add(node)
def trees_from_newick(nl, taxa_block=None):
    """Read every item of `nl` as NEWICK into a single Dataset.

    `nl` is an iterable whose items are either newick strings or file-like
    objects containing newick strings. When `taxa_block` is given, the
    returned Dataset is created with it as its only taxa block.
    """
    reader = get_reader(NEWICK)
    dataset = Dataset() if taxa_block is None else Dataset(taxa_blocks=[taxa_block])
    for item in nl:
        # Plain strings are wrapped so the reader always sees a file object.
        stream = StringIO(item) if isinstance(item, str) else item
        reader.read_dataset(file_obj=stream, dataset=dataset)
    return dataset
def store_trees(trees_collection, format, dest=None):
    """Deprecated: write a collection of trees through `store_dataset`.

    `trees_collection` may be a TreesBlock, which is used as is, or any
    iterable of trees, which is copied into a new TreesBlock whose taxa
    are then normalized. Returns whatever `store_dataset` returns.
    """
    deprecation("'dataio.store_trees()' is deprecated: use 'write()' method of a Dataset object instead", logger_obj=_LOG)
    if not isinstance(trees_collection, TreesBlock):
        block = TreesBlock()
        for member in trees_collection:
            block.append(member)
        block.normalize_taxa()
    else:
        block = trees_collection
    wrapper = Dataset()
    wrapper.add_trees_block(trees_block=block)
    return store_dataset(dataset=wrapper, format=format, dest=dest)
def testKTBEvolveLinearBounce(self):
    """Run the 'KTB' rate simulation with linear-bounce constraints and
    rescale every edge length by the simulated mean edge rate."""
    rng = DebuggingRandom()
    newick = "((t5:1611.75,t6:1611.75):3922.93,((t4:1043.81,(t2:754.11,t1:754.11):2896.9):6584.0,t3:1702.21):3832.47);"
    tree = Dataset().trees_from_string(string=newick, format="NEWICK")[0]
    root = tree.seed_node
    root.mutation_rate = 1e-5
    root.mean_edge_rate = root.mutation_rate
    settings = {
        "roeotroe": 0.01,
        "min_rate": 1.0e-6,
        "max_rate": 1.0e-3,
        "model": 'KTB',
        "time_attr": 'edge_length',
        "val_attr": 'mutation_rate',
        "mean_val_attr": 'mean_edge_rate',
        "constrain_rate_mode": "linear_bounce",
    }
    simulate_continuous(root, rng, **settings)
    for node in tree.preorder_node_iter():
        if node.edge_length is None:
            continue
        node.edge_length *= node.mean_edge_rate
def _write_garli_input(self, full_dataset):
    """Write a NEXUS file holding only the first `self.curr_n_taxa` taxa.

    `full_dataset` must contain exactly one taxa block and one character
    block, with one character row per taxon. A new Dataset is built from
    the first `self.curr_n_taxa` taxa and their rows of the character
    matrix, written to `self.datafname`, and returned.
    """
    assert len(full_dataset.taxa_blocks) == 1
    taxa = full_dataset.taxa_blocks[0]
    assert (len(full_dataset.char_blocks) == 1)
    characters = full_dataset.char_blocks[0]
    assert (len(characters) == len(taxa))
    culled_taxa = TaxaBlock(taxa[:self.curr_n_taxa])
    # Shallow-copy the character block, then give the copy its own matrix so
    # that clearing and refilling it does not touch the matrix of
    # `characters`.
    culled_chars = copy.copy(characters)
    culled_chars.taxa_block = culled_taxa
    culled_chars.matrix = copy.copy(characters.matrix)
    culled_chars.matrix.clear()
    template_matrix = characters.matrix
    for taxon in culled_taxa:
        culled_chars.matrix[taxon] = template_matrix[taxon]
    culled = Dataset()
    culled.taxa_blocks.append(culled_taxa)
    culled.char_blocks.append(culled_chars)
    o = open(self.datafname, "w")
    try:
        nexusWriter = nexus.NexusWriter()
        nexusWriter.write_dataset(culled, o)
    finally:
        # Close the handle even if the writer raises.
        o.close()
    return culled
def __init__(self, sources=None, core_iterator=None, taxa_block=None, dataset=None, format=None, from_index=0, progress_func=None, **kwargs):
    """An iterable collection of trees from multiple sources.

    `sources` is a list of tree sources; each can be either a file path
    (a str) or a file-like object. When omitted, a new empty list is used
    for this instance.

    Either `core_iterator` or `format` must be given; if both are None a
    ValueError is raised. When `core_iterator` is None the instance is
    flagged (`using_data_it`) to read trees through `dataset`, which
    defaults to a new Dataset.

    `from_index` can be used to skip the first `from_index` trees from
    _EACH_ file; this is useful if you want to discard a certain number of
    trees from the beginning of each run as burn-in.

    If `progress_func` is specified, verbose messages will be sent to it
    for every tree processed.

    Any extra keyword arguments are stored in `iterator_kwargs`.
    """
    if dataset is None:
        self.dataset = Dataset()
    else:
        self.dataset = dataset
    self.taxa_block = taxa_block
    self.format = format
    if core_iterator is None:
        if self.format is None:
            raise ValueError("Either 'core_iterator' or 'format' flags must be used")
        self.using_data_it = True
    else:
        self.using_data_it = False
    self._core_iterator = core_iterator
    self.progress_func = progress_func
    # A `[]` default would be one list shared by every instance created
    # without `sources`; build a fresh list per instance instead.
    self.sources = [] if sources is None else sources
    self.total_trees_read = 0
    self.total_trees_ignored = 0
    self.total_num_sources_read = 0
    self.from_index = from_index
    self.iterator_kwargs = kwargs
def testTaxaWithUnderscoreRead(self):
    """Reading trees after the data file must not change the taxon count.

    The first taxa block holds 64 taxa after reading "rana.nex" and must
    still hold 64 after the trees in "rana.tre" are read into the same
    Dataset.
    """
    data_path = dendropy.tests.data_source_path("rana.nex")
    trees_path = dendropy.tests.data_source_path("rana.tre")
    expected_taxa = 64
    dataset = Dataset()
    dataset.read(open(data_path, "rU"), format="NEXUS")
    self.assertEqual(len(dataset.taxa_blocks[0]), expected_taxa)
    dataset.read_trees(open(trees_path, "rU"), format="NEXUS")
    self.assertEqual(len(dataset.taxa_blocks[0]), expected_taxa)
def test3Feb2009MajRuleBug(self):
    """Majority-rule trees built from two tree files must be identical.

    Splits are counted separately for the trees of each file, a tree is
    built from each split distribution with `min_freq=0.5`, and the two
    results must have a symmetric difference of zero.
    """
    enabled = is_test_enabled(TestLevel.NORMAL, _LOG, module_name=__name__, message="skipping sumtree argument tests")
    if not enabled:
        return
    path_one = dendropy.tests.data_source_path("maj-rule-bug1.tre")
    path_two = dendropy.tests.data_source_path("maj-rule-bug2.tre")
    dataset = Dataset()
    block_one = dataset.read_trees(open(path_one, "rU"), format="NEXUS", encode_splits=True, rooted=RootingInterpretation.UNROOTED)
    block_two = dataset.read_trees(open(path_two, "rU"), format="NEXUS", encode_splits=True, rooted=RootingInterpretation.UNROOTED)
    shared_taxa = dataset.taxa_blocks[0]
    self.assertEqual(shared_taxa, block_two[0].taxa_block)
    dist_one = SplitDistribution(taxa_block=shared_taxa)
    dist_two = SplitDistribution(taxa_block=shared_taxa)
    # Splits were already encoded by read_trees(encode_splits=True).
    for tree_one, tree_two in itertools.izip(block_one, block_two):
        dist_one.count_splits_on_tree(tree_one)
        dist_two.count_splits_on_tree(tree_two)
    summarizer = TreeSummarizer()
    n_times = 1  # keep set to 1 except for benchmarking tree_from_splits
    for _ in xrange(n_times):
        maj_rule_one = summarizer.tree_from_splits(dist_one, min_freq=0.5)
        maj_rule_two = summarizer.tree_from_splits(dist_two, min_freq=0.5)
    self.assertEqual(0, symmetric_difference(maj_rule_one, maj_rule_two))
def testStoreEdgeLens(self):
    """Edge lengths must survive parsing and a str() round trip.

    Every node other than the seed node must have a non-None edge length,
    both on the tree parsed from the literal and on the tree re-parsed
    from `str(tree)`.
    """
    newick = '((((((t4:0.06759,t32:0.06759):0.198252,t39:0.265842):0.135924,((t9:0.244134,(((t23:0.014408,t49:0.014408):0.040121,t16:0.05453):0.156614,t2:0.211144):0.03299):0.013224,t34:0.257358):0.144408):0.112116,(((t45:0.110713,(t47:0.019022,t8:0.019022):0.09169):0.163042,((t1:0.168924,(((((t15:0.012356,t30:0.012356):0.000247,t18:0.012603):0.037913,t22:0.050516):0.076193,(t44:0.071301,t46:0.071301):0.055407):0.037072,(((((t50:0.00000,t29:0.00000):0.01744,t35:0.017441):0.066422,t10:0.083863):0.047231,((t6:0.012709,(t26:0.00805,t40:0.00805):0.004659):0.043941,t11:0.05665):0.074443):0.008316,t31:0.13941):0.024371):0.005144):0.025169,t33:0.194093):0.079662):0.183823,(t48:0.343218,((t41:0.032738,t27:0.032738):0.229887,((t5:0.030394,t43:0.030394):0.204863,((((t14:0.028794,t24:0.028794):0.002007,t3:0.030801):0.181488,t38:0.212289):0.017427,(t17:0.01869,t25:0.01869):0.211027):0.005541):0.027368):0.080592):0.11436):0.056304):0.078832,(t21:0.107754,t13:0.107754):0.48496):0.114273,((t36:0.352531,(((t12:0.042324,t7:0.042324):0.155519,t19:0.197843):0.016322,(t37:0.12614,t28:0.12614):0.088025):0.138366):0.147704,(t42:0.088633,t20:0.088633):0.411601):0.206753)'

    def check_edge_lengths(candidate, log):
        # Report each offending node through `log`, then fail on it.
        for nd in candidate.postorder_node_iter():
            if nd is candidate.seed_node:
                continue
            if nd.edge.length is None:
                log("%s has edge length of None" % trees.format_node(nd))
            self.assertTrue(nd.edge.length is not None)

    parsed = Dataset().trees_from_string(string=newick, format="NEWICK")[0]
    check_edge_lengths(parsed, _LOG.info)
    serialized = str(parsed)
    reparsed = Dataset().trees_from_string(string=serialized, format="NEWICK")[0]
    check_edge_lengths(reparsed, _LOG.warn)
# NOTE(review): this is a fragment; `n_tax` and the enclosing scope are defined
# outside the visible source, and the final loop may continue past this point.
# Bit masks over `n_tax` taxa: the highest single bit, and all bits set.
last_split = 1 << (n_tax - 1)
all_taxa_bitmask = (1 << n_tax) - 1
add_trees_fn = sys.argv[2]
# NOTE(review): this check runs after sys.argv[2] has already been read.
assert len(sys.argv) > 3
# Every argument from the fourth onward names a neighborhood tree file.
nbhd_tree_groups = []
for nbhd_tree_fn in sys.argv[3:]:
    nbhd_tree_f = open(nbhd_tree_fn, 'rU')
    nbhd_tree_groups.extend(read_add_tree_groups(nbhd_tree_f))
add_trees_f = open(add_trees_fn, 'rU')
all_tree_groups = read_add_tree_groups(add_trees_f)
# Taxa are labelled "1" .. str(n_tax).
taxa_block = TaxaBlock([str(i + 1) for i in range(n_tax)])
taxa_blocks = [taxa_block]
dataset = Dataset(taxa_blocks=taxa_blocks)
#setting this > 1.0 means that more trees are retained to the neighborhood search stage
score_diff_multiplier = 1.0
commands = []
# first we collect all of the ParsedTree objects into all_parsed_trees and we
# call encode_splits so that we can look up split info on each tree
all_tree_groups.extend(nbhd_tree_groups)
all_parsed_trees = []
for g in all_tree_groups:
    for el in g:
        # Parse the element's newick string into `dataset` and encode the
        # splits of the resulting tree.
        newick_string = el.tree_string
        newick_stream = StringIO(newick_string)
        t = dataset.read_trees(newick_stream, format="newick")[0]
        encode_splits(t)
class MultiFileTreeIterator(object):
    """Iterate over the trees found in several sources, one source at a time.

    Running totals (`total_trees_read`, `total_trees_ignored`,
    `total_num_sources_read`) accumulate across iterations; the `curr_*`
    counterparts are reset at the start of every iteration.
    """

    def __init__(self, sources=None, core_iterator=None, taxa_block=None, dataset=None, format=None, from_index=0, progress_func=None, **kwargs):
        """An iterable collection of trees from multiple sources.

        `sources` is a list of tree sources; each can be either a file
        path (a str) or a file-like object. When omitted, a new empty list
        is used for this instance.

        Either `core_iterator` or `format` must be given; if both are None
        a ValueError is raised. When `core_iterator` is None, trees are
        read with `dataset.iterate_over_trees` (a new Dataset is created
        if `dataset` is None); otherwise `core_iterator` is called for
        each source.

        `from_index` can be used to skip the first `from_index` trees from
        _EACH_ file; this is useful if you want to discard a certain
        number of trees from the beginning of each run as burn-in.

        If `progress_func` is specified, verbose messages will be sent to
        it for every tree processed.

        Extra keyword arguments are forwarded to the underlying iterator.
        """
        if dataset is None:
            self.dataset = Dataset()
        else:
            self.dataset = dataset
        self.taxa_block = taxa_block
        self.format = format
        if core_iterator is None:
            if self.format is None:
                raise ValueError("Either 'core_iterator' or 'format' flags must be used")
            self.using_data_it = True
        else:
            self.using_data_it = False
        self._core_iterator = core_iterator
        self.progress_func = progress_func
        # A `[]` default would be one list shared by every instance created
        # without `sources`; build a fresh list per instance instead.
        self.sources = [] if sources is None else sources
        self.total_trees_read = 0
        self.total_trees_ignored = 0
        self.total_num_sources_read = 0
        self.from_index = from_index
        self.iterator_kwargs = kwargs

    def __iter__(self):
        """Yield trees from each source in order, skipping the first
        `from_index` trees of every source.

        Sources given as path strings are opened here and closed once they
        are exhausted or the generator is closed; file-like sources are
        left open for the caller.
        """
        si = self.from_index
        tb = self.taxa_block
        progress_func = self.progress_func
        self.curr_trees_read = 0
        self.curr_trees_ignored = 0
        self.curr_num_sources_read = 0
        for source_ind, tree_source in enumerate(self.sources):
            opened_here = isinstance(tree_source, str)
            if opened_here:
                fo = open(tree_source, "rU")
            else:
                fo = tree_source
            try:
                if progress_func:
                    current_file_note = "Tree file %d of %d: " % (source_ind + 1, len(self.sources))
                self.curr_num_sources_read += 1
                self.total_num_sources_read += 1
                for n, tree in enumerate(self._raw_iter(fo, tb)):
                    if (not si) or (n >= si):
                        # With no taxa block supplied, the first yielded
                        # tree's taxa block is passed on to later sources.
                        if tb is None:
                            tb = tree.taxa_block
                        self.total_trees_read += 1
                        self.curr_trees_read += 1
                        if progress_func:
                            progress_func("%sProcessing tree %d" % (current_file_note, (n+1)))
                        yield tree
                    else:
                        self.total_trees_ignored += 1
                        self.curr_trees_ignored += 1
                        if progress_func:
                            progress_func("%sSkipping tree %d (# to skip=%d)" % (current_file_note, (n+1), si))
            finally:
                # Only close handles this method opened itself.
                if opened_here:
                    fo.close()

    def _raw_iter(self, fo, tb):
        """Yield every tree in `fo`, via the dataset or the core iterator."""
        if self.using_data_it:
            for tree in self.dataset.iterate_over_trees(fo, taxa_block=tb, format=self.format, **(self.iterator_kwargs)):
                yield tree
        else:
            for tree in self._core_iterator(fo, taxa_block=tb, file_format=self.format, **(self.iterator_kwargs)):
                yield tree
def main_cli():
    """Compare splits sampled from tree files against a reference tree.

    Command-line entry point. It parses the options, checks that the
    sampled-tree files and the reference tree file exist, reads the single
    reference tree, and counts the splits found on the sampled trees. The
    non-trivial splits are then visited in decreasing order of frequency
    and greedily added to a star tree when compatible with it. For every
    frequency level one tab-separated row is printed with the cumulative
    false-positive / false-negative split counts (and their sum) relative
    to the reference tree, both for the compatible splits only and for all
    splits seen so far.

    Exits via `sys.exit` with a message when an input is missing or the
    reference file does not hold exactly one tree.
    """
    description = '%s %s ' % (_program_name, _program_version)
    usage = "%prog [options] <TREES FILE> [<TREES FILE> [<TREES FILE> [...]]]"

    parser = OptionParser(usage=usage, add_help_option=True, version=_program_version, description=description)
    parser.add_option('-r', '--reference',
                      dest='reference_tree_filepath',
                      default=None,
                      help="path to file containing the reference (true) tree")
    # Note the inverted storage: passing -v stores False into `quiet`.
    parser.add_option('-v', '--verbose',
                      action='store_false',
                      dest='quiet',
                      default=True,
                      help="Verbose mode")

    (opts, args) = parser.parse_args()

    ###################################################
    # Support file idiot checking

    sampled_filepaths = []
    for fpath in args:
        fpath = os.path.expanduser(os.path.expandvars(fpath))
        if not os.path.exists(fpath):
            sys.exit('Sampled trees file not found: "%s"' % fpath)
        sampled_filepaths.append(fpath)
    if not sampled_filepaths:
        sys.exit("Expecting arguments indicating files that contain sampled trees")

    ###################################################
    # Lots of other idiot-checking ...

    # target tree
    if opts.reference_tree_filepath is None:
        sys.exit("A reference tree must be specified (use -h to see all options)")
    reference_tree_filepath = os.path.expanduser(os.path.expandvars(opts.reference_tree_filepath))
    if not os.path.exists(reference_tree_filepath):
        sys.exit('Reference tree file not found: "%s"\n' % reference_tree_filepath)

    d = Dataset()
    # NOTE(review): confirm that `schema=` is the keyword expected by the
    # read_trees() implementation in use.
    ref_file = open(reference_tree_filepath, "rU")
    try:
        ref_trees = d.read_trees(ref_file, schema="NEXUS")
    finally:
        ref_file.close()

    if len(ref_trees) != 1:
        sys.exit("Expecting one reference tree")
    ref_tree = ref_trees[0]
    splits.encode_splits(ref_tree)
    assert(len(d.taxa_blocks) == 1)
    taxa = d.taxa_blocks[0]

    ###################################################
    # Main work begins here: Count the splits

    tsum = treesum.TreeSummarizer()
    tsum.burnin = 0
    if opts.quiet:
        tsum.verbose = False
        tsum.write_message = None
    else:
        tsum.verbose = True
        tsum.write_message = sys.stderr.write

    _LOG.debug("### COUNTING SPLITS ###\n")
    split_distribution = splits.SplitDistribution(taxa_block=taxa)
    # NOTE(review): confirm that MultiFileTreeIterator accepts `filepaths`;
    # a constructor that names this parameter differently and takes
    # **kwargs would silently iterate over no files.
    tree_source = MultiFileTreeIterator(filepaths=sampled_filepaths, core_iterator=nexus.iterate_over_trees)
    tsum.count_splits_on_trees(tree_source, split_distribution)

    report = []
    report.append("%d trees read from %d files." % (tsum.total_trees_read, len(sampled_filepaths)))
    report.append("%d trees ignored in total." % (tree_source.total_trees_ignored))
    report.append("%d trees considered in total for split support assessment." % (tsum.total_trees_counted))
    report.append("%d unique taxa across all trees." % len(split_distribution.taxa_block))
    num_splits, num_unique_splits, num_nt_splits, num_nt_unique_splits = split_distribution.splits_considered()
    report.append("%d unique splits out of %d total splits counted." % (num_unique_splits, num_splits))
    report.append("%d unique non-trivial splits out of %d total non-trivial splits counted." % (num_nt_unique_splits, num_nt_splits))

    _LOG.debug("\n".join(report))

    # Start from a star tree; map each leaf's clade mask to its leaf node.
    con_tree = treegen.star_tree(taxa)
    taxa_mask = taxa.all_taxa_bitmask()
    splits.encode_splits(con_tree)
    leaves = con_tree.leaf_nodes()

    to_leaf_dict = {}
    for leaf in leaves:
        to_leaf_dict[leaf.edge.clade_mask] = leaf
    unrooted = True
    n_read = float(tsum.total_trees_read)
    sp_list = []
    for split, count in split_distribution.split_counts.iteritems():
        freq = count/n_read
        if not splits.is_trivial_split(split, taxa_mask):
            m = split & taxa_mask
            if (m != taxa_mask) and ((m-1) & m):  # if not root (i.e., all "1's") and not singleton (i.e., one "1")
                if unrooted:
                    c = (~m) & taxa_mask
                    if (c-1) & c:  # not singleton (i.e., one "0")
                        # Use whichever of the mask and its complement has
                        # the lowest bit unset.
                        if 1 & m:
                            k = c
                        else:
                            k = m
                        sp_list.append((freq, k, m))
                else:
                    sp_list.append((freq, m, m))
    sp_list.sort(reverse=True)

    root = con_tree.seed_node
    root_edge = root.edge

    # 1.1 is above any possible frequency, so the first split always opens
    # a new frequency level.
    curr_freq = 1.1
    curr_all_splits_list = []
    curr_compat_splits_list = []
    all_splits_by_freq = []
    compat_splits_by_freq = []

    # Now when we add splits in order, we will do a greedy, extended majority-rule consensus tree
    for freq, split_to_add, split_in_dict in sp_list:
        if abs(curr_freq-freq) > 0.000001:
            # dropping down to the next lowest freq
            curr_l = [freq, []]
            curr_all_splits_list = curr_l[1]
            all_splits_by_freq.append(curr_l)
            curr_l = [freq, []]
            curr_compat_splits_list = curr_l[1]
            compat_splits_by_freq.append(curr_l)
            curr_freq = freq

        curr_all_splits_list.append(split_to_add)

        if (split_to_add & root_edge.clade_mask) != split_to_add:
            continue
        # Walk up from one leaf of the split to the first node whose clade
        # mask contains the whole split.
        lb = splits.lowest_bit_only(split_to_add)
        one_leaf = to_leaf_dict[lb]
        parent_node = one_leaf
        while (split_to_add & parent_node.edge.clade_mask) != split_to_add:
            parent_node = parent_node.parent_node
        if parent_node is None or parent_node.edge.clade_mask == split_to_add:
            continue  # split is not in tree, or already in tree.

        new_node = trees.Node()
        new_node_children = []
        new_edge = new_node.edge
        new_edge.clade_mask = 0
        for child in parent_node.child_nodes():
            # might need to modify the following if rooted splits
            # are used
            cecm = child.edge.clade_mask
            if (cecm & split_to_add):
                assert cecm != split_to_add
                new_edge.clade_mask |= cecm
                new_node_children.append(child)
        # Check to see if we have accumulated all of the bits that we
        #   needed, but none that we don't need.
        if new_edge.clade_mask == split_to_add:
            for child in new_node_children:
                parent_node.remove_child(child)
                new_node.add_child(child)
            parent_node.add_child(new_node)
            con_tree.split_edges[split_to_add] = new_edge
            curr_compat_splits_list.append(split_to_add)

    # Non-trivial splits of the reference tree, with the same lowest-bit
    # normalization as above.
    ref_set = set()
    for s in ref_tree.split_edges.iterkeys():
        m = s & taxa_mask
        if 1 & m:
            k = (~m) & taxa_mask
        else:
            k = m
        if not splits.is_trivial_split(k, taxa_mask):
            ref_set.add(k)

    all_set = set()
    compat_set = set()

    _LOG.debug("%d edges is the reference tree" % (len(ref_set)))

    print("freq\tcompatFP\tcompatFN\tcompatSD\tallFP\tallFN\tallSD")
    for all_el, compat_el in itertools.izip(all_splits_by_freq, compat_splits_by_freq):
        freq = all_el[0]
        all_sp = all_el[1]
        all_set.update(all_sp)
        all_fn = len(ref_set - all_set)
        all_fp = len(all_set - ref_set)
        compat_sp = compat_el[1]
        compat_set.update(compat_sp)
        compat_fn = len(ref_set - compat_set)
        compat_fp = len(compat_set - ref_set)

        print("%f\t%d\t%d\t%d\t%d\t%d\t%d" % (freq, compat_fp, compat_fn, compat_fp + compat_fn, all_fp, all_fn, all_fp + all_fn))
# NOTE(review): this is a fragment; `opts`, `conf_file` and the enclosing scope
# are defined outside the visible source.
VERBOSE = True
_LOG.setLevel(logging.DEBUG)
data_file = opts.data_filepath
intree_file = opts.intree_filepath
if data_file is None:
    sys.exit("Data file must be specified")
if intree_file is None:
    sys.exit("Input tree file must be specified")
# All three input files must exist before any of them is opened.
for f in [data_file, intree_file, conf_file]:
    if not os.path.exists(f):
        sys.exit("%s does not exist" % f)
garli = GarliConf()
garli.read_garli_conf(open(conf_file, "rU"))
full_dataset = Dataset()
full_dataset.read(open(data_file, "rU"), format="NEXUS")
assert(len(full_dataset.taxa_blocks) == 1)
taxa = full_dataset.taxa_blocks[0]
full_taxa_mask = taxa.all_taxa_bitmask()
# Record, for each taxon, its 1-based position in the taxa block as a string.
for n, taxon in enumerate(taxa):
    TAXON_TO_TRANSLATE[taxon] = str(n + 1)
_LOG.debug("%s = full_taxa_mask" % bin(full_taxa_mask))
garli.datafname = os.path.join("data.nex")
raw_trees = full_dataset.read_trees(open(intree_file, "rU"), format="NEXUS")
# Fails when no trees were read from the input tree file.
assert(raw_trees)
current_taxon_mask = None
def testPHGamma(self):
    """`pybus_harvey_gamma` on a fixed six-taxon tree must be
    approximately 0.546276."""
    newick = "((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):0.028969):0.065840,t3:0.170221):0.383247);"
    parsed = Dataset().trees_from_string(string=newick, format="NEWICK")
    gamma = pybus_harvey_gamma(parsed[0])
    assert_approx_equal(gamma, 0.546276)
def main_cli():
    """Compare splits sampled from tree files against a reference tree.

    Command-line entry point. It parses the options, checks that the
    sampled-tree files and the reference tree file exist, reads the single
    reference tree, and counts the splits found on the sampled trees. The
    non-trivial splits are then visited in decreasing order of frequency
    and greedily added to a star tree when compatible with it. For every
    frequency level one tab-separated row is printed with the cumulative
    false-positive / false-negative split counts (and their sum) relative
    to the reference tree, both for the compatible splits only and for all
    splits seen so far.

    Exits via `sys.exit` with a message when an input is missing or the
    reference file does not hold exactly one tree.
    """
    description = '%s %s ' % (_program_name, _program_version)
    usage = "%prog [options] <TREES FILE> [<TREES FILE> [<TREES FILE> [...]]]"

    parser = OptionParser(usage=usage,
                          add_help_option=True,
                          version=_program_version,
                          description=description)
    parser.add_option('-r', '--reference',
                      dest='reference_tree_filepath',
                      default=None,
                      help="path to file containing the reference (true) tree")
    # Note the inverted storage: passing -v stores False into `quiet`.
    parser.add_option('-v', '--verbose',
                      action='store_false',
                      dest='quiet',
                      default=True,
                      help="Verbose mode")

    (opts, args) = parser.parse_args()

    ###################################################
    # Support file idiot checking

    sampled_filepaths = []
    for fpath in args:
        fpath = os.path.expanduser(os.path.expandvars(fpath))
        if not os.path.exists(fpath):
            sys.exit('Sampled trees file not found: "%s"' % fpath)
        sampled_filepaths.append(fpath)
    if not sampled_filepaths:
        sys.exit(
            "Expecting arguments indicating files that contain sampled trees")

    ###################################################
    # Lots of other idiot-checking ...

    # target tree
    if opts.reference_tree_filepath is None:
        sys.exit(
            "A reference tree must be specified (use -h to see all options)")
    reference_tree_filepath = os.path.expanduser(
        os.path.expandvars(opts.reference_tree_filepath))
    if not os.path.exists(reference_tree_filepath):
        sys.exit('Reference tree file not found: "%s"\n' %
                 reference_tree_filepath)

    d = Dataset()
    # NOTE(review): confirm that `schema=` is the keyword expected by the
    # read_trees() implementation in use.
    ref_file = open(reference_tree_filepath, "rU")
    try:
        ref_trees = d.read_trees(ref_file, schema="NEXUS")
    finally:
        ref_file.close()

    if len(ref_trees) != 1:
        sys.exit("Expecting one reference tree")
    ref_tree = ref_trees[0]
    splits.encode_splits(ref_tree)
    assert (len(d.taxa_blocks) == 1)
    taxa = d.taxa_blocks[0]

    ###################################################
    # Main work begins here: Count the splits

    tsum = treesum.TreeSummarizer()
    tsum.burnin = 0
    if opts.quiet:
        tsum.verbose = False
        tsum.write_message = None
    else:
        tsum.verbose = True
        tsum.write_message = sys.stderr.write

    _LOG.debug("### COUNTING SPLITS ###\n")
    split_distribution = splits.SplitDistribution(taxa_block=taxa)
    # NOTE(review): confirm that MultiFileTreeIterator accepts `filepaths`;
    # a constructor that names this parameter differently and takes
    # **kwargs would silently iterate over no files.
    tree_source = MultiFileTreeIterator(filepaths=sampled_filepaths,
                                        core_iterator=nexus.iterate_over_trees)
    tsum.count_splits_on_trees(tree_source, split_distribution)

    report = []
    report.append("%d trees read from %d files." %
                  (tsum.total_trees_read, len(sampled_filepaths)))
    report.append("%d trees ignored in total." %
                  (tree_source.total_trees_ignored))
    report.append(
        "%d trees considered in total for split support assessment." %
        (tsum.total_trees_counted))
    report.append("%d unique taxa across all trees." %
                  len(split_distribution.taxa_block))
    num_splits, num_unique_splits, num_nt_splits, num_nt_unique_splits = split_distribution.splits_considered(
    )
    report.append("%d unique splits out of %d total splits counted." %
                  (num_unique_splits, num_splits))
    report.append(
        "%d unique non-trivial splits out of %d total non-trivial splits counted."
        % (num_nt_unique_splits, num_nt_splits))

    _LOG.debug("\n".join(report))

    # Start from a star tree; map each leaf's clade mask to its leaf node.
    con_tree = treegen.star_tree(taxa)
    taxa_mask = taxa.all_taxa_bitmask()
    splits.encode_splits(con_tree)
    leaves = con_tree.leaf_nodes()

    to_leaf_dict = {}
    for leaf in leaves:
        to_leaf_dict[leaf.edge.clade_mask] = leaf
    unrooted = True
    n_read = float(tsum.total_trees_read)
    sp_list = []
    for split, count in split_distribution.split_counts.iteritems():
        freq = count / n_read
        if not splits.is_trivial_split(split, taxa_mask):
            m = split & taxa_mask
            if (m != taxa_mask) and (
                (m - 1) & m
            ):  # if not root (i.e., all "1's") and not singleton (i.e., one "1")
                if unrooted:
                    c = (~m) & taxa_mask
                    if (c - 1) & c:  # not singleton (i.e., one "0")
                        # Use whichever of the mask and its complement has
                        # the lowest bit unset.
                        if 1 & m:
                            k = c
                        else:
                            k = m
                        sp_list.append((freq, k, m))
                else:
                    sp_list.append((freq, m, m))
    sp_list.sort(reverse=True)

    root = con_tree.seed_node
    root_edge = root.edge

    # 1.1 is above any possible frequency, so the first split always opens
    # a new frequency level.
    curr_freq = 1.1
    curr_all_splits_list = []
    curr_compat_splits_list = []
    all_splits_by_freq = []
    compat_splits_by_freq = []

    # Now when we add splits in order, we will do a greedy, extended majority-rule consensus tree
    for freq, split_to_add, split_in_dict in sp_list:
        if abs(curr_freq - freq) > 0.000001:
            # dropping down to the next lowest freq
            curr_l = [freq, []]
            curr_all_splits_list = curr_l[1]
            all_splits_by_freq.append(curr_l)
            curr_l = [freq, []]
            curr_compat_splits_list = curr_l[1]
            compat_splits_by_freq.append(curr_l)
            curr_freq = freq

        curr_all_splits_list.append(split_to_add)

        if (split_to_add & root_edge.clade_mask) != split_to_add:
            continue
        # Walk up from one leaf of the split to the first node whose clade
        # mask contains the whole split.
        lb = splits.lowest_bit_only(split_to_add)
        one_leaf = to_leaf_dict[lb]
        parent_node = one_leaf
        while (split_to_add & parent_node.edge.clade_mask) != split_to_add:
            parent_node = parent_node.parent_node
        if parent_node is None or parent_node.edge.clade_mask == split_to_add:
            continue  # split is not in tree, or already in tree.

        new_node = trees.Node()
        new_node_children = []
        new_edge = new_node.edge
        new_edge.clade_mask = 0
        for child in parent_node.child_nodes():
            # might need to modify the following if rooted splits
            # are used
            cecm = child.edge.clade_mask
            if (cecm & split_to_add):
                assert cecm != split_to_add
                new_edge.clade_mask |= cecm
                new_node_children.append(child)
        # Check to see if we have accumulated all of the bits that we
        #   needed, but none that we don't need.
        if new_edge.clade_mask == split_to_add:
            for child in new_node_children:
                parent_node.remove_child(child)
                new_node.add_child(child)
            parent_node.add_child(new_node)
            con_tree.split_edges[split_to_add] = new_edge
            curr_compat_splits_list.append(split_to_add)

    # Non-trivial splits of the reference tree, with the same lowest-bit
    # normalization as above.
    ref_set = set()
    for s in ref_tree.split_edges.iterkeys():
        m = s & taxa_mask
        if 1 & m:
            k = (~m) & taxa_mask
        else:
            k = m
        if not splits.is_trivial_split(k, taxa_mask):
            ref_set.add(k)

    all_set = set()
    compat_set = set()

    _LOG.debug("%d edges is the reference tree" % (len(ref_set)))

    print("freq\tcompatFP\tcompatFN\tcompatSD\tallFP\tallFN\tallSD")
    for all_el, compat_el in itertools.izip(all_splits_by_freq,
                                            compat_splits_by_freq):
        freq = all_el[0]
        all_sp = all_el[1]
        all_set.update(all_sp)
        all_fn = len(ref_set - all_set)
        all_fp = len(all_set - ref_set)
        compat_sp = compat_el[1]
        compat_set.update(compat_sp)
        compat_fn = len(ref_set - compat_set)
        compat_fp = len(compat_set - ref_set)

        print("%f\t%d\t%d\t%d\t%d\t%d\t%d" % (freq, compat_fp, compat_fn,
                                              compat_fp + compat_fn, all_fp,
                                              all_fn, all_fp + all_fn))
# NOTE(review): this is a fragment; `n_tax` and the enclosing scope are defined
# outside the visible source, and the code may continue past this point.
add_trees_fn = sys.argv[2]
# Neighborhood tree files are optional: every argument from the fourth onward
# names one; with none given, `nbhd_tree_groups` stays None.
if len(sys.argv) > 3:
    nbhd_tree_groups = []
    for nbhd_tree_fn in sys.argv[3:]:
        nbhd_tree_f = open(nbhd_tree_fn, 'rU')
        nbhd_tree_groups.extend(read_add_tree_groups(nbhd_tree_f))
else:
    nbhd_tree_groups = None
add_trees_f = open(add_trees_fn, 'rU')
all_tree_groups = read_add_tree_groups(add_trees_f)
# Taxa are labelled "1" .. str(n_tax).
taxa_block = TaxaBlock([str(i+1) for i in range(n_tax)])
taxa_blocks = [taxa_block]
dataset = Dataset(taxa_blocks=taxa_blocks)
#setting this > 1.0 means that more trees are retained to the neighborhood search stage
score_diff_multiplier = 1.0
commands = []
# NOTE(review): the source was flattened onto one line; the nesting of the
# loop below under this `if`, and of the final debug call inside the outer
# loop, is reconstructed and should be confirmed against the original file.
if nbhd_tree_groups is None:
    _LOG.debug("Invocation of igarli_neighborhood.py with only one tree file -- need to set up initial neighborhood searches")
    for g in all_tree_groups:
        for el in g:
            # Parse the element's newick string into `dataset`, encode the
            # splits of the resulting tree and attach it to the element.
            newick_string = el.tree_string
            newick_stream = StringIO(newick_string)
            t = dataset.read_trees(newick_stream, format="newick")[0]
            encode_splits(t)
            el.tree = t
        _LOG.debug("len(g) = %d" % len(g))
# NOTE(review): this is a fragment; `opts` and the enclosing scope are defined
# outside the visible source.
conf_file = opts.conf
if conf_file is None:
    sys.exit("Expecting a conf file template for GARLI")
data_file = opts.data_filepath
intree_file = opts.intree_filepath
if data_file is None:
    sys.exit("Data file must be specified")
if intree_file is None:
    sys.exit("Input tree file must be specified")
# All three input files must exist before any of them is opened.
for f in [data_file, intree_file, conf_file]:
    if not os.path.exists(f):
        sys.exit("%s does not exist" % f)
# Read the conf template and write it straight back out to stdout.
conf = read_garli_conf(open(conf_file, "rU"))
write_garli_conf(sys.stdout, conf)
d = Dataset()
d.read(open(data_file, "rU"), format="NEXUS")
taxa = d.taxa_blocks[0]
full_taxa_mask = taxa.all_taxa_bitmask()
# Record, for each taxon, its 1-based position in the taxa block as a string.
for n, taxon in enumerate(taxa):
    TAXON_TO_TRANSLATE[taxon] = str(n + 1)
_LOG.debug("%s = full_taxa_mask" % bin(full_taxa_mask))
# NOTE(review): these checks run after taxa_blocks[0] / char_blocks[0] have
# already been indexed.
assert (len(d.taxa_blocks) == 1)
characters = d.char_blocks[0]
assert (len(d.char_blocks) == 1)
assert (len(characters) == len(taxa))
inp_trees = d.read_trees(open(intree_file, "rU"), format="NEXUS")
# Fails when no trees were read from the input tree file.
assert (inp_trees)
current_taxon_mask = None
# Every input tree must use the very taxa block read from the data file.
for tree in inp_trees:
    assert tree.taxa_block is taxa
# NOTE(review): this is a fragment; `opts`, `conf_file` and the enclosing scope
# are defined outside the visible source.
VERBOSE = True
_LOG.setLevel(logging.DEBUG)
data_file = opts.data_filepath
intree_file = opts.intree_filepath
if data_file is None:
    sys.exit("Data file must be specified")
if intree_file is None:
    sys.exit("Input tree file must be specified")
# All three input files must exist before any of them is opened.
for f in [data_file, intree_file, conf_file]:
    if not os.path.exists(f):
        sys.exit("%s does not exist" % f)
garli = GarliConf()
garli.read_garli_conf(open(conf_file, "rU"))
full_dataset = Dataset()
full_dataset.read(open(data_file, "rU"), format="NEXUS")
assert (len(full_dataset.taxa_blocks) == 1)
taxa = full_dataset.taxa_blocks[0]
full_taxa_mask = taxa.all_taxa_bitmask()
# Record, for each taxon, its 1-based position in the taxa block as a string.
for n, taxon in enumerate(taxa):
    TAXON_TO_TRANSLATE[taxon] = str(n + 1)
_LOG.debug("%s = full_taxa_mask" % bin(full_taxa_mask))
garli.datafname = os.path.join("data.nex")
raw_trees = full_dataset.read_trees(open(intree_file, "rU"),
                                    format="NEXUS")
# Fails when no trees were read from the input tree file.
assert (raw_trees)
current_taxon_mask = None