Code example #1
    def testChangeTranslate(self):
        f = """#NEXUS
Begin taxa ;
    dimensions ntax = 4;
    taxlabels a b c d ;
end;
begin trees;
    translate 
        1 a,
        2 b,
        3 c,
        4 d;
    tree t = (1,2,(3,4));
end;
begin trees;
    translate 
        1 d,
        2 b,
        3 c,
        4 a;
    tree t = (4,2,(3,1));
end;
"""
        d = Dataset()
        d.read(StringIO(f), format="NEXUS")
        t = d.trees_blocks[0][0]
        s = d.trees_blocks[1][0]
        self.assertEqual(t.taxa_block, s.taxa_block)
        encode_splits(s)
        encode_splits(t)
        self.assertEqual(treedists.symmetric_difference(t, s), 0)
Code example #2
def store_chars(char_block, format, dest=None):
    "Writes the CharacterBlock `char_block` to `dest` using writer."
    deprecation("'dataio.store_chars()' is deprecated: use 'write()' method of a Dataset object instead", logger_obj=_LOG)
    dataset = Dataset()
    dataset.add_char_block(char_block=char_block)
    return store_dataset(dataset=dataset,
                         format=format,
                         dest=dest)
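The deprecation message points to the `write()` method of a `Dataset` object as the replacement. A minimal sketch of that path, assuming a `write(dest, format=...)` signature (the exact signature and the output file name are assumptions, not confirmed API):

dataset = Dataset()
dataset.add_char_block(char_block=char_block)  # `char_block` as in store_chars() above
# assumed signature; the deprecation message only names the method
dataset.write(open("chars.nex", "w"), format="NEXUS")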
Code example #3
 def testPostOrderAgain(self):
     n = '((((t49:0.0299,t41:0.0299):0.42017,(((t39:0.231767,(t7:0.213631,(t36:0.099739,((t12:0.015337,t26:0.015337):0.036523,t8:0.05186):0.047879):0.113892):0.018137):0.062275,((t43:0.240402,(((t28:0.157872,((t1:0.132296,t46:0.132296):0.016383,(t14:0.12343,t3:0.12343):0.025249):0.009193):0.01427,((t24:0.151816,(t25:0.106007,(t32:0.035143,(t47:0.025662,t6:0.025662):0.00948):0.070864):0.045809):0.013127,t40:0.164943):0.007199):0.010417,(t34:0.159553,(((t21:0.061001,t16:0.061001):0.001043,t44:0.062044):0.011229,(t18:0.001174,t5:0.001174):0.072098):0.086281):0.023006):0.057844):0.027253,(t27:0.066806,t50:0.066806):0.20085):0.026387):0.117517,(t23:0.259881,(t4:0.072245,((t45,t30):0.029839,t31:0.029839):0.042406):0.187635):0.151679):0.038511):0.069304,(t19:0.25004,t17:0.25004):0.269335):0.215312,((((t37:0.036909,t13:0.036909):0.244651,t2:0.28156):0.262824,((t15:0.11244,(t33:0.076665,t10:0.076665):0.035775):0.124232,t9:0.236671):0.307712):0.057112,(((t22:0.065248,t42:0.065248):0.037237,t38:0.102485):0.182409,(t48:0.187654,((t20:0.03615,(t29:0.01626,t11:0.01626):0.01989):0.039894,t35:0.076044):0.11161):0.09724):0.316601):0.133191);'
     d = Dataset()
     tree = d.trees_from_string(string=n, format="NEWICK")[0]
     tree.debug_check_tree()
     used = set()
     for node in tree.postorder_node_iter():
         ch = node.child_nodes()
         for c in ch:
             # postorder guarantees every child is visited before its parent
             self.assertTrue(c in used)
         used.add(node)
Code example #4
def trees_from_newick(nl, taxa_block=None):
    """Takes an iterable list of newick strings (or files with just newick strings
    in them.
    """
    reader = get_reader(NEWICK)
    if taxa_block is not None:
        dataset = Dataset(taxa_blocks=[taxa_block])
    else:
        dataset = Dataset()
    for t in nl:
        f = t
        if isinstance(t, str):
            f = StringIO(t)
        reader.read_dataset(file_obj=f, dataset=dataset)
    return dataset
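A usage sketch for `trees_from_newick`; the newick strings are made up for illustration, and the parsed trees land in the returned dataset's `trees_blocks` (cf. code example #1):

dataset = trees_from_newick(["(a,(b,c));", "((a,b),c);"])
for trees_block in dataset.trees_blocks:
    for tree in trees_block:
        print tree  # str(tree) yields a newick string (cf. code example #11)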
Code example #5
def store_trees(trees_collection, format, dest=None):
    "Writes the list of trees `trees` to `dest` using writer."
    deprecation("'dataio.store_trees()' is deprecated: use 'write()' method of a Dataset object instead", logger_obj=_LOG)
    if isinstance(trees_collection, TreesBlock):
        trees_block = trees_collection
    else:
        trees_block = TreesBlock()
        for tree in trees_collection:
            trees_block.append(tree)
        trees_block.normalize_taxa()
    dataset = Dataset()
    dataset.add_trees_block(trees_block=trees_block)
    return store_dataset(dataset=dataset,
                         format=format,
                         dest=dest)
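A usage sketch (the tree list and destination are hypothetical). When a plain list is passed, `store_trees` wraps it in a `TreesBlock` and normalizes the taxa before writing:

# `some_trees` is a hypothetical list of Tree objects
store_trees(some_trees, format="NEXUS", dest=open("out.tre", "w"))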
Code example #6
 def testKTBEvolveLinearBounce(self):
     rng = DebuggingRandom()
     newick = "((t5:1611.75,t6:1611.75):3922.93,((t4:1043.81,(t2:754.11,t1:754.11):2896.9):6584.0,t3:1702.21):3832.47);"
     d = Dataset()
     tree = d.trees_from_string(string=newick, format="NEWICK")[0]
     root = tree.seed_node
     root.mutation_rate = 1e-5
     root.mean_edge_rate = root.mutation_rate
     simulate_continuous(root, rng, roeotroe=0.01, 
                         min_rate=1.0e-6, max_rate=1.0e-3, model='KTB',
                         time_attr='edge_length', val_attr='mutation_rate',
                         mean_val_attr='mean_edge_rate', 
                         constrain_rate_mode="linear_bounce")
     # convert each edge length from time units to expected substitutions,
     # using the mean rate realized along that edge
     for i in tree.preorder_node_iter():
         if i.edge_length is not None:
             i.edge_length *= i.mean_edge_rate
Code example #7
    def _write_garli_input(self, full_dataset):
        assert len(full_dataset.taxa_blocks) == 1
        taxa = full_dataset.taxa_blocks[0]

        assert (len(full_dataset.char_blocks) == 1)
        characters = full_dataset.char_blocks[0]
        assert (len(characters) == len(taxa))

        # cull down to the first `curr_n_taxa` taxa: shallow-copy the character
        # block, point it at the culled taxa, and rebuild its matrix with only
        # those taxa's rows
        culled_taxa = TaxaBlock(taxa[:self.curr_n_taxa])
        culled_chars = copy.copy(characters)
        culled_chars.taxa_block = culled_taxa
        culled_chars.matrix = copy.copy(characters.matrix)
        culled_chars.matrix.clear()
        #culled_chars = characters.__class__(taxa_block=culled_taxa)
        #culled_chars.column_types = characters.column_types
        #culled_chars.markup_as_sequences = characters.markup_as_sequences
        template_matrix = characters.matrix
        for taxon in culled_taxa:
            culled_chars.matrix[taxon] = template_matrix[taxon]

        culled = Dataset()
        culled.taxa_blocks.append(culled_taxa)
        culled.char_blocks.append(culled_chars)

        o = open(self.datafname, "w")
        nexusWriter = nexus.NexusWriter()
        nexusWriter.write_dataset(culled, o)
        o.close()
        return culled
Code example #8
 def __init__(self, sources=[], 
                    core_iterator=None, 
                    taxa_block=None, 
                    dataset=None, 
                    format=None, 
                    from_index=0, 
                    progress_func=None,
                    **kwargs):
     """An iterable collection of trees from multiple sources
         `sources` is as list of tree sources each can be either a file path (a str) 
             or a file-like object
         Either
             `core_iterator` or `dataset` must be specified as the source of the iterator
             If the `dataset` is used, the the `format` must be specified
          
         `from_index` can be used to skip the first `from_index` trees from _EACH_ file
             this is useful if you want to discard a certain number of trees from the beginning of each run
             as burnin.
            
         If `progress_func` is specified verbose messages will be sent to it for every tree processed.
     """
     if dataset is None:
         self.dataset = Dataset()
     else:
         self.dataset = dataset
     self.taxa_block = taxa_block
     self.format = format
     if core_iterator is None:
         if self.format is None:
             raise ValueError("Either 'core_iterator' or 'format' flags must be used")
         self.using_data_it = True
     else:
         self.using_data_it = False
         self._core_iterator = core_iterator
     self.progress_func = progress_func
     self.sources = sources
     self.total_trees_read = 0
     self.total_trees_ignored = 0
     self.total_num_sources_read = 0
     self.from_index = from_index
     self.iterator_kwargs = kwargs
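Assuming this `__init__` belongs to the `MultiFileTreeIterator` class shown in full in code example #13, a usage sketch (file names hypothetical): stream trees from two NEXUS files, skipping the first 100 trees of each as burn-in:

# With `format` given and no `core_iterator`, trees are read through the
# iterator's own Dataset.
source_it = MultiFileTreeIterator(sources=["run1.t", "run2.t"],
                                  format="NEXUS",
                                  from_index=100)
for tree in source_it:
    pass  # per-tree work goes here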
Code example #9
 def testTaxaWithUnderscoreRead(self):
     rd = dendropy.tests.data_source_path("rana.nex")
     rt = dendropy.tests.data_source_path("rana.tre")
     d = Dataset()
     d.read(open(rd, "rU"), format="NEXUS")
     self.assertEqual(len(d.taxa_blocks[0]), 64)
     d.read_trees(open(rt, "rU"), format="NEXUS")
     self.assertEqual(len(d.taxa_blocks[0]), 64)
Code example #10
    def test3Feb2009MajRuleBug(self):
        if not is_test_enabled(TestLevel.NORMAL,
                               _LOG,
                               module_name=__name__,
                               message="skipping sumtree argument tests"):
            return
        fn1 = dendropy.tests.data_source_path("maj-rule-bug1.tre")
        fn2 = dendropy.tests.data_source_path("maj-rule-bug2.tre")
        d = Dataset()

        tb1 = d.read_trees(open(fn1, "rU"),
                           format="NEXUS",
                           encode_splits=True,
                           rooted=RootingInterpretation.UNROOTED)
        tb2 = d.read_trees(open(fn2, "rU"),
                           format="NEXUS",
                           encode_splits=True,
                           rooted=RootingInterpretation.UNROOTED)
        taxa1 = d.taxa_blocks[0]
        self.assertEqual(taxa1, tb2[0].taxa_block)

        firstSD = SplitDistribution(taxa_block=taxa1)
        secondSD = SplitDistribution(taxa_block=taxa1)

        for o, t in itertools.izip(tb1, tb2):
            #encode_splits(o)
            #encode_splits(t)
            firstSD.count_splits_on_tree(o)
            secondSD.count_splits_on_tree(t)

        ts = TreeSummarizer()
        n_times = 1  # keep set to 1 except for benchmarking tree_from_splits
        for i in xrange(n_times):
            firstMR = ts.tree_from_splits(firstSD, min_freq=0.5)
            secondMR = ts.tree_from_splits(secondSD, min_freq=0.5)
        self.assertEqual(0, symmetric_difference(firstMR, secondMR))
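The consensus machinery this test exercises, reduced to a sketch (file name hypothetical; names as imported in the test above):

# Count splits across a sample of trees and build a 50% majority-rule tree.
d = Dataset()
sample_trees = d.read_trees(open("samples.tre", "rU"), format="NEXUS",
                            encode_splits=True,
                            rooted=RootingInterpretation.UNROOTED)
sd = SplitDistribution(taxa_block=d.taxa_blocks[0])
for t in sample_trees:
    sd.count_splits_on_tree(t)
majrule = TreeSummarizer().tree_from_splits(sd, min_freq=0.5)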
Code example #11
    def testStoreEdgeLens(self):
        n = '((((((t4:0.06759,t32:0.06759):0.198252,t39:0.265842):0.135924,((t9:0.244134,(((t23:0.014408,t49:0.014408):0.040121,t16:0.05453):0.156614,t2:0.211144):0.03299):0.013224,t34:0.257358):0.144408):0.112116,(((t45:0.110713,(t47:0.019022,t8:0.019022):0.09169):0.163042,((t1:0.168924,(((((t15:0.012356,t30:0.012356):0.000247,t18:0.012603):0.037913,t22:0.050516):0.076193,(t44:0.071301,t46:0.071301):0.055407):0.037072,(((((t50:0.00000,t29:0.00000):0.01744,t35:0.017441):0.066422,t10:0.083863):0.047231,((t6:0.012709,(t26:0.00805,t40:0.00805):0.004659):0.043941,t11:0.05665):0.074443):0.008316,t31:0.13941):0.024371):0.005144):0.025169,t33:0.194093):0.079662):0.183823,(t48:0.343218,((t41:0.032738,t27:0.032738):0.229887,((t5:0.030394,t43:0.030394):0.204863,((((t14:0.028794,t24:0.028794):0.002007,t3:0.030801):0.181488,t38:0.212289):0.017427,(t17:0.01869,t25:0.01869):0.211027):0.005541):0.027368):0.080592):0.11436):0.056304):0.078832,(t21:0.107754,t13:0.107754):0.48496):0.114273,((t36:0.352531,(((t12:0.042324,t7:0.042324):0.155519,t19:0.197843):0.016322,(t37:0.12614,t28:0.12614):0.088025):0.138366):0.147704,(t42:0.088633,t20:0.088633):0.411601):0.206753)'
        d = Dataset()
        tree = d.trees_from_string(string=n, format="NEWICK")[0]
        for nd in tree.postorder_node_iter():
            if nd is not tree.seed_node:
                if nd.edge.length is None:
                    _LOG.info("%s has edge length of None" %
                              trees.format_node(nd))
                    self.assertTrue(nd.edge.length is not None)

        ts = str(tree)
        d = Dataset()
        tree = d.trees_from_string(string=ts, format="NEWICK")[0]
        for nd in tree.postorder_node_iter():
            if nd is not tree.seed_node:
                if nd.edge.length is None:
                    _LOG.warn("%s has edge length of None" %
                              trees.format_node(nd))
                    self.assertTrue(nd.edge.length is not None)
Code example #12
last_split = 1 << (n_tax - 1)
all_taxa_bitmask = (1 << n_tax) - 1

add_trees_fn = sys.argv[2]
assert len(sys.argv) > 3
nbhd_tree_groups = []
for nbhd_tree_fn in sys.argv[3:]:
    nbhd_tree_f = open(nbhd_tree_fn, 'rU')
    nbhd_tree_groups.extend(read_add_tree_groups(nbhd_tree_f))

add_trees_f = open(add_trees_fn, 'rU')
all_tree_groups = read_add_tree_groups(add_trees_f)

taxa_block = TaxaBlock([str(i + 1) for i in range(n_tax)])
taxa_blocks = [taxa_block]
dataset = Dataset(taxa_blocks=taxa_blocks)

# setting this > 1.0 means that more trees are retained for the neighborhood search stage
score_diff_multiplier = 1.0

commands = []
# first we collect all of the ParsedTree objects into all_parsed_trees and we
#   call encode_splits so that we can look up split info on each tree
all_tree_groups.extend(nbhd_tree_groups)
all_parsed_trees = []
for g in all_tree_groups:
    for el in g:
        newick_string = el.tree_string
        newick_stream = StringIO(newick_string)
        t = dataset.read_trees(newick_stream, format="newick")[0]
        encode_splits(t)
Code example #13
class MultiFileTreeIterator(object):
    def __init__(self, sources=[], 
                       core_iterator=None, 
                       taxa_block=None, 
                       dataset=None, 
                       format=None, 
                       from_index=0, 
                       progress_func=None,
                       **kwargs):
        """An iterable collection of trees from multiple sources
            `sources` is as list of tree sources each can be either a file path (a str) 
                or a file-like object
            Either
                `core_iterator` or `dataset` must be specified as the source of the iterator
                If the `dataset` is used, the the `format` must be specified
             
            `from_index` can be used to skip the first `from_index` trees from _EACH_ file
                this is useful if you want to discard a certain number of trees from the beginning of each run
                as burnin.
               
            If `progress_func` is specified verbose messages will be sent to it for every tree processed.
        """
        if dataset is None:
            self.dataset = Dataset()
        else:
            self.dataset = dataset
        self.taxa_block = taxa_block
        self.format = format
        if core_iterator is None:
            if self.format is None:
                raise ValueError("Either 'core_iterator' or 'format' flags must be used")
            self.using_data_it = True
        else:
            self.using_data_it = False
            self._core_iterator = core_iterator
        self.progress_func = progress_func
        self.sources = sources
        self.total_trees_read = 0
        self.total_trees_ignored = 0
        self.total_num_sources_read = 0
        self.from_index = from_index
        self.iterator_kwargs = kwargs

    def __iter__(self):
        si = self.from_index
        tb = self.taxa_block
        progress_func = self.progress_func
        self.curr_trees_read = 0
        self.curr_trees_ignored = 0
        self.curr_num_sources_read = 0
        for source_ind, tree_source in enumerate(self.sources):
            if isinstance(tree_source, str):
                fo = open(tree_source, "rU")
            else:
                fo = tree_source
            if progress_func:
                current_file_note = "Tree file %d of %d: " % (source_ind + 1, len(self.sources))
            self.curr_num_sources_read += 1
            self.total_num_sources_read += 1
            for n, tree in enumerate(self._raw_iter(fo, tb)):
                if (not si) or (n >= si):
                    if tb is None:
                        tb = tree.taxa_block
                    self.total_trees_read += 1
                    self.curr_trees_read += 1
                    if progress_func:
                        progress_func("%sProcessing tree %d" % (current_file_note, (n+1)))
                    yield tree
                else:
                    self.total_trees_ignored += 1
                    self.curr_trees_ignored += 1
                    if progress_func:
                        progress_func("%sSkipping tree %d (# to skip=%d)" % (current_file_note, (n+1), si))

    def _raw_iter(self, fo, tb):
        if self.using_data_it:
            for tree in self.dataset.iterate_over_trees(fo, taxa_block=tb, format=self.format, **(self.iterator_kwargs)):
                yield tree
        else:
            for tree in self._core_iterator(fo, taxa_block=tb, file_format=self.format, **(self.iterator_kwargs)):
                yield tree
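A sketch of the progress callback and the per-pass counters maintained by `__iter__` (file names hypothetical; assumes `import sys`):

it = MultiFileTreeIterator(sources=["a.tre", "b.tre"],
                           format="NEXUS",
                           from_index=10,
                           progress_func=lambda msg: sys.stderr.write(msg + "\n"))
for tree in it:
    pass
# after a full pass, the counters summarize what was read and skipped
print it.curr_trees_read, it.curr_trees_ignored, it.curr_num_sources_read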
Code example #14
def main_cli():

    description =  '%s %s ' % (_program_name, _program_version)
    usage = "%prog [options] <TREES FILE> [<TREES FILE> [<TREES FILE> [...]]]"

    parser = OptionParser(usage=usage, add_help_option=True, version = _program_version, description=description)
    parser.add_option('-r','--reference',
                  dest='reference_tree_filepath',
                  default=None,
                  help="path to file containing the reference (true) tree")
    parser.add_option('-v', '--verbose',
                      action='store_false',
                      dest='quiet',
                      default=True,
                      help="Verbose mode")

    (opts, args) = parser.parse_args()

    ###################################################
    # Support file idiot checking

    sampled_filepaths = []
    missing = False
    for fpath in args:
        fpath = os.path.expanduser(os.path.expandvars(fpath))
        if not os.path.exists(fpath):
            sys.exit('Sampled trees file not found: "%s"' % fpath)
        sampled_filepaths.append(fpath)
    if not sampled_filepaths:
        sys.exit("Expecting arguments indicating files that contain sampled trees")

    sampled_file_objs = [open(f, "rU") for f in sampled_filepaths]

    ###################################################
    # Lots of other idiot-checking ...

    # target tree
    if opts.reference_tree_filepath is None:
        sys.exit("A reference tree must be specified (use -h to see all options)")
    reference_tree_filepath = os.path.expanduser(os.path.expandvars(opts.reference_tree_filepath))
    if not os.path.exists(reference_tree_filepath):
        sys.exit('Reference tree file not found: "%s"\n' % reference_tree_filepath)

    d = Dataset()
    ref_trees = d.read_trees(open(reference_tree_filepath, 'rU'), format="NEXUS")

    if len(ref_trees) != 1:
        sys.exit("Expecting one reference tree")
    ref_tree = ref_trees[0]
    splits.encode_splits(ref_tree)
    assert(len(d.taxa_blocks) == 1)
    taxa = d.taxa_blocks[0]


    ###################################################
    # Main work begins here: Count the splits

    start_time = datetime.datetime.now()

    comments = []
    tsum = treesum.TreeSummarizer()
    tsum.burnin = 0
    if opts.quiet:
        tsum.verbose = False
        tsum.write_message = None
    else:
        tsum.verbose = True
        tsum.write_message = sys.stderr.write

    _LOG.debug("### COUNTING SPLITS ###\n")
    split_distribution = splits.SplitDistribution(taxa_block=taxa)
    tree_source = MultiFileTreeIterator(sources=sampled_filepaths, core_iterator=nexus.iterate_over_trees)
    tsum.count_splits_on_trees(tree_source, split_distribution)

    report = []
    report.append("%d trees read from %d files." % (tsum.total_trees_read, len(sampled_filepaths)))
    report.append("%d trees ignored in total." % (tree_source.total_trees_ignored))
    report.append("%d trees considered in total for split support assessment." % (tsum.total_trees_counted))
    report.append("%d unique taxa across all trees." % len(split_distribution.taxa_block))
    num_splits, num_unique_splits, num_nt_splits, num_nt_unique_splits = split_distribution.splits_considered()
    report.append("%d unique splits out of %d total splits counted." % (num_unique_splits, num_splits))
    report.append("%d unique non-trivial splits out of %d total non-trivial splits counted." % (num_nt_unique_splits, num_nt_splits))

    _LOG.debug("\n".join(report))


    con_tree = treegen.star_tree(taxa)
    taxa_mask = taxa.all_taxa_bitmask()
    splits.encode_splits(con_tree)
    leaves = con_tree.leaf_nodes()

    to_leaf_dict = {}
    for leaf in leaves:
        to_leaf_dict[leaf.edge.clade_mask] = leaf
    unrooted = True
    n_read = float(tsum.total_trees_read)
    sp_list = []
    for split, count in split_distribution.split_counts.iteritems():
        freq = count/n_read
        if not splits.is_trivial_split(split, taxa_mask):
            m = split & taxa_mask
            if (m != taxa_mask) and ((m-1) & m): # if not root (i.e., all "1's") and not singleton (i.e., one "1")
                if unrooted:
                    c = (~m) & taxa_mask
                    if (c-1) & c: # not singleton (i.e., one "0")
                        if 1 & m:
                            k = c
                        else:
                            k = m
                        sp_list.append((freq, k, m))
                else:
                    sp_list.append((freq, m, m))
    sp_list.sort(reverse=True)

    root = con_tree.seed_node
    root_edge = root.edge

    curr_freq = 1.1
    curr_all_splits_list = []
    curr_compat_splits_list = []
    all_splits_by_freq = []
    compat_splits_by_freq = []

    # Now when we add splits in order, we will do a greedy, extended majority-rule consensus tree
    for freq, split_to_add, split_in_dict in sp_list:
        if abs(curr_freq-freq) > 0.000001:
            # dropping down to the next lowest freq
            curr_l = [freq, []]
            curr_all_splits_list = curr_l[1]
            all_splits_by_freq.append(curr_l)
            curr_l = [freq, []]
            curr_compat_splits_list = curr_l[1]
            compat_splits_by_freq.append(curr_l)
            curr_freq = freq

        curr_all_splits_list.append(split_to_add)

        if (split_to_add & root_edge.clade_mask) != split_to_add:
            continue
        lb = splits.lowest_bit_only(split_to_add)
        one_leaf = to_leaf_dict[lb]
        parent_node = one_leaf
        while (split_to_add & parent_node.edge.clade_mask) != split_to_add:
            parent_node = parent_node.parent_node
        if parent_node is None or parent_node.edge.clade_mask == split_to_add:
            continue # split is not in tree, or already in tree.

        new_node = trees.Node()
        new_node_children = []
        new_edge = new_node.edge
        new_edge.clade_mask = 0
        for child in parent_node.child_nodes():
            # might need to modify the following if rooted splits
            # are used
            cecm = child.edge.clade_mask
            if (cecm & split_to_add ):
                assert cecm != split_to_add
                new_edge.clade_mask |= cecm
                new_node_children.append(child)
        # Check to see if we have accumulated all of the bits that we
        #   needed, but none that we don't need.
        if new_edge.clade_mask == split_to_add:
            for child in new_node_children:
                parent_node.remove_child(child)
                new_node.add_child(child)
            parent_node.add_child(new_node)
            con_tree.split_edges[split_to_add] = new_edge
            curr_compat_splits_list.append(split_to_add)
    ref_set = set()
    for s in ref_tree.split_edges.iterkeys():
        m = s & taxa_mask
        if 1 & m:
            k = (~m) & taxa_mask
        else:
            k = m
        if not splits.is_trivial_split(k, taxa_mask):
            ref_set.add(k)

    all_set = set()
    compat_set = set()

    _LOG.debug("%d edges in the reference tree" % (len(ref_set)))

    print "freq\tcompatFP\tcompatFN\tcompatSD\tallFP\tallFN\tallSD"
    for all_el, compat_el in itertools.izip(all_splits_by_freq, compat_splits_by_freq):
        freq = all_el[0]
        all_sp = all_el[1]
        all_set.update(all_sp)
        all_fn = len(ref_set - all_set)
        all_fp = len(all_set - ref_set)
        compat_sp = compat_el[1]
        compat_set.update(compat_sp)
        compat_fn = len(ref_set - compat_set)
        compat_fp = len(compat_set - ref_set)

        print "%f\t%d\t%d\t%d\t%d\t%d\t%d" % (freq, compat_fp, compat_fn, compat_fp + compat_fn, all_fp, all_fn, all_fp + all_fn )
Code example #15
        VERBOSE = True
        _LOG.setLevel(logging.DEBUG)
    data_file = opts.data_filepath
    intree_file = opts.intree_filepath
    if data_file is None:
        sys.exit("Data file must be specified")
    if intree_file is None:
        sys.exit("Input tree file must be specified")
    for f in [data_file, intree_file, conf_file]:
        if not os.path.exists(f):
            sys.exit("%s does not exist" % f)

    garli = GarliConf()
    garli.read_garli_conf(open(conf_file, "rU"))

    full_dataset = Dataset()
    full_dataset.read(open(data_file, "rU"), format="NEXUS")
    assert(len(full_dataset.taxa_blocks) == 1)
    taxa = full_dataset.taxa_blocks[0]
    full_taxa_mask = taxa.all_taxa_bitmask()
    for n, taxon in enumerate(taxa):
        TAXON_TO_TRANSLATE[taxon] = str(n + 1)
    _LOG.debug("%s = full_taxa_mask" % bin(full_taxa_mask))


    garli.datafname = os.path.join("data.nex")

    raw_trees = full_dataset.read_trees(open(intree_file, "rU"), format="NEXUS")
    assert(raw_trees)
    current_taxon_mask = None
Code example #16
 def testPHGamma(self):
     newick = "((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):0.028969):0.065840,t3:0.170221):0.383247);"
     d = Dataset()
     tree = d.trees_from_string(string=newick, format="NEWICK")[0]
     assert_approx_equal(pybus_harvey_gamma(tree), 0.546276)
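For reference, the quantity asserted here is the Pybus–Harvey $\gamma$ statistic (Pybus & Harvey 2000). For an ultrametric tree with $n$ tips and internode intervals $g_k$ (the time during which the tree has exactly $k$ lineages),

$$\gamma = \frac{\frac{1}{n-2}\sum_{i=2}^{n-1} T_i \;-\; \frac{T_n}{2}}{T_n\,\sqrt{\frac{1}{12(n-2)}}}, \qquad T_k = \sum_{j=2}^{k} j\,g_j ,$$

which is approximately standard normal under a constant-rate pure-birth process, so values near zero are consistent with constant diversification.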
Code example #17
def main_cli():

    description = '%s %s ' % (_program_name, _program_version)
    usage = "%prog [options] <TREES FILE> [<TREES FILE> [<TREES FILE> [...]]]"

    parser = OptionParser(usage=usage,
                          add_help_option=True,
                          version=_program_version,
                          description=description)
    parser.add_option('-r',
                      '--reference',
                      dest='reference_tree_filepath',
                      default=None,
                      help="path to file containing the reference (true) tree")
    parser.add_option('-v',
                      '--verbose',
                      action='store_false',
                      dest='quiet',
                      default=True,
                      help="Verbose mode")

    (opts, args) = parser.parse_args()

    ###################################################
    # Support file idiot checking

    sampled_filepaths = []
    missing = False
    for fpath in args:
        fpath = os.path.expanduser(os.path.expandvars(fpath))
        if not os.path.exists(fpath):
            sys.exit('Sampled trees file not found: "%s"' % fpath)
        sampled_filepaths.append(fpath)
    if not sampled_filepaths:
        sys.exit(
            "Expecting arguments indicating files that contain sampled trees")

    sampled_file_objs = [open(f, "rU") for f in sampled_filepaths]

    ###################################################
    # Lots of other idiot-checking ...

    # target tree
    if opts.reference_tree_filepath is None:
        sys.exit(
            "A reference tree must be specified (use -h to see all options)")
    reference_tree_filepath = os.path.expanduser(
        os.path.expandvars(opts.reference_tree_filepath))
    if not os.path.exists(reference_tree_filepath):
        sys.exit('Reference tree file not found: "%s"\n' %
                 reference_tree_filepath)

    d = Dataset()
    ref_trees = d.read_trees(open(reference_tree_filepath, 'rU'),
                             format="NEXUS")

    if len(ref_trees) != 1:
        sys.exit("Expecting one reference tree")
    ref_tree = ref_trees[0]
    splits.encode_splits(ref_tree)
    assert (len(d.taxa_blocks) == 1)
    taxa = d.taxa_blocks[0]

    ###################################################
    # Main work begins here: Count the splits

    start_time = datetime.datetime.now()

    comments = []
    tsum = treesum.TreeSummarizer()
    tsum.burnin = 0
    if opts.quiet:
        tsum.verbose = False
        tsum.write_message = None
    else:
        tsum.verbose = True
        tsum.write_message = sys.stderr.write

    _LOG.debug("### COUNTING SPLITS ###\n")
    split_distribution = splits.SplitDistribution(taxa_block=taxa)
    tree_source = MultiFileTreeIterator(sources=sampled_filepaths,
                                        core_iterator=nexus.iterate_over_trees)
    tsum.count_splits_on_trees(tree_source, split_distribution)

    report = []
    report.append("%d trees read from %d files." %
                  (tsum.total_trees_read, len(sampled_filepaths)))
    report.append("%d trees ignored in total." %
                  (tree_source.total_trees_ignored))
    report.append(
        "%d trees considered in total for split support assessment." %
        (tsum.total_trees_counted))
    report.append("%d unique taxa across all trees." %
                  len(split_distribution.taxa_block))
    num_splits, num_unique_splits, num_nt_splits, num_nt_unique_splits = split_distribution.splits_considered()
    report.append("%d unique splits out of %d total splits counted." %
                  (num_unique_splits, num_splits))
    report.append(
        "%d unique non-trivial splits out of %d total non-trivial splits counted."
        % (num_nt_unique_splits, num_nt_splits))

    _LOG.debug("\n".join(report))

    con_tree = treegen.star_tree(taxa)
    taxa_mask = taxa.all_taxa_bitmask()
    splits.encode_splits(con_tree)
    leaves = con_tree.leaf_nodes()

    to_leaf_dict = {}
    for leaf in leaves:
        to_leaf_dict[leaf.edge.clade_mask] = leaf
    unrooted = True
    n_read = float(tsum.total_trees_read)
    sp_list = []
    for split, count in split_distribution.split_counts.iteritems():
        freq = count / n_read
        if not splits.is_trivial_split(split, taxa_mask):
            m = split & taxa_mask
            if (m != taxa_mask) and (
                (m - 1) & m
            ):  # if not root (i.e., all "1's") and not singleton (i.e., one "1")
                if unrooted:
                    c = (~m) & taxa_mask
                    if (c - 1) & c:  # not singleton (i.e., one "0")
                        if 1 & m:
                            k = c
                        else:
                            k = m
                        sp_list.append((freq, k, m))
                else:
                    sp_list.append((freq, m, m))
    sp_list.sort(reverse=True)

    root = con_tree.seed_node
    root_edge = root.edge

    curr_freq = 1.1
    curr_all_splits_list = []
    curr_compat_splits_list = []
    all_splits_by_freq = []
    compat_splits_by_freq = []

    # Now when we add splits in order, we will do a greedy, extended majority-rule consensus tree
    for freq, split_to_add, split_in_dict in sp_list:
        if abs(curr_freq - freq) > 0.000001:
            # dropping down to the next lowest freq
            curr_l = [freq, []]
            curr_all_splits_list = curr_l[1]
            all_splits_by_freq.append(curr_l)
            curr_l = [freq, []]
            curr_compat_splits_list = curr_l[1]
            compat_splits_by_freq.append(curr_l)
            curr_freq = freq

        curr_all_splits_list.append(split_to_add)

        if (split_to_add & root_edge.clade_mask) != split_to_add:
            continue
        lb = splits.lowest_bit_only(split_to_add)
        one_leaf = to_leaf_dict[lb]
        parent_node = one_leaf
        while (split_to_add & parent_node.edge.clade_mask) != split_to_add:
            parent_node = parent_node.parent_node
        if parent_node is None or parent_node.edge.clade_mask == split_to_add:
            continue  # split is not in tree, or already in tree.

        new_node = trees.Node()
        new_node_children = []
        new_edge = new_node.edge
        new_edge.clade_mask = 0
        for child in parent_node.child_nodes():
            # might need to modify the following if rooted splits
            # are used
            cecm = child.edge.clade_mask
            if (cecm & split_to_add):
                assert cecm != split_to_add
                new_edge.clade_mask |= cecm
                new_node_children.append(child)
        # Check to see if we have accumulated all of the bits that we
        #   needed, but none that we don't need.
        if new_edge.clade_mask == split_to_add:
            for child in new_node_children:
                parent_node.remove_child(child)
                new_node.add_child(child)
            parent_node.add_child(new_node)
            con_tree.split_edges[split_to_add] = new_edge
            curr_compat_splits_list.append(split_to_add)
    ref_set = set()
    for s in ref_tree.split_edges.iterkeys():
        m = s & taxa_mask
        if 1 & m:
            k = (~m) & taxa_mask
        else:
            k = m
        if not splits.is_trivial_split(k, taxa_mask):
            ref_set.add(k)

    all_set = set()
    compat_set = set()

    _LOG.debug("%d edges in the reference tree" % (len(ref_set)))

    print "freq\tcompatFP\tcompatFN\tcompatSD\tallFP\tallFN\tallSD"
    for all_el, compat_el in itertools.izip(all_splits_by_freq,
                                            compat_splits_by_freq):
        freq = all_el[0]
        all_sp = all_el[1]
        all_set.update(all_sp)
        all_fn = len(ref_set - all_set)
        all_fp = len(all_set - ref_set)
        compat_sp = compat_el[1]
        compat_set.update(compat_sp)
        compat_fn = len(ref_set - compat_set)
        compat_fp = len(compat_set - ref_set)

        print "%f\t%d\t%d\t%d\t%d\t%d\t%d" % (freq, compat_fp, compat_fn,
                                              compat_fp + compat_fn, all_fp,
                                              all_fn, all_fp + all_fn)
Code example #18
add_trees_fn = sys.argv[2]
if len(sys.argv) > 3:
    nbhd_tree_groups = []
    for nbhd_tree_fn in sys.argv[3:]:
        nbhd_tree_f = open(nbhd_tree_fn, 'rU')
        nbhd_tree_groups.extend(read_add_tree_groups(nbhd_tree_f))
else:
    nbhd_tree_groups = None
    
add_trees_f = open(add_trees_fn, 'rU')
all_tree_groups = read_add_tree_groups(add_trees_f)

taxa_block = TaxaBlock([str(i+1) for i in range(n_tax)])
taxa_blocks = [taxa_block]
dataset = Dataset(taxa_blocks=taxa_blocks)

# setting this > 1.0 means that more trees are retained for the neighborhood search stage
score_diff_multiplier = 1.0
    
commands = []
if nbhd_tree_groups is None:
    _LOG.debug("Invocation of igarli_neighborhood.py with only one tree file -- need to set up initial neighborhood searches") 
    for g in all_tree_groups:
        for el in g:
            newick_string = el.tree_string
            newick_stream = StringIO(newick_string)
            t = dataset.read_trees(newick_stream, format="newick")[0]
            encode_splits(t)
            el.tree = t
        _LOG.debug("len(g) = %d" % len(g))
Code example #19
    conf_file = opts.conf
    if conf_file is None:
        sys.exit("Expecting a conf file template for GARLI")
    data_file = opts.data_filepath
    intree_file = opts.intree_filepath
    if data_file is None:
        sys.exit("Data file must be specified")
    if intree_file is None:
        sys.exit("Input tree file must be specified")
    for f in [data_file, intree_file, conf_file]:
        if not os.path.exists(f):
            sys.exit("%s does not exist" % f)

    conf = read_garli_conf(open(conf_file, "rU"))
    write_garli_conf(sys.stdout, conf)
    d = Dataset()
    d.read(open(data_file, "rU"), format="NEXUS")
    taxa = d.taxa_blocks[0]
    full_taxa_mask = taxa.all_taxa_bitmask()
    for n, taxon in enumerate(taxa):
        TAXON_TO_TRANSLATE[taxon] = str(n + 1)
    _LOG.debug("%s = full_taxa_mask" % bin(full_taxa_mask))
    assert (len(d.taxa_blocks) == 1)
    characters = d.char_blocks[0]
    assert (len(d.char_blocks) == 1)
    assert (len(characters) == len(taxa))
    inp_trees = d.read_trees(open(intree_file, "rU"), format="NEXUS")
    assert (inp_trees)
    current_taxon_mask = None
    for tree in inp_trees:
        assert tree.taxa_block is taxa
Code example #20
        VERBOSE = True
        _LOG.setLevel(logging.DEBUG)
    data_file = opts.data_filepath
    intree_file = opts.intree_filepath
    if data_file is None:
        sys.exit("Data file must be specified")
    if intree_file is None:
        sys.exit("Input tree file must be specified")
    for f in [data_file, intree_file, conf_file]:
        if not os.path.exists(f):
            sys.exit("%s does not exist" % f)

    garli = GarliConf()
    garli.read_garli_conf(open(conf_file, "rU"))

    full_dataset = Dataset()
    full_dataset.read(open(data_file, "rU"), format="NEXUS")
    assert (len(full_dataset.taxa_blocks) == 1)
    taxa = full_dataset.taxa_blocks[0]
    full_taxa_mask = taxa.all_taxa_bitmask()
    for n, taxon in enumerate(taxa):
        TAXON_TO_TRANSLATE[taxon] = str(n + 1)
    _LOG.debug("%s = full_taxa_mask" % bin(full_taxa_mask))

    garli.datafname = os.path.join("data.nex")

    raw_trees = full_dataset.read_trees(open(intree_file, "rU"),
                                        format="NEXUS")
    assert (raw_trees)
    current_taxon_mask = None