def life_ott(fo):
    '''
    Makes a bogus ott for the taxonomy. Basically this gives
    all the taxa the parent 'life' for later use in the TAG algorithm.

    `fo` is a stream of Newick trees whose tip labels have the form
    ``name@ottoid``. The first tree list read from the stream is scanned,
    the distinct tip labels are collected, and one root row for 'life'
    followed by one 'species' row per label is written to standard output.
    Rows are written in sorted label order so the output is reproducible.

    Raises ValueError if the stream cannot be parsed, or if a label does
    not contain exactly one '@'.
    '''
    output = sys.stdout
    dataset = dendropy.DataSet()
    try:
        dataset.read(stream=fo, schema="Newick")
    except DataParseError as dfe:
        raise ValueError(str(dfe))
    # Identifier used both for the root row and as every taxon's parent.
    parent_id = '805080'
    tree_labels = set()
    for tree in dataset.tree_lists[0]:
        # Sanity check carried over from the original: the tree must be
        # encodable and its root edge must end up with a split bitmask.
        encode_splits(tree)
        assert tree.seed_node.edge.split_bitmask is not None
        tree_labels.update(node.taxon.label for node in tree.leaf_iter())
    output.write(parent_id + '\t|\t\t|\tlife\t|\tno rank\t|\tncbi:1,gbif:0\t|\t\t|\t\t|\t\t|\t\n')
    for label in sorted(tree_labels):
        name, ottoid = label.split('@')
        output.write(ottoid+'\t|\t'+parent_id+'\t|\t'+name+'\t|\tspecies\t|\tncbi:1\t|\t\t|\t\n')
Beispiel #2
0
def inplace_strict_consensus_merge(trees_to_merge, rooted=False, gordons_supertree=False):
    """Returns a tree that is the strict consensus merger of the input trees.

    The first tree in `trees_to_merge` is modified in place to become the
    merger and is returned. When a merge is actually performed (two or more
    trees, unrooted), `trees_to_merge` is truncated to its first element and
    every tree is derooted and has its splits encoded.

    With fewer than two trees the single tree is returned untouched.

    Raises IndexError if `trees_to_merge` is empty, and NotImplementedError
    if `rooted` is true and there are at least two trees (the input list is
    left unmodified in that case).
    """
    tree_list = list(trees_to_merge)
    if not tree_list:
        raise IndexError("no trees to merge")
    nTrees = len(tree_list)
    _LOG.debug('%d Trees to merge:\n%s\n' % (nTrees, '\n'.join([str(i) for i in tree_list])))
    if nTrees < 2:
        return tree_list[0]
    if rooted:
        raise NotImplementedError("Rooted SCM is not implemented")
    # Only truncate the caller's list once we know the merge will proceed.
    del trees_to_merge[1:]

    tree_iter = iter(tree_list)
    # next() works on Python 2.6+ and 3; the .next() method is Python 2 only.
    to_modify = next(tree_iter)
    to_modify.deroot()
    encode_splits(to_modify)
    if _IS_DEBUG_LOGGING:
        assert to_modify._debug_tree_is_valid(check_splits=False)
    for to_consume in tree_iter:
        to_consume.deroot()
        encode_splits(to_consume)
        if _IS_DEBUG_LOGGING:
            assert to_consume._debug_tree_is_valid(check_splits=True)
        add_to_scm(to_modify, to_consume, rooted, gordons_supertree=gordons_supertree)
        if _IS_DEBUG_LOGGING:
            assert to_modify._debug_tree_is_valid(check_splits=False)

    return to_modify
Beispiel #3
0
    def tree_from_splits(self,
            split_distribution,
            min_freq=0.5,
            include_edge_lengths=True):
        """Returns a consensus tree from splits in `split_distribution`.

        Splits whose frequency is greater than `min_freq` are offered to the
        tree builder in order of decreasing frequency. If `min_freq` is None
        every split is offered; if `min_freq` is (almost) 1.0, splits whose
        frequency is (almost) 1.0 are accepted as well.

        If `include_edge_lengths` is True, every edge of the consensus tree
        whose split appears in `split_distribution.split_edge_lengths` gets
        the mean of the recorded lengths (or None if none were recorded).
        The sample variance returned by the helper is not stored.

        Raises Exception if both `self.support_as_edge_lengths` and
        `include_edge_lengths` are true, since the edge length cannot hold
        both values.
        """
        taxon_set = split_distribution.taxon_set
        if self.weighted_splits:
            split_freqs = split_distribution.weighted_split_frequencies
        else:
            split_freqs = split_distribution.split_frequencies
        is_rooted = split_distribution.is_rooted
        if self.support_as_edge_lengths and include_edge_lengths:
            raise Exception("Cannot map support as edge lengths if edge lengths are to be set on consensus tree")

        _almost_one = lambda x: abs(x - 1.0) <= 0.0000001
        # .items() is available on both Python 2 and 3 (iteritems is 2-only).
        to_try_to_add = [
            (freq, s) for s, freq in split_freqs.items()
            if (min_freq is None) or (freq > min_freq)
                or (_almost_one(min_freq) and _almost_one(freq))]
        # Highest-frequency splits first.
        to_try_to_add.sort(reverse=True)
        splits_for_tree = [i[1] for i in to_try_to_add]

        con_tree = treesplit.tree_from_splits(splits=splits_for_tree,
                taxon_set=taxon_set,
                is_rooted=is_rooted)
        treesplit.encode_splits(con_tree)

        for node in con_tree.postorder_node_iter():
            split = node.edge.split_bitmask
            if split in split_freqs:
                self.map_split_support_to_node(node=node, split_support=split_freqs[split])
            if include_edge_lengths and split in split_distribution.split_edge_lengths:
                edges = split_distribution.split_edge_lengths[split]
                if len(edges) > 0:
                    mean, _ = mean_and_sample_variance(edges)
                    elen = mean
                else:
                    elen = None
                node.edge.length = elen

        return con_tree
Beispiel #4
0
def false_positives_and_negatives(reference_tree, test_tree):
    """
    False pos = splits in test_tree NOT in reference_tree
    False neg = splits in reference_tree NOT in test_tree

    Both trees must share the very same TaxonSet object. A tree that has no
    `split_edges` attribute has its splits encoded first.

    Returns the tuple (false_positives, false_negatives).
    Raises TypeError if the trees reference different TaxonSet objects.
    """
    if reference_tree.taxon_set is not test_tree.taxon_set:
        raise TypeError("Trees have different TaxonSet objects: %s vs. %s" \
                % (hex(id(reference_tree.taxon_set)), hex(id(test_tree.taxon_set))))
    if not hasattr(reference_tree, "split_edges"):
        treesplit.encode_splits(reference_tree)
    if not hasattr(test_tree, "split_edges"):
        treesplit.encode_splits(test_tree)

    ref_splits = reference_tree.split_edges
    test_splits = test_tree.split_edges
    false_negatives = sum(1 for split in ref_splits if split not in test_splits)
    false_positives = sum(1 for split in test_splits if split not in ref_splits)
    return false_positives, false_negatives
def false_positives_and_negatives(reference_tree, test_tree):
    """
    False pos = splits in test_tree NOT in reference_tree
    False neg = splits in reference_tree NOT in test_tree

    The two trees must reference the identical TaxonSet object; splits are
    encoded on any tree that does not yet carry a `split_edges` attribute.

    Returns (false_positives, false_negatives).
    Raises TypeError when the TaxonSet objects differ.
    """
    if reference_tree.taxon_set is not test_tree.taxon_set:
        raise TypeError("Trees have different TaxonSet objects: %s vs. %s" \
                % (hex(id(reference_tree.taxon_set)), hex(id(test_tree.taxon_set))))
    for tree in (reference_tree, test_tree):
        if not hasattr(tree, "split_edges"):
            treesplit.encode_splits(tree)

    false_negatives = 0
    for split in reference_tree.split_edges:
        if split not in test_tree.split_edges:
            false_negatives += 1

    false_positives = 0
    for split in test_tree.split_edges:
        if split not in reference_tree.split_edges:
            false_positives += 1

    return false_positives, false_negatives
 def runTest(self):
     # Four trees over the same six taxa: tree 1 lengthens t5's branch,
     # tree 2 swaps t2 and t4, tree 3 shortens one internal edge.
     newick_source = StringIO(
         """((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);
                     ((t5:2.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);
                     ((t5:0.161175,t6:0.161175):0.392293,((t2:0.075411,(t4:0.104381,t1:0.075411):1):0.065840,t3:0.170221):0.383247);
                     ((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):0.028969):0.065840,t3:0.170221):0.383247);
                     """)
     tree_list = dendropy.TreeList(stream=newick_source, schema="newick")
     for tree in tree_list:
         encode_splits(tree)
     # (index of first tree, index of second tree, expected distance)
     expected_distances = (
         (0, 1, 2.0),
         (0, 2, math.sqrt(2.0)),
         (0, 3, 0.97103099999999998),
         (1, 2, math.sqrt(6.0)),
         (1, 3, 2.2232636377544162),
         (2, 3, 1.000419513484718),
     )
     for first, second, expected in expected_distances:
         observed = treecalc.euclidean_distance(tree_list[first], tree_list[second])
         self.assertAlmostEqual(observed, expected)
Beispiel #7
0
 def run(self):
     """
     Worker loop: takes tree source paths from `self.work_queue` until the
     queue is empty or `self.kill_received` is set.

     For every source, trees at an index of at least `self.tree_offset`
     have their splits encoded and counted into `self.split_distribution`
     (and into `self.topology_counter` when `self.calc_tree_probs` is set);
     earlier trees are skipped. Progress is reported through `send_info`
     according to `self.log_frequency`.

     On normal completion the split distribution and the topology hash map
     are put on their result queues; on a kill request only a warning is
     sent.
     """
     while not self.kill_received:
         try:
             source = self.work_queue.get_nowait()
         except Queue.Empty:
             break
         self.send_info("Received task: '%s'." % source, wrap=False)
         # The context manager guarantees the source file is closed, even
         # when the inner loop is left early by a kill request or an error.
         with open(source, "rU") as fsrc:
             for tidx, tree in enumerate(tree_source_iter(fsrc,
                     schema=self.schema,
                     taxon_set=self.taxon_set,
                     as_rooted=self.is_rooted,
                     store_tree_weights=self.weighted_trees)):
                 should_log = (self.log_frequency == 1) or (tidx > 0 and self.log_frequency > 0 and tidx % self.log_frequency == 0)
                 if tidx >= self.tree_offset:
                     if should_log:
                         self.send_info("(processing) '%s': tree at offset %d" % (source, tidx), wrap=False)
                     treesplit.encode_splits(tree)
                     self.split_distribution.count_splits_on_tree(tree)
                     if self.calc_tree_probs:
                         self.topology_counter.count(tree,
                                 tree_splits_encoded=True)
                 elif should_log:
                     self.send_info("(processing) '%s': tree at offset %d (skipping)" % (source, tidx), wrap=False)
                 if self.kill_received:
                     break
         if self.kill_received:
             break
         self.send_info("Completed task: '%s'." % (source), wrap=False)
     if self.kill_received:
         self.send_warning("Terminating in response to kill request.")
     else:
         self.result_split_dist_queue.put(self.split_distribution)
         self.result_topology_hash_map_queue.put(self.topology_counter.topology_hash_map)
 def runTest(self):
     # Two trees on one shared taxon set; the expected symmetric
     # difference between them is 2.
     reference = dendropy.Tree(
         stream=StringIO("((t5,t6),((t4,(t2,t1)),t3));"),
         schema="newick")
     shared_taxa = reference.taxon_set
     encode_splits(reference)
     other = dendropy.Tree(
         stream=StringIO("((t1,t2),((t4,(t5,t6)),t3));"),
         schema="newick",
         taxon_set=shared_taxa)
     encode_splits(other)
     observed = treecalc.symmetric_difference(other, reference)
     self.assertEqual(observed, 2)
Beispiel #9
0
    def calc(self, tree=None, create_midpoints=None):
        """
        Calculates the distances. Note that the path length (in number of
        steps) between taxa that span the root will be off by one if
        the tree is unrooted.

        If `tree` is given it replaces `self.tree`; otherwise the tree
        already stored on the instance is used. `create_midpoints` is
        accepted but not used by this method.

        Fills `self._pat_dists` and `self._path_steps` (both keyed as
        [taxon1][taxon2]) and `self._mrca`, and records the largest distance
        in `self.max_dist` / `self.max_dist_taxa` / `self.max_dist_nodes`.
        `self._mrca` must already exist; it is not created here.
        """
        if tree is not None:
            self.tree = tree
        assert self.tree is not None
        if not hasattr(self.tree, "split_edges"):
            treesplit.encode_splits(self.tree)
        self.taxon_set = self.tree.taxon_set
        self._pat_dists = {}
        self._path_steps = {}
        # Reset the running maximum once, even when the taxon set is empty.
        self.max_dist = None
        self.max_dist_taxa = None
        self.max_dist_nodes = None
        for t1 in self.taxon_set:
            self._pat_dists[t1] = {}
            self._path_steps[t1] = {}
            self._mrca[t1] = {}

        for node in self.tree.postorder_node_iter():
            children = node.child_nodes()
            if len(children) == 0:
                # A leaf is at distance 0 and 0 steps from itself.
                node.desc_paths = {node: (0, 0)}
                continue
            node.desc_paths = {}
            for cidx1, c1 in enumerate(children):
                later_siblings = children[cidx1 + 1:]
                for desc1, (desc1_plen,
                            desc1_psteps) in c1.desc_paths.items():
                    d1_len = desc1_plen + c1.edge.length
                    d1_steps = desc1_psteps + 1
                    node.desc_paths[desc1] = (d1_len, d1_steps)
                    # Pair desc1 with every leaf below a later sibling so
                    # each pair is visited exactly once, at their MRCA.
                    for c2 in later_siblings:
                        for desc2, (desc2_plen,
                                    desc2_psteps) in c2.desc_paths.items():
                            self._mrca[desc1.taxon][
                                desc2.taxon] = c1.parent_node
                            pat_dist = d1_len + desc2_plen + c2.edge.length
                            self._pat_dists[desc1.taxon][
                                desc2.taxon] = pat_dist
                            path_steps = d1_steps + desc2_psteps + 1
                            self._path_steps[desc1.taxon][
                                desc2.taxon] = path_steps
                            # Explicit None test: ordering None against a
                            # number is an error on Python 3.
                            if self.max_dist is None or pat_dist > self.max_dist:
                                self.max_dist = pat_dist
                                midpoint = float(pat_dist) / 2
                                if midpoint - d1_len <= 0:
                                    self.max_dist_nodes = (desc1, desc2)
                                    self.max_dist_taxa = (desc1.taxon,
                                                          desc2.taxon)
                                else:
                                    self.max_dist_nodes = (desc2, desc1)
                                    self.max_dist_taxa = (desc2.taxon,
                                                          desc1.taxon)
                # The child's table has been folded into this node's.
                del (c1.desc_paths)
 def kernelOfTest(self, trees):
     """
     Merges all trees but the last and fails the test unless the result
     has zero symmetric difference from the last tree.
     """
     expected = trees[-1]
     to_merge = trees[:-1]
     _LOG.debug('input = %s' % str(to_merge))
     merged = inplace_strict_consensus_merge(to_merge)
     for tree in (merged, expected):
         encode_splits(tree)
     distance = symmetric_difference(expected, merged)
     if distance != 0:
         self.fail("\n%s\n!=\n%s" % (str(merged), str(expected)))
Beispiel #11
0
 def kernelOfTest(self, trees):
     """
     Treats the final tree as the expected strict consensus merger of the
     trees before it, and fails if the computed merger differs from it.
     """
     expected = trees[-1]
     sources = trees[:-1]
     _LOG.debug('input = %s' % str(sources))
     result = inplace_strict_consensus_merge(sources)
     encode_splits(result)
     encode_splits(expected)
     if symmetric_difference(expected, result) == 0:
         return
     self.fail("\n%s\n!=\n%s" % (str(result), str(expected)))
Beispiel #12
0
 def testReferenceTree(self):
     # Round trip: a tree rebuilt from a reference tree's own splits must
     # have zero symmetric difference from that reference tree.
     ref_tree_list = datagen.reference_tree_list()
     for ref_tree in ref_tree_list:
         treesplit.encode_splits(ref_tree)
         splits = ref_tree.split_edges.keys()
         t_tree = treesplit.tree_from_splits(splits=splits,
                 taxon_set=ref_tree_list.taxon_set,
                 is_rooted=ref_tree.is_rooted)
         self.assertEqual(ref_tree.symmetric_difference(t_tree), 0)
Beispiel #13
0
 def runTest(self):
     # Both trees are parsed against the same taxon set; the test expects
     # a symmetric difference of 2 between them.
     ref_source = StringIO("((t5,t6),((t4,(t2,t1)),t3));")
     ref = dendropy.Tree(stream=ref_source, schema="newick")
     taxa = ref.taxon_set
     encode_splits(ref)
     other_source = StringIO("((t1,t2),((t4,(t5,t6)),t3));")
     o_tree = dendropy.Tree(stream=other_source, schema="newick", taxon_set=taxa)
     encode_splits(o_tree)
     difference = treecalc.symmetric_difference(o_tree, ref)
     self.assertEqual(difference, 2)
 def testReferenceTree(self):
     # Each reference tree is rebuilt from its encoded splits; the rebuilt
     # tree must be topologically identical (symmetric difference of 0).
     ref_tree_list = datagen.reference_tree_list()
     for ref_tree in ref_tree_list:
         treesplit.encode_splits(ref_tree)
         t_tree = treesplit.tree_from_splits(
                 splits=ref_tree.split_edges.keys(),
                 taxon_set=ref_tree_list.taxon_set,
                 is_rooted=ref_tree.is_rooted)
         self.assertEqual(ref_tree.symmetric_difference(t_tree), 0)
 def assertDistinctButEqualTree(self, tree1, tree2, **kwargs):
     """
     Re-indexes both trees onto one fresh TaxonSet, checks that the new
     taxon set is a distinct-but-equal replacement for tree1's original
     one, and asserts a Robinson-Foulds distance of (almost) 0 between
     the trees. Extra keyword arguments go to `assertDistinctButEqual`.
     """
     original_taxa = tree1.taxon_set
     fresh_taxa = dendropy.TaxonSet()
     tree1.reindex_taxa(fresh_taxa, clear=True)
     tree2.reindex_taxa(fresh_taxa)
     self.assertIs(tree1.taxon_set, tree2.taxon_set)
     self.assertIsNot(tree1.taxon_set, original_taxa)
     self.assertDistinctButEqual(tree1.taxon_set, original_taxa, **kwargs)
     for tree in (tree1, tree2):
         treesplit.encode_splits(tree)
     self.assertAlmostEqual(treecalc.robinson_foulds_distance(tree1, tree2), 0)
Beispiel #16
0
 def assertDistinctButEqualTree(self, tree1, tree2, **kwargs):
     """
     Asserts that two trees are equal in topology and edge lengths after
     both are moved onto a newly created, shared TaxonSet. `kwargs` are
     forwarded to `assertDistinctButEqual` for the taxon-set comparison.
     """
     taxa_before = tree1.taxon_set
     shared = dendropy.TaxonSet()
     tree1.reindex_taxa(shared, clear=True)
     tree2.reindex_taxa(shared)

     # Taxon-set checks: shared between the trees, replaced but equivalent.
     self.assertIs(tree1.taxon_set, tree2.taxon_set)
     self.assertIsNot(tree1.taxon_set, taxa_before)
     self.assertDistinctButEqual(tree1.taxon_set, taxa_before, **kwargs)

     # Tree check: Robinson-Foulds distance on freshly encoded splits.
     treesplit.encode_splits(tree1)
     treesplit.encode_splits(tree2)
     distance = treecalc.robinson_foulds_distance(tree1, tree2)
     self.assertAlmostEqual(distance, 0)
 def testPatDistFunc(self):
     # Checks the patristic distance for a fixed list of taxon-label pairs.
     encode_splits(self.tree)
     expected = (
         ("a", "b", 2),
         ("a", "c", 4),
         ("b", "c", 4),
         ("a", "d", 6),
         ("f", "d", 4),
         ("c", "d", 6),
     )
     for label1, label2, exp_distance in expected:
         tax1 = self.tree.taxon_set.get_taxon(label=label1)
         tax2 = self.tree.taxon_set.get_taxon(label=label2)
         pd = treecalc.patristic_distance(self.tree, tax1, tax2)
         self.assertEqual(pd, exp_distance)
Beispiel #18
0
 def count(self, tree, tree_splits_encoded=False):
     """
     Logs/registers a tree.

     Splits are encoded on `tree` first unless `tree_splits_encoded` is
     true. The tree's topology hash gets its tally in
     `self.topology_hash_map` increased by one, and
     `self.total_trees_counted` is incremented.
     """
     if not tree_splits_encoded:
         treesplit.encode_splits(tree)
     topology = self.hash_topology(tree)
     seen_so_far = self.topology_hash_map.get(topology, 0)
     self.topology_hash_map[topology] = seen_so_far + 1
     self.total_trees_counted += 1
Beispiel #19
0
        def countSplits(self, tc, is_rooted):
            """
            Compares DendroPy split counts with those reported by PAUP*.

            `tc` is a pair (tree file name, taxa file name); both names are
            resolved with `pathmap.tree_source_path`. The split distribution
            obtained through `paup.get_split_distribution` is compared with
            one built here by counting splits on every tree in the file.
            Non-trivial splits must match in membership and count.
            """
            _LOG.info(tc[0] + "; " + tc[1])
            tree_filepaths = [pathmap.tree_source_path(tc[0])]
            taxa_filepath = pathmap.tree_source_path(tc[1])
            paup_sd = paup.get_split_distribution(tree_filepaths,
                                                  taxa_filepath,
                                                  is_rooted=is_rooted,
                                                  burnin=0)
            taxon_set = paup_sd.taxon_set
            dp_sd = treesplit.SplitDistribution(taxon_set=taxon_set)
            dp_sd.ignore_edge_lengths = True
            dp_sd.ignore_node_ages = True
            dp_sd.is_rooted = is_rooted

            _LOG.debug("Taxon set: %s" % [t.label for t in taxon_set])
            # Computed once; the taxon set is locked straight afterwards.
            taxa_mask = taxon_set.all_taxa_bitmask()
            taxon_set.lock()
            for tree_filepath in tree_filepaths:
                # Context manager so the tree file is closed after reading.
                with open(tree_filepath, "rU") as tree_stream:
                    for tree in dataio.tree_source_iter(stream=tree_stream,
                                                        schema='nexus',
                                                        taxon_set=taxon_set,
                                                        as_rooted=is_rooted):
                        self.assertIs(tree.taxon_set, dp_sd.taxon_set)
                        self.assertIs(tree.taxon_set, taxon_set)
                        treesplit.encode_splits(tree)
                        dp_sd.count_splits_on_tree(tree)

            self.assertEqual(dp_sd.total_trees_counted,
                             paup_sd.total_trees_counted)

            # SplitsDistribution counts trivial splits, whereas PAUP*
            # contree does not, so the numbers of splits cannot be compared
            # directly; only non-trivial splits are checked below.
            for split in dp_sd.splits:
                if not treesplit.is_trivial_split(split, taxa_mask):
                    self.assertIn(split, paup_sd.splits)
                    self.assertEqual(dp_sd.split_counts[split],
                                     paup_sd.split_counts[split])
                    paup_sd.splits.remove(split)

            # if any splits remain, they were not
            # in dp_sd or were trivial
            remaining_splits = list(paup_sd.splits)
            for split in remaining_splits:
                if treesplit.is_trivial_split(split, taxa_mask):
                    paup_sd.splits.remove(split)
            self.assertEqual(len(paup_sd.splits), 0)
Beispiel #20
0
 def count(self,
         tree,
         tree_splits_encoded=False):
     """
     Logs/registers a tree.

     Unless `tree_splits_encoded` is true the tree's splits are encoded
     first. The counter stored in `self.topology_hash_map` under the tree's
     topology hash is created at 1 or increased by 1, and
     `self.total_trees_counted` goes up by one.
     """
     if not tree_splits_encoded:
         treesplit.encode_splits(tree)
     topology = self.hash_topology(tree)
     if topology in self.topology_hash_map:
         self.topology_hash_map[topology] += 1
     else:
         self.topology_hash_map[topology] = 1
     self.total_trees_counted += 1
    def testUltrametricTrees(self):
        # For each rooted NEXUS tree file, rebuild the tree from its own
        # splits and require a symmetric difference of 0 from the original.
        for tree_file in ("pythonidae.beast.summary.tre",
                          "primates.beast.mcct.medianh.tre"):
            source_path = pathmap.tree_source_path(tree_file)
            ref_tree = dendropy.Tree.get_from_path(
                source_path, "nexus", as_rooted=True)
            treesplit.encode_splits(ref_tree)
            rebuilt = treesplit.tree_from_splits(
                splits=ref_tree.split_edges.keys(),
                taxon_set=ref_tree.taxon_set,
                is_rooted=ref_tree.is_rooted)
            treesplit.encode_splits(rebuilt)
            difference = ref_tree.symmetric_difference(rebuilt)
            self.assertEqual(difference, 0)
Beispiel #22
0
    def testPatDistFunc(self):
        # Expected patristic distances, keyed by the pair of taxon labels;
        # checked in the order listed.
        encode_splits(self.tree)
        cases = [
            (("a", "b"), 2),
            (("a", "c"), 4),
            (("b", "c"), 4),
            (("a", "d"), 6),
            (("f", "d"), 4),
            (("c", "d"), 6),
        ]
        for (first_label, second_label), exp_distance in cases:
            tax1 = self.tree.taxon_set.get_taxon(label=first_label)
            tax2 = self.tree.taxon_set.get_taxon(label=second_label)
            observed = treecalc.patristic_distance(self.tree, tax1, tax2)
            self.assertEqual(observed, exp_distance)
Beispiel #23
0
def long_branch_symmdiff(trees_to_compare,
                         edge_len_threshold,
                         copy_trees=False,
                         rooted=False):
    """Returns matrix of the symmetric_differences between trees after all
    internal edges with lengths < `edge_len_threshold` have been collapsed.

    If `copy_trees` is True then the trees will be copied first (with
    `copy.copy`); if False, the input trees will have their short edges
    collapsed on exit.

    During the comparison every tree's `is_rooted` flag is forced to
    `bool(rooted)`; when `copy_trees` is False the original flags are put
    back before returning.

    Returns a square list of lists indexed [i][j]. With fewer than two
    trees a flat list of zeros (one per tree) is returned instead.
    """
    if copy_trees:
        tree_list = [copy.copy(i) for i in trees_to_compare]
    else:
        tree_list = list(trees_to_compare)

    n_trees = len(tree_list)
    _LOG.debug('%d Trees to compare:\n%s\n' %
               (n_trees, '\n'.join([str(i) for i in tree_list])))
    if n_trees < 2:
        return [0] * n_trees

    original_rooting = []
    for tree in tree_list:
        encode_splits(tree)
        # Collect first, collapse afterwards, so the tree is not modified
        # while its edges are being iterated.
        to_collapse = []
        for edge in tree.preorder_edge_iter(filter_fn=Edge.is_internal):
            elen = edge.length
            if elen is not None and elen < edge_len_threshold:
                to_collapse.append(edge)
        for edge in to_collapse:
            collapse_edge(edge)
        original_rooting.append(tree.is_rooted)
        tree.is_rooted = bool(rooted)
        # Re-encode: collapsing edges changed the set of splits.
        encode_splits(tree)

    # range/zip work on both Python 2 and 3 (xrange/izip are 2-only).
    sd_mat = [[0] * n_trees for _ in range(n_trees)]
    for i, tree_one in enumerate(tree_list[:-1]):
        for j in range(i + 1, n_trees):
            sd = symmetric_difference(tree_one, tree_list[j])
            sd_mat[i][j] = sd
            sd_mat[j][i] = sd

    if not copy_trees:
        for was_rooted, tree in zip(original_rooting, tree_list):
            tree.is_rooted = was_rooted
    return sd_mat
Beispiel #24
0
    def calc(self, tree=None, create_midpoints=None):
        """
        Calculates the distances. Note that the path length (in number of
        steps) between taxa that span the root will be off by one if
        the tree is unrooted.

        A `tree` argument, when given, replaces `self.tree`; otherwise the
        stored tree is used. `create_midpoints` is accepted but not used by
        this method.

        Results are stored on the instance: `_pat_dists` and `_path_steps`
        (keyed [taxon1][taxon2]), `_mrca`, and the maximum distance with the
        taxa and nodes that realize it (`max_dist`, `max_dist_taxa`,
        `max_dist_nodes`). `self._mrca` is expected to exist beforehand.
        """
        if tree is not None:
            self.tree = tree
        assert self.tree is not None
        if not hasattr(self.tree, "split_edges"):
            treesplit.encode_splits(self.tree)
        self.taxon_set = self.tree.taxon_set
        self._pat_dists = {}
        self._path_steps = {}
        # Start from "no maximum yet", regardless of how many taxa exist.
        self.max_dist = None
        self.max_dist_taxa = None
        self.max_dist_nodes = None
        for t1 in self.taxon_set:
            self._pat_dists[t1] = {}
            self._path_steps[t1] = {}
            self._mrca[t1] = {}

        for node in self.tree.postorder_node_iter():
            children = node.child_nodes()
            if len(children) == 0:
                node.desc_paths = {node: (0, 0)}
            else:
                node.desc_paths = {}
                for cidx1, c1 in enumerate(children):
                    for desc1, (desc1_plen, desc1_psteps) in c1.desc_paths.items():
                        node.desc_paths[desc1] = (desc1_plen + c1.edge.length, desc1_psteps + 1)
                        # Only later siblings: each leaf pair is handled once.
                        for c2 in children[cidx1 + 1 :]:
                            for desc2, (desc2_plen, desc2_psteps) in c2.desc_paths.items():
                                self._mrca[desc1.taxon][desc2.taxon] = c1.parent_node
                                pat_dist = node.desc_paths[desc1][0] + desc2_plen + c2.edge.length
                                self._pat_dists[desc1.taxon][desc2.taxon] = pat_dist
                                path_steps = node.desc_paths[desc1][1] + desc2_psteps + 1
                                self._path_steps[desc1.taxon][desc2.taxon] = path_steps
                                # Guard the first comparison: None cannot be
                                # ordered against a number on Python 3.
                                if self.max_dist is None or pat_dist > self.max_dist:
                                    self.max_dist = pat_dist
                                    midpoint = float(pat_dist) / 2
                                    if midpoint - node.desc_paths[desc1][0] <= 0:
                                        self.max_dist_nodes = (desc1, desc2)
                                        self.max_dist_taxa = (desc1.taxon, desc2.taxon)
                                    else:
                                        self.max_dist_nodes = (desc2, desc1)
                                        self.max_dist_taxa = (desc2.taxon, desc1.taxon)
                    # The child's table is no longer needed once merged.
                    del (c1.desc_paths)
Beispiel #25
0
 def runTest(self):
     # Random rotations must change the tree's string form but never its
     # root node or its topology relative to an unrotated copy.
     n = '(Basichlsac,(Lamprothma,Mougeotisp),(((Haplomitr2,Petalaphy),((Angiopteri,(((Azollacaro,((Dennstasam,(Oleandrapi,Polypodapp)),Dicksonant)),Vittarifle),Botrychbit)),(Isoetesmel,((((Agathismac,Agathisova),Pseudotsu),(((Libocedrus,Juniperusc),Callitris),Athrotaxi)),((Liriodchi,Nelumbo),Sagittari))))),Thuidium));'
     # The same Newick string twice yields two identical trees.
     trees = dendropy.TreeList(stream=StringIO(n + n), schema="newick")
     ref = trees[0]
     changing = trees[1]
     rng = RepeatedRandom()
     treesplit.encode_splits(ref)
     treesplit.encode_splits(changing)
     orig_root = changing.seed_node
     # range() runs on both Python 2 and 3; xrange() is Python 2 only.
     for _ in range(50):
         treemanip.randomly_rotate(changing, rng=rng)
         self.assertNotEqual(str(changing), n)
         self.assertEqual(orig_root, changing.seed_node)
         changing.debug_check_tree(logger_obj=_LOG, splits=True)
         if treecalc.symmetric_difference(ref, changing) != 0:
             self.fail("\n%s\n!=\n%s" % (str(ref), str(changing)))
 def runTest(self):
      # Pairwise Euclidean distances between four trees that share a taxon
      # set but differ in a branch length or in the placement of t2/t4.
      source = StringIO("""((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);
                     ((t5:2.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);
                     ((t5:0.161175,t6:0.161175):0.392293,((t2:0.075411,(t4:0.104381,t1:0.075411):1):0.065840,t3:0.170221):0.383247);
                     ((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):0.028969):0.065840,t3:0.170221):0.383247);
                     """)
      tree_list = dendropy.TreeList(stream=source, schema="newick")
      for tree in tree_list:
          encode_splits(tree)
      # Each row: (first tree index, second tree index, expected distance).
      checks = [
          (0, 1, 2.0),
          (0, 2, math.sqrt(2.0)),
          (0, 3, 0.97103099999999998),
          (1, 2, math.sqrt(6.0)),
          (1, 3, 2.2232636377544162),
          (2, 3, 1.000419513484718),
      ]
      for left, right, expected in checks:
          self.assertAlmostEqual(treecalc.euclidean_distance(tree_list[left], tree_list[right]), expected)
 def runTest(self):
     # Rotating a tree 50 times at random must alter how it prints while
     # keeping the same seed node and the same splits as the reference.
     n = '(Basichlsac,(Lamprothma,Mougeotisp),(((Haplomitr2,Petalaphy),((Angiopteri,(((Azollacaro,((Dennstasam,(Oleandrapi,Polypodapp)),Dicksonant)),Vittarifle),Botrychbit)),(Isoetesmel,((((Agathismac,Agathisova),Pseudotsu),(((Libocedrus,Juniperusc),Callitris),Athrotaxi)),((Liriodchi,Nelumbo),Sagittari))))),Thuidium));'
     # Parsing the string doubled gives a reference and a working copy.
     trees = dendropy.TreeList(stream=StringIO(n+n), schema="newick")
     ref = trees[0]
     changing = trees[1]
     rng = RepeatedRandom()
     treesplit.encode_splits(ref)
     treesplit.encode_splits(changing)
     orig_root = changing.seed_node
     # range() instead of the Python-2-only xrange(); the index is unused.
     for _ in range(50):
         treemanip.randomly_rotate(changing, rng=rng)
         self.assertNotEqual(str(changing), n)
         self.assertEqual(orig_root, changing.seed_node)
         changing.debug_check_tree(logger_obj=_LOG, splits=True)
         if treecalc.symmetric_difference(ref, changing) != 0:
             self.fail("\n%s\n!=\n%s" % (str(ref), str(changing)))
Beispiel #28
0
 def map_split_support_to_tree(self, tree, split_distribution):
     """
     Maps splits support to the given tree.

     The tree is re-indexed onto the distribution's taxon set and its
     splits are encoded. For every split of the tree, the head node of the
     split's edge receives the split's frequency (weighted frequencies when
     `self.weighted_splits` is set), or 0.0 if the distribution has no
     entry for that split. Returns the same tree.
     """
     split_freqs = (split_distribution.weighted_split_frequencies
                    if self.weighted_splits
                    else split_distribution.split_frequencies)
     tree.reindex_taxa(taxon_set=split_distribution.taxon_set)
     assert tree.taxon_set is split_distribution.taxon_set
     treesplit.encode_splits(tree)
     for split in tree.split_edges:
         split_support = split_freqs[split] if split in split_freqs else 0.0
         target_node = tree.split_edges[split].head_node
         self.map_split_support_to_node(target_node, split_support)
     return tree
Beispiel #29
0
    def calc(self, tree=None, create_midpoints=None):
        """
        Calculates the distances.

        If `tree` is given it replaces `self.tree`; otherwise the tree
        already stored on the instance is used, so `calc()` may be called
        again without arguments. `create_midpoints` is accepted but not
        used by this method.

        Fills `self._pat_dists` (keyed [taxon1][taxon2]) and `self._mrca`,
        and records the largest distance in `self.max_dist`,
        `self.max_dist_taxa` and `self.max_dist_nodes`. `self._mrca` must
        already exist; it is not created here.
        """
        if tree is not None:
            self.tree = tree
        # Validate and use the stored tree, not the (possibly omitted)
        # argument.
        assert self.tree is not None
        if not hasattr(self.tree, "split_edges"):
            treesplit.encode_splits(self.tree)
        self.taxon_set = self.tree.taxon_set
        self._pat_dists = {}
        # Reset the running maximum once, even for an empty taxon set.
        self.max_dist = None
        self.max_dist_taxa = None
        self.max_dist_nodes = None
        for t1 in self.taxon_set:
            self._pat_dists[t1] = {}
            self._mrca[t1] = {}

        for node in self.tree.postorder_node_iter():
            children = node.child_nodes()
            if len(children) == 0:
                # A leaf is at distance 0 from itself.
                node.desc_paths = {node: 0}
            else:
                node.desc_paths = {}
                for cidx1, c1 in enumerate(children):
                    for desc1, desc1_plen in c1.desc_paths.items():
                        node.desc_paths[desc1] = desc1_plen + c1.edge.length
                        # Later siblings only, so each pair is seen once.
                        for c2 in children[cidx1 + 1:]:
                            for desc2, desc2_plen in c2.desc_paths.items():
                                pat_dist = node.desc_paths[
                                    desc1] + desc2_plen + c2.edge.length
                                self._pat_dists[desc1.taxon][
                                    desc2.taxon] = pat_dist
                                self._mrca[desc1.taxon][
                                    desc2.taxon] = c1.parent_node
                                # None cannot be ordered against a number
                                # on Python 3, hence the explicit test.
                                if self.max_dist is None or pat_dist > self.max_dist:
                                    self.max_dist = pat_dist
                                    midpoint = float(pat_dist) / 2
                                    if midpoint - node.desc_paths[desc1] <= 0:
                                        self.max_dist_nodes = (desc1, desc2)
                                        self.max_dist_taxa = (desc1.taxon,
                                                              desc2.taxon)
                                    else:
                                        self.max_dist_nodes = (desc2, desc1)
                                        self.max_dist_taxa = (desc2.taxon,
                                                              desc1.taxon)
                    del (c1.desc_paths)
Beispiel #30
0
 def run(self):
     """
     Worker loop: pull tree-file paths off ``self.work_queue`` until the
     queue is empty or a kill request is received.

     Every tree read from a source at an offset of at least
     ``self.tree_offset`` has its splits encoded and counted into
     ``self.split_distribution`` (and, if ``self.calc_tree_probs`` is set,
     into ``self.topology_counter``); earlier trees are skipped. Progress
     messages are emitted according to ``self.log_frequency``.

     On normal completion the accumulated split distribution and topology
     hash map are put on the two result queues; if a kill request was
     received, only a warning is sent.
     """
     while not self.kill_received:
         try:
             source = self.work_queue.get_nowait()
         except Queue.Empty:
             break
         self.send_info("Received task: '%s'." % source, wrap=False)
         fsrc = open(source, "rU")
         # Close the source file whatever happens (normal end, kill request
         # or a parse error) so handles are not leaked across tasks.
         try:
             for tidx, tree in enumerate(
                     tree_source_iter(
                         fsrc,
                         schema=self.schema,
                         taxon_set=self.taxon_set,
                         as_rooted=self.is_rooted,
                         store_tree_weights=self.weighted_trees)):
                 should_log = (self.log_frequency == 1) or (
                     tidx > 0 and self.log_frequency > 0
                     and tidx % self.log_frequency == 0)
                 if tidx >= self.tree_offset:
                     if should_log:
                         self.send_info(
                             "(processing) '%s': tree at offset %d" %
                             (source, tidx),
                             wrap=False)
                     treesplit.encode_splits(tree)
                     self.split_distribution.count_splits_on_tree(tree)
                     if self.calc_tree_probs:
                         self.topology_counter.count(
                             tree, tree_splits_encoded=True)
                 elif should_log:
                     self.send_info(
                         "(processing) '%s': tree at offset %d (skipping)"
                         % (source, tidx),
                         wrap=False)
                 if self.kill_received:
                     break
         finally:
             fsrc.close()
         if self.kill_received:
             break
         self.send_info("Completed task: '%s'." % (source), wrap=False)
     if self.kill_received:
         self.send_warning("Terminating in response to kill request.")
     else:
         self.result_split_dist_queue.put(self.split_distribution)
         self.result_topology_hash_map_queue.put(
             self.topology_counter.topology_hash_map)
Beispiel #31
0
    def testUltrametricTrees(self):
        """
        A tree rebuilt from the splits of a rooted reference tree must be
        at symmetric difference zero from that reference tree.
        """
        for tree_file in ("pythonidae.beast.summary.tre",
                          "primates.beast.mcct.medianh.tre"):
            src_path = pathmap.tree_source_path(tree_file)
            ref_tree = dendropy.Tree.get_from_path(src_path,
                    "nexus",
                    as_rooted=True)
            treesplit.encode_splits(ref_tree)
            rebuilt_tree = treesplit.tree_from_splits(
                    splits=ref_tree.split_edges.keys(),
                    taxon_set=ref_tree.taxon_set,
                    is_rooted=ref_tree.is_rooted)
            treesplit.encode_splits(rebuilt_tree)
            self.assertEqual(ref_tree.symmetric_difference(rebuilt_tree), 0)
        def countSplits(self, tc, is_rooted):
            """
            Compare split counts computed here against those reported by PAUP*.

            ``tc`` is a pair ``(tree_filename, taxa_filename)`` resolved through
            ``pathmap.tree_source_path``; ``is_rooted`` is forwarded both to the
            PAUP* wrapper and to the tree reader. Every non-trivial split counted
            locally must be present in the PAUP* distribution with the same
            count, and PAUP* must not report any further non-trivial split.
            """
            _LOG.info(tc[0] + "; " + tc[1])
            tree_filepaths = [pathmap.tree_source_path(tc[0])]
            taxa_filepath = pathmap.tree_source_path(tc[1])
            paup_sd = paup.get_split_distribution(tree_filepaths, taxa_filepath, is_rooted=is_rooted, burnin=0)
            taxon_set = paup_sd.taxon_set
            dp_sd = treesplit.SplitDistribution(taxon_set=taxon_set)
            dp_sd.ignore_edge_lengths = True
            dp_sd.ignore_node_ages = True
            dp_sd.is_rooted = is_rooted

            _LOG.debug("Taxon set: %s" % [t.label for t in taxon_set])
            taxon_set.lock()
            for tree_filepath in tree_filepaths:
                tree_stream = open(tree_filepath, "rU")
                # Make sure the tree file is closed even if an assertion fails.
                try:
                    for tree in dataio.tree_source_iter(
                        stream=tree_stream, schema="nexus", taxon_set=taxon_set, as_rooted=is_rooted
                    ):
                        self.assertIs(tree.taxon_set, dp_sd.taxon_set)
                        self.assertIs(tree.taxon_set, taxon_set)
                        treesplit.encode_splits(tree)
                        dp_sd.count_splits_on_tree(tree)
                finally:
                    tree_stream.close()

            self.assertEqual(dp_sd.total_trees_counted, paup_sd.total_trees_counted)

            # SplitsDistribution counts trivial splits, whereas PAUP*
            # contree does not, so the following will not work
            #            assert len(dp_sd.splits) == len(paup_sd.splits),\
            #                 "dp = %d, sd = %d" % (len(dp_sd.splits), len(paup_sd.splits))

            taxa_mask = taxon_set.all_taxa_bitmask()
            for split in dp_sd.splits:
                if not treesplit.is_trivial_split(split, taxa_mask):
                    self.assertIn(split, paup_sd.splits)
                    self.assertEqual(dp_sd.split_counts[split], paup_sd.split_counts[split])
                    paup_sd.splits.remove(split)

            # if any splits remain, they were not
            # in dp_sd or were trivial
            # (iterate over a copy because the list is modified in the loop)
            remaining_splits = list(paup_sd.splits)
            for split in remaining_splits:
                if treesplit.is_trivial_split(split, taxa_mask):
                    paup_sd.splits.remove(split)
            self.assertEqual(len(paup_sd.splits), 0)
def long_branch_symmdiff(trees_to_compare, edge_len_threshold, copy_trees=False, rooted=False):
    """Returns matrix of the symmetric differences between trees after all
    internal edges with lengths < `edge_len_threshold` have been collapsed.

    If `copy_trees` is True then the trees are copied (with `copy.copy`)
    first; if False, the input trees will have their short edges collapsed
    on exit. While comparing, every tree has `is_rooted` forced to
    `bool(rooted)`; for trees that were not copied the previous value is
    restored before returning.

    Note that when fewer than two trees are given, a flat list of zeros
    (one per tree) is returned rather than a nested matrix.
    """
    if copy_trees:
        tree_list = [copy.copy(tree) for tree in trees_to_compare]
    else:
        tree_list = list(trees_to_compare)

    n_trees = len(tree_list)
    _LOG.debug('%d Trees to compare:\n%s\n' % (n_trees, '\n'.join([str(i) for i in tree_list])))
    if n_trees < 2:
        return [0] * n_trees

    previous_rooting = []
    for tree in tree_list:
        encode_splits(tree)
        # Gather first, collapse afterwards: the tree must not be modified
        # while its edges are being iterated.
        short_edges = []
        for edge in tree.preorder_edge_iter(filter_fn=Edge.is_internal):
            length = edge.length
            if length is not None and length < edge_len_threshold:
                short_edges.append(edge)
        for edge in short_edges:
            collapse_edge(edge)
        previous_rooting.append(tree.is_rooted)
        tree.is_rooted = bool(rooted)
        encode_splits(tree)

    sd_mat = [[0] * n_trees for _ in xrange(n_trees)]
    # Fill the upper triangle and mirror it; the diagonal stays zero.
    for i in xrange(n_trees - 1):
        for j in xrange(i + 1, n_trees):
            sd = symmetric_difference(tree_list[i], tree_list[j])
            sd_mat[i][j] = sd
            sd_mat[j][i] = sd

    if not copy_trees:
        for was_rooted, tree in itertools.izip(previous_rooting, tree_list):
            tree.is_rooted = was_rooted
    return sd_mat
    def calc(self, tree=None, create_midpoints=None):
        """
        Calculates the distances.

        If ``tree`` is given it replaces ``self.tree``; the tree stored on
        the instance is the one processed. ``create_midpoints`` is accepted
        but not used in this method.

        For every pair of leaves, the path length between them is stored in
        ``self._pat_dists[taxon1][taxon2]`` and the node where their paths
        join in ``self._mrca[taxon1][taxon2]``. The largest distance found
        is kept in ``self.max_dist``, with the corresponding leaves in
        ``self.max_dist_nodes`` and their taxa in ``self.max_dist_taxa``.
        """
        if tree is not None:
            self.tree = tree
        assert self.tree is not None
        if not hasattr(self.tree, "split_edges"):
            treesplit.encode_splits(self.tree)
        self.taxon_set = self.tree.taxon_set
        self._pat_dists = {}
        # Initialise the running maximum exactly once (also for an empty
        # taxon set) instead of once per taxon.
        self.max_dist = None
        self.max_dist_taxa = None
        self.max_dist_nodes = None
        for t1 in self.taxon_set:
            self._pat_dists[t1] = {}
            self._mrca[t1] = {}

        for node in self.tree.postorder_node_iter():
            children = node.child_nodes()
            if len(children) == 0:
                node.desc_paths = {node : 0}
            else:
                node.desc_paths = {}
                for cidx1, c1 in enumerate(children):
                    for desc1, desc1_plen in c1.desc_paths.items():
                        node.desc_paths[desc1] = desc1_plen + c1.edge.length
                        # Only later siblings are paired with desc1, so that
                        # each leaf pair is handled once.
                        for c2 in children[cidx1+1:]:
                            for desc2, desc2_plen in c2.desc_paths.items():
                                pat_dist = node.desc_paths[desc1] + desc2_plen + c2.edge.length
                                self._pat_dists[desc1.taxon][desc2.taxon] = pat_dist
                                self._mrca[desc1.taxon][desc2.taxon] = c1.parent_node
                                # Test for None explicitly rather than
                                # comparing a number against None.
                                if self.max_dist is None or pat_dist > self.max_dist:
                                    self.max_dist = pat_dist
                                    midpoint = float(pat_dist) / 2
                                    if midpoint - node.desc_paths[desc1] <= 0:
                                        self.max_dist_nodes = (desc1, desc2)
                                        self.max_dist_taxa = (desc1.taxon, desc2.taxon)
                                    else:
                                        self.max_dist_nodes = (desc2, desc1)
                                        self.max_dist_taxa = (desc2.taxon, desc1.taxon)
                    # This child's paths now live on ``node``; drop the
                    # temporary attribute.
                    del(c1.desc_paths)
Beispiel #35
0
 def count_splits_on_trees(self, tree_iterator, split_distribution=None, trees_splits_encoded=False):
     """
     Collate the splits of every tree yielded by ``tree_iterator`` into a
     split distribution and return that distribution.

     A fresh ``treesplit.SplitDistribution`` is created unless one is passed
     in as ``split_distribution``. If the distribution has no taxon set yet,
     it adopts the taxon set of the first tree; every tree must then share
     that very same taxon set object. Unless ``trees_splits_encoded`` is
     true, splits are encoded on each tree before counting.
     ``self.total_trees_counted`` is incremented once per tree.
     """
     if split_distribution is None:
         split_distribution = treesplit.SplitDistribution()
     taxon_set = split_distribution.taxon_set
     for tree in tree_iterator:
         self.total_trees_counted += 1
         if taxon_set is not None:
             assert(taxon_set is tree.taxon_set)
         else:
             # First tree seen: it determines the taxon set from here on.
             assert(split_distribution.taxon_set is None)
             taxon_set = tree.taxon_set
             split_distribution.taxon_set = taxon_set
         if not trees_splits_encoded:
             treesplit.encode_splits(tree)
         split_distribution.count_splits_on_tree(tree)
     return split_distribution
Beispiel #36
0
def patristic_distance(tree, taxon1, taxon2):
    """
    Returns the patristic distance between ``taxon1`` and ``taxon2`` on
    ``tree``: the sum of the edge lengths on the paths from each of the two
    leaves up to their most recent common ancestor. Edges without a length
    (``None``) contribute nothing. Splits are encoded on the tree first if
    it has no ``split_edges`` attribute.

    Much more inefficient than constructing a PatristicDistanceMatrix
    object.
    """
    if not hasattr(tree, "split_edges"):
        treesplit.encode_splits(tree)
    ancestor = tree.mrca(taxa=[taxon1, taxon2])

    def _climb(taxon, total):
        # Walk from the leaf carrying ``taxon`` up to the common ancestor,
        # adding each edge length to the running total.
        current = tree.find_node(lambda x: x.taxon == taxon)
        while current != ancestor:
            length = current.edge.length
            if length is not None:
                total += length
            current = current.parent_node
        return total

    dist = _climb(taxon1, 0)
    dist = _climb(taxon2, dist)
    return dist
def patristic_distance(tree, taxon1, taxon2):
    """
    Given a tree and two taxa on that tree, returns the patristic distance
    between the two, i.e. the total length of the edges traversed when
    walking from each taxon's node up to the pair's most recent common
    ancestor. Edges whose length is ``None`` are ignored. If the tree has
    no ``split_edges`` attribute, its splits are encoded first.

    Much more inefficient than constructing a PatristicDistanceMatrix
    object.
    """
    if not hasattr(tree, "split_edges"):
        treesplit.encode_splits(tree)
    mrca = tree.mrca(taxa=[taxon1, taxon2])
    dist = 0
    for taxon in (taxon1, taxon2):
        # The lambda is consumed immediately by find_node, so capturing the
        # loop variable is safe here.
        nd = tree.find_node(lambda x: x.taxon == taxon)
        while nd != mrca:
            edge_length = nd.edge.length
            if edge_length is not None:
                dist += edge_length
            nd = nd.parent_node
    return dist
    def runTest(self):
        """
        Encode splits on a six-taxon tree and check ``split_to_list`` and
        ``count_bits`` on the root edge and on the root's first child edge,
        with and without a mask and with zero- and one-based indexing.
        """
        newick = """((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);"""
        tree_list = dendropy.TreeList(stream=StringIO(newick),
                                      schema="newick")
        for tree in tree_list:
            _LOG.debug(tree.get_indented_form())
            treesplit.encode_splits(tree)
            _LOG.debug(tree.get_indented_form(splits=True))
            tree.debug_check_tree(splits=True, logger_obj=_LOG)

        # Root edge: its split covers all six taxa.
        root_node = tree_list[0].seed_node
        root_edge = root_node.edge
        self.assertEqual(
            treesplit.split_to_list(root_edge.split_bitmask), range(6))
        self.assertEqual(
            treesplit.split_to_list(root_edge.split_bitmask, one_based=True),
            range(1, 7))
        self.assertEqual(
            treesplit.split_to_list(
                root_edge.split_bitmask, mask=21, one_based=True),
            [1, 3, 5])
        self.assertEqual(
            treesplit.split_to_list(root_edge.split_bitmask, mask=21),
            [0, 2, 4])
        self.assertEqual(treesplit.count_bits(root_edge.split_bitmask), 6)

        # Edge leading to the first child of the root.
        first_child = root_node.child_nodes()[0]
        child_edge = first_child.edge
        self.assertEqual(
            treesplit.split_to_list(child_edge.split_bitmask), [0, 1])
        self.assertEqual(
            treesplit.split_to_list(child_edge.split_bitmask, one_based=True),
            [1, 2])
        self.assertEqual(
            treesplit.split_to_list(
                child_edge.split_bitmask, mask=0x15, one_based=True),
            [1])
        self.assertEqual(
            treesplit.split_to_list(child_edge.split_bitmask, mask=0x15),
            [0])
        self.assertEqual(treesplit.count_bits(child_edge.split_bitmask), 2)
    def runTest(self):
        """
        Check ``split_to_list`` / ``count_bits`` against the split bitmasks
        of the root edge and the root's first child edge of a six-taxon
        tree, covering masked and one-based variants.
        """
        tree_list = dendropy.TreeList(
            stream=StringIO("""((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);"""),
            schema="newick")
        for tree in tree_list:
            _LOG.debug(tree.get_indented_form())
            treesplit.encode_splits(tree)
            _LOG.debug(tree.get_indented_form(splits=True))
            tree.debug_check_tree(splits=True, logger_obj=_LOG)

        root_node = tree_list[0].seed_node
        # (keyword arguments for split_to_list, expected result)
        root_expectations = [
            ({}, range(6)),
            ({"one_based": True}, range(1,7)),
            ({"mask": 21, "one_based": True}, [1, 3, 5]),
            ({"mask": 21}, [0, 2, 4]),
        ]
        for kwargs, expected in root_expectations:
            self.assertEqual(treesplit.split_to_list(root_node.edge.split_bitmask, **kwargs), expected)
        self.assertEqual(treesplit.count_bits(root_node.edge.split_bitmask), 6)

        first_child = root_node.child_nodes()[0]
        child_expectations = [
            ({}, [0, 1]),
            ({"one_based": True}, [1, 2]),
            ({"mask": 0x15, "one_based": True}, [1]),
            ({"mask": 0x15}, [0]),
        ]
        for kwargs, expected in child_expectations:
            self.assertEqual(treesplit.split_to_list(first_child.edge.split_bitmask, **kwargs), expected)
        self.assertEqual(treesplit.count_bits(first_child.edge.split_bitmask), 2)
Beispiel #40
0
def inplace_strict_consensus_merge(trees_to_merge,
                                   rooted=False,
                                   gordons_supertree=False):
    """Returns a tree that is the strict consensus merger of the input trees.

    The merge happens in place: the first tree is modified and returned,
    and every other tree is removed from the ``trees_to_merge`` list and
    consumed by ``add_to_scm``. With a single input tree, that tree is
    returned untouched. Only the unrooted form is supported; asking for a
    rooted merge of two or more trees raises NotImplementedError.
    """
    tree_list = list(trees_to_merge)
    # Only the first (merged) tree is left in the caller's list.
    del trees_to_merge[1:]
    nTrees = len(tree_list)
    _LOG.debug('%d Trees to merge:\n%s\n' %
               (nTrees, '\n'.join([str(i) for i in tree_list])))
    if nTrees < 2:
        return tree_list[0]
    merged, remaining = tree_list[0], tree_list[1:]

    if rooted:
        raise NotImplementedError("Rooted SCM is not implemented")
    # From here on ``rooted`` is known to be false.
    merged.deroot()
    encode_splits(merged)
    if _IS_DEBUG_LOGGING:
        assert merged._debug_tree_is_valid(check_splits=False)
    for incoming in remaining:
        incoming.deroot()
        encode_splits(incoming)
        if _IS_DEBUG_LOGGING:
            assert incoming._debug_tree_is_valid(check_splits=True)
        add_to_scm(merged,
                   incoming,
                   rooted,
                   gordons_supertree=gordons_supertree)
        if _IS_DEBUG_LOGGING:
            assert merged._debug_tree_is_valid(check_splits=False)

    return merged
Beispiel #41
0
	# Load four trees onto one shared taxon set so that their splits are
	# comparable, then report false positives / negatives between them.
	# Paths are taken from the command line: argv[1..4].
	taxa = dendropy.TaxonSet()
	true_tree = dendropy.Tree.get_from_path(sys.argv[1],"Newick",taxon_set = taxa) # true tree (bigtree)
	mrp_tree = dendropy.Tree.get_from_path(sys.argv[2],"Nexus",taxon_set = taxa) # MRP tree
	mrp_con = dendropy.Tree.get_from_path(sys.argv[3],"Nexus",taxon_set = taxa) # presumably the MRP consensus tree (going by the name) -- TODO confirm
	sas_tree = dendropy.Tree.get_from_path(sys.argv[4],"Newick",taxon_set = taxa) # SAS tree
	# Exactly one leaf labelled 'roottaxon' is expected in the MRP tree.
	# NOTE(review): `to_prune` is only checked for its length; nothing in
	# this fragment actually removes the node -- confirm that is intended.
	to_prune = []
	for node in mrp_tree.leaf_nodes():
		if node.taxon.label == 'roottaxon':
			to_prune.append(node)
	assert(len(to_prune) == 1)
	# Restrict the SAS and true trees to the taxa present in the MRP tree.
	included = set([node.taxon for node in mrp_tree.leaf_nodes()])
	#print "number of leaves",len(included)
	#print mrp_tree.leaf_nodes()
	prune_tree_to_included(sas_tree, included)
	prune_tree_to_included(true_tree, included)
	# NOTE(review): mrp_con is neither pruned nor split-encoded here before
	# being compared below -- verify that false_positives_and_negatives
	# handles that.
	encode_splits(true_tree)
	encode_splits(mrp_tree)
	encode_splits(sas_tree)
	mrp_distance = true_tree.false_positives_and_negatives(mrp_tree)
	sas_distance = true_tree.false_positives_and_negatives(sas_tree)
	mrp_to_mrpcon = mrp_tree.false_positives_and_negatives(mrp_con)
	sas_to_mrpcon = sas_tree.false_positives_and_negatives(mrp_con)
	#print sas_tree.as_ascii_plot() 
	#print mrp_tree.as_ascii_plot()
	#print true_tree.as_ascii_plot()
	# Each result is indexed as a pair; elements 0 and 1 are printed.
	print "mrp to mrp con",mrp_to_mrpcon[0],mrp_to_mrpcon[1]
	print "sas to mrp con",sas_to_mrpcon[0],sas_to_mrpcon[1]
	print "true MRP",mrp_distance[0],mrp_distance[1]
	print "true SAS",sas_distance[0],sas_distance[1]
	mrp_to_sas = mrp_tree.false_positives_and_negatives(sas_tree)
	print "MRP SAS ",mrp_to_sas[0],mrp_to_sas[1]
Beispiel #42
0
 # Fragment: `taxa`, `true_tree` and `mrp_tree` are defined before this
 # point, outside the visible code.
 mrp_con = dendropy.Tree.get_from_path(sys.argv[3], "Nexus",
                                       taxon_set=taxa)  # presumably the MRP consensus tree (going by the name) -- TODO confirm
 sas_tree = dendropy.Tree.get_from_path(sys.argv[4],
                                        "Newick",
                                        taxon_set=taxa)  # SAS tree
 # Exactly one leaf labelled 'roottaxon' is expected in the MRP tree.
 # NOTE(review): `to_prune` is only checked for its length; nothing in this
 # fragment removes the node -- confirm that is intended.
 to_prune = []
 for node in mrp_tree.leaf_nodes():
     if node.taxon.label == 'roottaxon':
         to_prune.append(node)
 assert (len(to_prune) == 1)
 # Restrict the SAS and true trees to the taxa present in the MRP tree.
 included = set([node.taxon for node in mrp_tree.leaf_nodes()])
 #print "number of leaves",len(included)
 #print mrp_tree.leaf_nodes()
 prune_tree_to_included(sas_tree, included)
 prune_tree_to_included(true_tree, included)
 # NOTE(review): mrp_con is neither pruned nor split-encoded here before
 # being compared below -- verify.
 encode_splits(true_tree)
 encode_splits(mrp_tree)
 encode_splits(sas_tree)
 mrp_distance = true_tree.false_positives_and_negatives(mrp_tree)
 sas_distance = true_tree.false_positives_and_negatives(sas_tree)
 mrp_to_mrpcon = mrp_tree.false_positives_and_negatives(mrp_con)
 sas_to_mrpcon = sas_tree.false_positives_and_negatives(mrp_con)
 #print sas_tree.as_ascii_plot()
 #print mrp_tree.as_ascii_plot()
 #print true_tree.as_ascii_plot()
 # Each result is indexed as a pair; elements 0 and 1 are printed.
 print "mrp to mrp con", mrp_to_mrpcon[0], mrp_to_mrpcon[1]
 print "sas to mrp con", sas_to_mrpcon[0], sas_to_mrpcon[1]
 print "true MRP", mrp_distance[0], mrp_distance[1]
 print "true SAS", sas_distance[0], sas_distance[1]
 mrp_to_sas = mrp_tree.false_positives_and_negatives(sas_tree)
 print "MRP SAS ", mrp_to_sas[0], mrp_to_sas[1]
Beispiel #43
0
def add_to_scm(to_modify, to_consume, rooted=False, gordons_supertree=False):
    """Adds the tree `to_consume` to the tree `to_modify` in a strict consensus
    merge operation.  Both trees must have had encode_splits called on them.

    Both trees must share the same taxon set object and have at least two
    leaves in common (ValueError otherwise). Only the unrooted form is
    implemented; `rooted=True` raises NotImplementedError.

    `to_modify` is changed in place and has its splits re-encoded at the
    end (except in the two-common-leaves case, which rebuilds `split_edges`
    by hand and returns early). Nodes are moved out of `to_consume`, so that
    tree should be treated as consumed afterwards. `gordons_supertree`
    selects an alternative way of resolving paths that collide in both
    trees. Nothing is returned.
    """
    assert (to_modify.taxon_set is to_consume.taxon_set)
    taxon_set = to_consume.taxon_set
    if rooted:
        raise NotImplementedError("rooted form of add_to_scm not implemented")
    to_mod_root = to_modify.seed_node
    to_mod_split = to_mod_root.edge.split_bitmask

    to_consume_root = to_consume.seed_node
    to_consume_split = to_consume_root.edge.split_bitmask

    # The root split bitmasks are ANDed to get the leaves shared by both
    # trees.
    leaf_intersection = to_mod_split & to_consume_split
    if _IS_DEBUG_LOGGING:
        _LOG.debug("add_to_scm:\n  %s\n  + %s\n%s" %
                   (str(to_modify), str(to_consume),
                    format_split(leaf_intersection, taxon_set=taxon_set)))

    n_common_leaves = count_bits(leaf_intersection)
    if n_common_leaves < 2:
        _LOG.error('trees must have at least 2 common leaves')
        raise ValueError('trees must have at least 2 common leaves')
    if n_common_leaves == 2:
        # SCM with 2 leaves in common results in a polytomy
        collapse_clade(to_mod_root)
        collapse_clade(to_consume_root)
        # Children of the consumed root that are not among the shared
        # leaves are moved over to the root of to_modify.
        leaves_to_steal = [
            c for c in to_consume_root.child_nodes()
            if not (leaf_intersection & c.edge.split_bitmask)
        ]
        for leaf in leaves_to_steal:
            to_mod_root.add_child(leaf)
            to_mod_root.edge.split_bitmask |= leaf.edge.split_bitmask
        # Rebuild the split -> edge map by hand: the root edge plus one
        # entry per child of the root.
        to_modify.split_edges = {
            to_mod_root.edge.split_bitmask: to_mod_root.edge
        }
        for child in to_mod_root.child_nodes():
            to_modify.split_edges[child.edge.split_bitmask] = child.edge
        return

    # at least 3 leaves in common
    tmse = to_modify.split_edges

    # Maps from a split restricted to the shared leaves to the edges whose
    # split has that restriction (lists of (split, edge) pairs at first,
    # reduced to lists of edges further down).
    to_mod_relevant_splits = {}
    to_consume_relevant_splits = {}
    if not rooted:
        if _IS_DEBUG_LOGGING:
            to_modify.debug_check_tree(check_splits=True, logger_obj=_LOG)
            to_consume.debug_check_tree(check_splits=True, logger_obj=_LOG)

        reroot_on_lowest_common_index_path(to_modify, leaf_intersection)
        reroot_on_lowest_common_index_path(to_consume, leaf_intersection)

        if _IS_DEBUG_LOGGING:
            to_modify.debug_check_tree(check_splits=True, logger_obj=_LOG)
            to_consume.debug_check_tree(check_splits=True, logger_obj=_LOG)

        # Re-read the roots after rerooting; the root splits are asserted
        # to be unchanged by it.
        to_mod_root = to_modify.seed_node
        assert (to_mod_root.edge.split_bitmask == to_mod_split)
        to_consume_root = to_consume.seed_node
        assert (to_consume_root.edge.split_bitmask == to_consume_split)

    # Group the edges of each tree by their split masked to the shared
    # leaves, skipping masks that are empty or cover all shared leaves.
    # Note that the dictionary key `s` is overwritten with the edge's own
    # current split_bitmask.
    for s, e in tmse.iteritems():
        s = e.split_bitmask
        masked = s & leaf_intersection
        if masked and masked != leaf_intersection:
            e_list = to_mod_relevant_splits.setdefault(masked, [])
            e_list.append((s, e))

    for s, e in to_consume.split_edges.iteritems():
        s = e.split_bitmask
        masked = s & leaf_intersection
        if masked and masked != leaf_intersection:
            e_list = to_consume_relevant_splits.setdefault(masked, [])
            e_list.append((s, e))

    # Because each of these paths radiates away from the root (none of the paths
    #   cross the root), the split_bitmasks for deeper edges will be supersets
    #   of the split_bitmasks for shallower nodes.  Thus if we reverse sort we
    #   get the edges in the order root->tip
    # After sorting, each list is replaced in place by its edges only.
    for split, path in to_mod_relevant_splits.iteritems():
        path.sort(reverse=True)
        t = [i[1] for i in path]
        del path[:]
        path.extend(t)
    for split, path in to_consume_relevant_splits.iteritems():
        path.sort(reverse=True)
        t = [i[1] for i in path]
        del path[:]
        path.extend(t)
    if _IS_DEBUG_LOGGING:
        to_modify.debug_check_tree(check_splits=True, logger_obj=_LOG)
        to_consume.debug_check_tree(check_splits=True, logger_obj=_LOG)

    # first we'll collapse all paths in the common leafset in to_modify that
    #   are not in to_consume
    # NOTE(review): only this first call passes the split_edges dict
    #   (tmse) as a third argument -- presumably so the helper can keep it
    #   in sync; confirm against _collapse_paths_not_found.
    _collapse_paths_not_found(to_mod_relevant_splits,
                              to_consume_relevant_splits, tmse)
    # Now we'll collapse all paths in the common leafset in to_consume that
    #   are not in to_modify
    _collapse_paths_not_found(to_consume_relevant_splits,
                              to_mod_relevant_splits)

    # first we'll deal with subtrees that are:
    #       - not in the leaf intersection set, and
    #       - attached to "relevant" nodes
    # We simply move these subtrees from the to_consume tree to the appropriate
    #   node in to_modify
    to_steal = [
        i for i in to_consume_root.child_nodes()
        if (i.edge.split_bitmask & leaf_intersection) == 0
    ]
    for child in to_steal:
        to_mod_root.add_child(child)
        to_mod_root.edge.split_bitmask |= child.edge.split_bitmask

    # Walk the remaining paths of to_consume and graft their material onto
    # the path with the same masked split in to_modify.
    for masked_split, to_consume_path in to_consume_relevant_splits.iteritems(
    ):
        to_mod_path = to_mod_relevant_splits.get(masked_split)
        if _IS_DEBUG_LOGGING and to_mod_path is None:  #to_mod_path is None:
            # Dump diagnostics before the assertion below fails.
            _LOG.debug("%s = mask" %
                       format_split(leaf_intersection, taxon_set=taxon_set))
            _LOG.debug("%s = masked" %
                       format_split(masked_split, taxon_set=taxon_set))
            _LOG.debug("%s = raw" % format_split(
                to_consume_path[-1].split_bitmask, taxon_set=taxon_set))
            for k, v in to_mod_relevant_splits.iteritems():
                _LOG.debug("%s in to_mod_relevant_splits" %
                           format_split(k, taxon_set=taxon_set))

        assert to_mod_path is not None
        # The last edge of each path is the tip-most one (paths are ordered
        # root -> tip).
        to_mod_head = to_mod_path[-1].head_node
        to_mod_head_edge = to_mod_head.edge
        to_consume_head = to_consume_path[-1].head_node
        for child in to_consume_head.child_nodes():
            if (child.edge.split_bitmask & leaf_intersection) == 0:
                # child is the root of a subtree that has no children in the leaf_intersection
                to_mod_head.add_child(child)
                to_mod_head_edge.split_bitmask |= child.edge.split_bitmask
        if len(to_consume_path) > 1:
            if len(to_mod_path) > 1:
                # collision
                # (both trees have more than one edge along this path)
                if gordons_supertree:
                    for edge in to_mod_path[2:]:
                        p = edge.tail_node
                        c = edge.head_node
                        sibs = p.child_nodes()
                        for sib in sibs:
                            _LOG.debug("sib is %s" % (sib.compose_newick()))
                            if sib is not c:
                                if not sib.is_leaf():
                                    collapse_clade(sib)
                                    collapse_edge(sib.edge)
                        collapse_edge(p.edge)
                    mid_node = to_mod_path[0].head_node
                    for edge in to_consume_path[1:]:
                        p = edge.tail_node
                        avoid = edge.head_node
                        for child in p.child_nodes():
                            _LOG.debug("child is %s" %
                                       (child.compose_newick()))
                            if child is not avoid:
                                mid_node.add_child(child)
                                collapse_clade(child)
                                if not child.is_leaf():
                                    collapse_edge(child.edge)
                                mid_node.edge.split_bitmask |= child.edge.split_bitmask
                else:
                    # Collapse the interior edges of the to_modify path and
                    # hang the off-path children of the to_consume path on
                    # the head node of the first to_modify edge.
                    for edge in to_mod_path[1:-1]:
                        collapse_edge(edge)
                    mid_node = to_mod_path[0].head_node
                    for edge in to_consume_path[1:]:
                        p = edge.tail_node
                        avoid = edge.head_node
                        for child in p.child_nodes():
                            if child is not avoid:
                                mid_node.add_child(child)
                                mid_node.edge.split_bitmask |= child.edge.split_bitmask
            else:
                # we have to move the subtrees from to_consume to to_modify
                to_mod_edge = to_mod_path[0]
                to_mod_tail, to_mod_head = to_mod_edge.tail_node, to_mod_edge.head_node
                deepest_edge_to_move = to_consume_path[0]
                deepest_node_to_move = deepest_edge_to_move.head_node
                tipmost_edge_to_move = to_consume_path[-1]
                tipmost_node_to_move = tipmost_edge_to_move.tail_node
                prev_head = tipmost_edge_to_move.head_node

                # Splice the to_consume chain in between to_mod_tail and
                # to_mod_head, dropping the old tip of the consumed chain.
                to_mod_tail.add_child(deepest_node_to_move)
                to_mod_tail.remove_child(to_mod_head)
                tipmost_node_to_move.add_child(to_mod_head)
                tipmost_node_to_move.remove_child(prev_head)
    # Recompute all split bitmasks of the merged tree from scratch.
    encode_splits(to_modify)
 # Fragment: the enclosing function header (which supplies `fo`) and the
 # end of the final loop lie outside the visible code.
 # Read a Newick stream and start building, for every taxon, a string of
 # per-split codes.
 dataset = DataSet()
 try:
     dataset.read(stream=fo, schema="Newick")
 except DataParseError as dfe:
     # Surface parse failures to the caller as a plain ValueError.
     raise ValueError(str(dfe))
 # NOTE(review): `f` is not defined anywhere in the visible code (the
 # stream is called `fo`) -- verify these messages do not raise NameError.
 if len(dataset.taxon_sets) != 1:
     raise ValueError("Expecting one set of taxa in %s" % f)
 if len(dataset.tree_lists) != 1:
     raise ValueError("Expecting one tree in %s" % f)
 taxon_set = dataset.taxon_sets[0]
 tree_list = dataset.tree_lists[0]
 number_of_taxon = len(taxon_set)
 branch_counter = 0
 # One output buffer per taxon, in taxon-set order.
 code_list = [StringIO() for i in taxon_set]
 for tree in tree_list:
     encode_splits(tree)
     tree_mask = tree.seed_node.edge.split_bitmask
     assert tree_mask is not None
     # Indices of the taxa in the root split of this tree.
     tree_tax = set(split_to_list(tree_mask))
     #print tree_tax
     split_list = []
     # Collect, as a set of taxon indices, the split of every internal
     # node that has a parent (i.e. every internal node except the root).
     for node in tree.postorder_internal_node_iter():
         if node.parent_node is not None:
             branch_counter +=1
             split_set = set(split_to_list(node.edge.split_bitmask))
             split_list.append(split_set)
     # For each taxon in this tree, write '1' for every split containing it.
     for i,stream in enumerate(code_list):
         if i in tree_tax:
             for split in split_list:
                 if i in split:
                     stream.write('1')
Beispiel #45
0
    def runTest(self):
        """
        ``collapse_conflicting`` applied to a tree with a target split must
        yield the expected, less resolved tree (symmetric difference 0).

        The six trees are read in (input, expected) pairs; each pair is
        checked with its own target split.
        """
        taxon_set = dendropy.TaxonSet([str(i + 1) for i in range(5)])
        tree_list = dendropy.TreeList(stream=StringIO("""
            (5,((4,3),2),1);
            (5,(4,3,2),1);
            (5,((4,3),2),1);
            (5,(4,3),2,1);
            (5,((4,3),2),1);
            (5,4,3,2,1);
            """),
                                      schema="newick",
                                      taxon_set=taxon_set)
        # (index of input tree, index of expected tree, split to target)
        cases = (
            (0, 1, 0xA),
            (2, 3, 0x3),
            (4, 5, 0x5),
        )
        for tree_idx, expected_idx, split_to_target in cases:
            tree = tree_list[tree_idx]
            expected_tree = tree_list[expected_idx]
            treesplit.encode_splits(tree)
            all_cm = tree.seed_node.edge.split_bitmask
            treemanip.collapse_conflicting(tree.seed_node, split_to_target, all_cm)
            # Splits have to be re-encoded after the topology change.
            treesplit.encode_splits(tree)
            treesplit.encode_splits(expected_tree)
            self.assertEqual(treecalc.symmetric_difference(tree, expected_tree), 0)
def get_length_diffs(tree1,
        tree2,
        edge_length_attr="length",
        value_type=float,
        split_length_diff_map=False):
    """
    Returns a list of tuples, with the first element of each tuple representing
    the length of the branch subtending a particular split on ``tree1``, and
    the second element the length of the same branch on ``tree2``. If a
    particular split is found on one tree but not in the other, a value of zero
    is used for the missing split.

    ``edge_length_attr`` names the edge attribute that is read as the length,
    and every length is passed through ``value_type`` before being stored.
    If ``split_length_diff_map`` is true, a 2-tuple is returned instead: the
    list described above plus a dict mapping each split to its tuple.

    Lengths of ``None`` are handled asymmetrically, as in the original code:
    on ``tree1`` they always count as 0; on ``tree2`` they count as 0 for
    splits missing from ``tree1`` and for an edge without a tail node, but
    raise for any other edge whose split is also on ``tree1``.

    Raises ``TypeError`` if the trees do not share the same TaxonSet object,
    and ``ValueError`` for the ``None`` length case described above.
    """
    if tree1.taxon_set is not tree2.taxon_set:
        raise TypeError("Trees have different TaxonSet objects: %s vs. %s" \
                % (hex(id(tree1.taxon_set)), hex(id(tree2.taxon_set))))
    if not hasattr(tree1, "split_edges"):
        treesplit.encode_splits(tree1)
    if not hasattr(tree2, "split_edges"):
        treesplit.encode_splits(tree2)

    length_diffs = []
    split_length_diffs = {}
    # Working copy of tree2's splits: entries are popped as they are matched,
    # so whatever remains after the first loop is unique to tree2.
    unmatched2 = dict(tree2.split_edges)

    # ``items()`` rather than ``iteritems()``: the latter exists only on
    # Python 2 dicts, the former works on both major versions.
    for split, edge in tree1.split_edges.items():
        elen1 = getattr(edge, edge_length_attr)
        if elen1 is None:
            elen1 = 0
        value1 = value_type(elen1)
        # Only the pop is guarded, so a KeyError raised while reading the
        # edge or building the error message is no longer mistaken for
        # "split missing from tree2".
        try:
            e2 = unmatched2.pop(split)
        except KeyError:
            elen2 = 0.0
        else:
            elen2 = getattr(e2, edge_length_attr)
            if elen2 is None:
                # allow root edge to have split with no value: raise error if not root edge
                if e2.tail_node is None:
                    elen2 = 0.0
                else:
                    raise ValueError("Edge length attribute is 'None': Tree: %s ('%s'), Split: %s" % (tree2.oid, tree2.label, tree2.taxon_set.split_as_newick_string(split)))
        pair = (value1, value_type(elen2))
        length_diffs.append(pair)
        split_length_diffs[split] = pair

    # Splits found only on tree2.  Every split shared with tree1 was popped
    # above, so tree1 never needs to be consulted here and its side of the
    # pair is always zero.
    for split, edge in unmatched2.items():
        elen2 = getattr(edge, edge_length_attr)
        if elen2 is None:
            elen2 = 0
        value2 = value_type(elen2)
        pair = (value_type(0.0), value2)
        length_diffs.append(pair)
        split_length_diffs[split] = pair

    # Cost: one dict copy of tree2's splits, then one pop per tree1 split.
    # The second loop does no work when both trees carry the same splits and
    # visits every tree2 split when they share none.
    if split_length_diff_map:
        return length_diffs, split_length_diffs
    else:
        return length_diffs
    def runTest(self):
        """
        Runs ``treemanip.collapse_conflicting`` against three fixtures.

        The Newick source holds six trees on taxa "1".."5", read as
        consecutive (input, expected) pairs. Each input tree has its splits
        encoded, is passed through ``collapse_conflicting`` together with a
        target split mask and its root edge's split mask, and is then
        required to be at symmetric difference 0 from its expected tree.
        """
        taxa = dendropy.TaxonSet([str(number) for number in range(1, 6)])
        trees = dendropy.TreeList(
            stream=StringIO("""
            (5,((4,3),2),1);
            (5,(4,3,2),1);
            (5,((4,3),2),1);
            (5,(4,3),2,1);
            (5,((4,3),2),1);
            (5,4,3,2,1);
            """),
            schema="newick",
            taxon_set=taxa)

        # target split mask for each consecutive (input, expected) pair
        target_splits = (0xA, 0x3, 0x5)
        for pair_no, target_split in enumerate(target_splits):
            working = trees[2 * pair_no]
            wanted = trees[2 * pair_no + 1]
            treesplit.encode_splits(working)
            root_mask = working.seed_node.edge.split_bitmask
            treemanip.collapse_conflicting(working.seed_node, target_split, root_mask)
            # both trees get their splits (re-)encoded before the comparison
            treesplit.encode_splits(working)
            treesplit.encode_splits(wanted)
            self.assertEqual(treecalc.symmetric_difference(working, wanted), 0)
Beispiel #48
0
def add_to_scm(to_modify, to_consume, rooted=False, gordons_supertree=False):
    """Adds the tree `to_consume` to the tree `to_modify` in a strict consensus
    merge operation.  Both trees must have had encode_splits called on them.

    `to_modify` is changed in place and receives nodes taken from
    `to_consume`; nothing is returned.  Both trees must reference the same
    `taxon_set` object (asserted).

    `rooted` must be false: a true value raises NotImplementedError.
    `gordons_supertree` selects the alternative handling of colliding paths
    in the branch marked "collision" below.

    Raises ValueError if the trees have fewer than two leaves in common.
    """
    assert(to_modify.taxon_set is to_consume.taxon_set)
    taxon_set = to_consume.taxon_set
    if rooted:
        raise NotImplementedError("rooted form of add_to_scm not implemented")
    to_mod_root = to_modify.seed_node
    to_mod_split = to_mod_root.edge.split_bitmask

    to_consume_root = to_consume.seed_node
    to_consume_split = to_consume_root.edge.split_bitmask

    # The root edges' bitmasks are treated as each tree's leaf set, so their
    # AND is the set of shared leaves.
    # NOTE(review): presumes one bit per leaf in the root bitmask -- confirm
    # against encode_splits.
    leaf_intersection = to_mod_split & to_consume_split
    if _IS_DEBUG_LOGGING:
        _LOG.debug("add_to_scm:\n  %s\n  + %s\n%s" % (str(to_modify), str(to_consume), format_split(leaf_intersection, taxon_set=taxon_set)))

    n_common_leaves = count_bits(leaf_intersection)
    if n_common_leaves < 2:
        _LOG.error('trees must have at least 2 common leaves')
        raise ValueError('trees must have at least 2 common leaves')
    if n_common_leaves == 2:
        # SCM with 2 leaves in common results in a polytomy
        collapse_clade(to_mod_root)
        collapse_clade(to_consume_root)
        # Take every child of the consumed root that carries no shared leaf.
        leaves_to_steal = [c for c in to_consume_root.child_nodes() if not (leaf_intersection & c.edge.split_bitmask)]
        for leaf in leaves_to_steal:
            to_mod_root.add_child(leaf)
            to_mod_root.edge.split_bitmask |= leaf.edge.split_bitmask
        # Rebuild the split -> edge map from scratch: only the root edge and
        # the edges of its direct children are registered.
        to_modify.split_edges = {to_mod_root.edge.split_bitmask : to_mod_root.edge}
        for child in to_mod_root.child_nodes():
            to_modify.split_edges[child.edge.split_bitmask] = child.edge
        return

    # at least 3 leaves in common
    tmse = to_modify.split_edges

    # Both dicts map a split masked by the shared leaves to the edges that
    # produce that masked split.
    to_mod_relevant_splits = {}
    to_consume_relevant_splits = {}
    if not rooted:
        if _IS_DEBUG_LOGGING:
            to_modify.debug_check_tree(check_splits=True, logger_obj=_LOG)
            to_consume.debug_check_tree(check_splits=True, logger_obj=_LOG)

        reroot_on_lowest_common_index_path(to_modify, leaf_intersection)
        reroot_on_lowest_common_index_path(to_consume, leaf_intersection)

        if _IS_DEBUG_LOGGING:
            to_modify.debug_check_tree(check_splits=True, logger_obj=_LOG)
            to_consume.debug_check_tree(check_splits=True, logger_obj=_LOG)

        # Rerooting may have replaced the seed nodes, so fetch them again;
        # the root bitmasks are asserted to be unchanged.
        to_mod_root = to_modify.seed_node
        assert(to_mod_root.edge.split_bitmask == to_mod_split)
        to_consume_root = to_consume.seed_node
        assert(to_consume_root.edge.split_bitmask == to_consume_split)

    # Group the edges of to_modify by masked split.  The dict key is ignored
    # in favour of the edge's current bitmask.  Edges whose masked split is
    # empty or equals the whole intersection are skipped.
    for s, e in tmse.iteritems():
        s = e.split_bitmask
        masked = s & leaf_intersection
        if masked and masked != leaf_intersection:
            e_list = to_mod_relevant_splits.setdefault(masked, [])
            e_list.append((s, e))

    # Same grouping for to_consume.
    for s, e in to_consume.split_edges.iteritems():
        s = e.split_bitmask
        masked = s & leaf_intersection
        if masked and masked != leaf_intersection:
            e_list = to_consume_relevant_splits.setdefault(masked, [])
            e_list.append((s, e))

    # Because each of these paths radiates away from the root (none of the paths
    #   cross the root), the split_bitmasks for deeper edges will be supersets
    #   of the split_bitmasks for shallower nodes.  Thus if we reverse sort we
    #   get the edges in the order root->tip
    # After sorting, each list is rewritten in place to hold only the edges
    #   (the bitmask was needed only as the sort key).
    for split, path in to_mod_relevant_splits.iteritems():
        path.sort(reverse=True)
        t = [i[1] for i in path]
        del path[:]
        path.extend(t)
    for split, path in to_consume_relevant_splits.iteritems():
        path.sort(reverse=True)
        t = [i[1] for i in path]
        del path[:]
        path.extend(t)
    if _IS_DEBUG_LOGGING:
        to_modify.debug_check_tree(check_splits=True, logger_obj=_LOG)
        to_consume.debug_check_tree(check_splits=True, logger_obj=_LOG)


    # first we'll collapse all paths in the common leafset in to_modify that
    #   are not in to_consume
    _collapse_paths_not_found(to_mod_relevant_splits, to_consume_relevant_splits, tmse)
    # Now we'll collapse all paths in the common leafset in to_consume that
    #   are not in to_modify
    _collapse_paths_not_found(to_consume_relevant_splits, to_mod_relevant_splits)


    # first we'll deal with subtrees that are:
    #       - not in the leaf intersection set, and
    #       - attached to "relevant" nodes
    # We simply move these subtrees from the to_consume tree to the appropriate
    #   node in to_modify
    to_steal = [i for i in to_consume_root.child_nodes() if (i.edge.split_bitmask & leaf_intersection) == 0]
    for child in to_steal:
        to_mod_root.add_child(child)
        to_mod_root.edge.split_bitmask |= child.edge.split_bitmask

    # Walk every masked split of to_consume; after the collapsing above the
    # same masked split is expected in to_modify (asserted below).
    for masked_split, to_consume_path in to_consume_relevant_splits.iteritems():
        to_mod_path = to_mod_relevant_splits.get(masked_split)
        # Dump diagnostics just before the assertion below would fail.
        if _IS_DEBUG_LOGGING and to_mod_path is None: #to_mod_path is None:
            _LOG.debug("%s = mask" % format_split(leaf_intersection, taxon_set=taxon_set))
            _LOG.debug("%s = masked" % format_split(masked_split, taxon_set=taxon_set))
            _LOG.debug("%s = raw" % format_split(to_consume_path[-1].split_bitmask, taxon_set=taxon_set))
            for k, v in to_mod_relevant_splits.iteritems():
                _LOG.debug("%s in to_mod_relevant_splits" % format_split(k, taxon_set=taxon_set))

        assert to_mod_path is not None
        # The last edge of each path is the tipmost one (paths are sorted
        # root->tip); its head node is where unshared subtrees are grafted.
        to_mod_head = to_mod_path[-1].head_node
        to_mod_head_edge = to_mod_head.edge
        to_consume_head = to_consume_path[-1].head_node
        for child in to_consume_head.child_nodes():
            if (child.edge.split_bitmask & leaf_intersection) == 0:
                # child is the root of a subtree that has no children in the leaf_intersection
                to_mod_head.add_child(child)
                to_mod_head_edge.split_bitmask |= child.edge.split_bitmask
        if len(to_consume_path) > 1:
            if len(to_mod_path) > 1:
                # collision
                # Both trees have more than one edge along this masked split.
                if gordons_supertree:
                    # Collapse the non-leaf siblings hanging off the
                    # to_modify path (from its third edge on), then the
                    # edge above each parent.
                    for edge in to_mod_path[2:]:
                        p = edge.tail_node
                        c = edge.head_node
                        sibs = p.child_nodes()
                        for sib in sibs:
                            _LOG.debug("sib is %s" % (sib.compose_newick()))
                            if sib is not c:
                                if not sib.is_leaf():
                                    collapse_clade(sib)
                                    collapse_edge(sib.edge)
                        collapse_edge(p.edge)
                    # Side branches of the to_consume path are attached at
                    # the head of the rootmost to_modify edge and collapsed.
                    mid_node = to_mod_path[0].head_node
                    for edge in to_consume_path[1:]:
                        p = edge.tail_node
                        avoid = edge.head_node
                        for child in p.child_nodes():
                            _LOG.debug("child is %s" % (child.compose_newick()))
                            if child is not avoid:
                                mid_node.add_child(child)
                                collapse_clade(child)
                                if not child.is_leaf():
                                    collapse_edge(child.edge)
                                mid_node.edge.split_bitmask |= child.edge.split_bitmask
                else:
                    # Collapse the interior edges of the to_modify path
                    # (first and last edge are kept) and hang the side
                    # branches of the to_consume path on the head of the
                    # rootmost to_modify edge.
                    for edge in to_mod_path[1:-1]:
                        collapse_edge(edge)
                    mid_node = to_mod_path[0].head_node
                    for edge in to_consume_path[1:]:
                        p = edge.tail_node
                        avoid = edge.head_node
                        for child in p.child_nodes():
                            if child is not avoid:
                                mid_node.add_child(child)
                                mid_node.edge.split_bitmask |= child.edge.split_bitmask
            else:
                # we have to move the subtrees from to_consume to to_modify
                to_mod_edge = to_mod_path[0]
                to_mod_tail, to_mod_head = to_mod_edge.tail_node, to_mod_edge.head_node
                deepest_edge_to_move = to_consume_path[0]
                deepest_node_to_move = deepest_edge_to_move.head_node
                tipmost_edge_to_move = to_consume_path[-1]
                tipmost_node_to_move = tipmost_edge_to_move.tail_node
                prev_head = tipmost_edge_to_move.head_node

                # Splice the to_consume path into the single to_modify edge:
                # its rootmost node goes under to_mod_tail, and to_mod_head
                # takes the place of the consumed path's tipmost head node.
                to_mod_tail.add_child(deepest_node_to_move)
                to_mod_tail.remove_child(to_mod_head)
                tipmost_node_to_move.add_child(to_mod_head)
                tipmost_node_to_move.remove_child(prev_head)
    # Recompute all split bitmasks / split_edges of the merged tree.
    encode_splits(to_modify)
Beispiel #49
0
def process_sources_serial(support_filepaths, schema, is_rooted,
                           ignore_node_ages, calc_tree_probs, weighted_trees,
                           tree_offset, log_frequency, messenger):
    """
    Reads every tree found in `support_filepaths` one source at a time and
    returns a 2-tuple ``(split_distribution, topology_counter)`` summarizing
    them.

    If `support_filepaths` is None or empty, trees are read from standard
    input instead. Sources that are not already file objects are treated as
    paths and opened one at a time, so that only one input file is open at
    any moment. Each source is closed when it has been processed, including
    when reading it raises.

    Within each source, trees whose index is below `tree_offset` are
    skipped. All trees share a single TaxonSet created here. Progress is
    reported through ``messenger.send_info``; with `log_frequency` equal to
    1 every tree is reported, otherwise every `log_frequency`-th tree
    (never the first one) while `log_frequency` is positive.

    `calc_tree_probs` is accepted but not used by this function.
    """
    messenger.send_info("Running in serial mode.")
    taxon_set = dendropy.TaxonSet()
    split_distribution = treesplit.SplitDistribution(taxon_set=taxon_set)
    split_distribution.ignore_node_ages = ignore_node_ages
    split_distribution.is_rooted = is_rooted
    topology_counter = treesum.TopologyCounter()

    if not support_filepaths:
        messenger.send_info("Reading trees from standard input.")
        srcs = [sys.stdin]
    else:
        messenger.send_info("%d source(s) to be processed." %
                            len(support_filepaths))
        # store filepaths, to open individually in loop: we do not want to
        # have all files open at the same time
        srcs = support_filepaths

    for sidx, src in enumerate(srcs):

        # hack needed because we do not want to open all input files at the
        # same time; if not a file object, assume it is a file path and create
        # corresponding file object
        if not isinstance(src, file):
            src = open(src, "rU")

        # try/finally so the source is closed even when parsing fails
        # part-way through; previously the handle leaked in that case.
        try:
            name = getattr(src, "name", "<stdin>")
            messenger.send_info("Processing %d of %d: '%s'" %
                                (sidx + 1, len(srcs), name),
                                wrap=False)
            tree_iter = tree_source_iter(src,
                                         schema=schema,
                                         taxon_set=taxon_set,
                                         store_tree_weights=weighted_trees,
                                         as_rooted=is_rooted)
            for tidx, tree in enumerate(tree_iter):
                # the same reporting rule applies to counted and skipped trees
                should_log = (log_frequency == 1) or (
                    tidx > 0 and log_frequency > 0
                    and tidx % log_frequency == 0)
                if tidx >= tree_offset:
                    if should_log:
                        messenger.send_info(
                            "(processing) '%s': tree at offset %d" %
                            (name, tidx),
                            wrap=False)
                    treesplit.encode_splits(tree)
                    split_distribution.count_splits_on_tree(tree)
                    topology_counter.count(tree, tree_splits_encoded=True)
                elif should_log:
                    messenger.send_info(
                        "(processing) '%s': tree at offset %d (skipping)" %
                        (name, tidx),
                        wrap=False)
        finally:
            try:
                src.close()
            except ValueError:
                # "I/O operation on closed file" if we try to close sys.stdin
                pass

    messenger.send_info("Serial processing of %d source(s) completed." %
                        len(srcs))
    return split_distribution, topology_counter
def do_sim(birth_rate   , death_rate, num_leaves, rng=None):
    """
    Simulates a tree, simulates sequences on it, infers a tree from those
    sequences and reports which splits of the simulated tree were recovered.

    Steps, all carried out in a fresh temporary directory:
      1. a model tree is drawn with ``treesim.birth_death`` using
         ``birth_rate``, ``death_rate``, ``num_leaves`` (as ``ntax``) and
         ``rng``;
      2. the external program ``seq-gen`` is run on that tree;
      3. the external program ``paup`` is run on the seq-gen output and
         saves a tree to ``inferred.tre``;
      4. the inferred tree is compared with the model tree.

    Returns a sorted list of ``(depth, edge_length, recovered)`` tuples, one
    for every node of the model tree that has both children and a parent;
    ``recovered`` is 1 if the node's split is not among the model-tree
    splits reported missing from the inferred tree, 0 otherwise.

    Calls ``sys.exit`` if seq-gen or PAUP fails. The temporary files and
    directory are removed only when the whole run succeeds.
    """
    temp_dir = tempfile.mkdtemp()
    model_tree = treesim.birth_death(birth_rate=birth_rate,
                            death_rate=death_rate,
                            ntax=num_leaves,
                            rng=rng)
    ################################################################################
    # Calling seq-gen
    mtf = os.path.join(temp_dir, 'simtree')
    print "temp_dir =", temp_dir
    treefile_obj = open(mtf, 'w')
    treefile_obj.write("%s;\n" % str(model_tree))
    # CLOSING THE FILE IS IMPORTANT!  This flushes buffers, assuring that the data
    #  will be written to the filesystem before seq-gen is invoked.
    treefile_obj.close() 
    
    
    import subprocess
    command_line = ['seq-gen',
                    '-mHKY',
                    '-on',
                ]
    # Pick the random seed handed to seq-gen via its -z option.
    if os.environ.get('TREE_INF_TEST_RAND_NUMBER_SEED'):
        # NOTE(review): `seed` is not defined inside this function;
        # presumably a module-level global derived from the environment
        # variable -- confirm, otherwise this line raises NameError.
        sg_seed = seed
        
    else:
        if rng is None:
            sg_seed = random.randint(0,100000)
        else:
            sg_seed = rng.randint(0,100000)
    command_line.append('-z%d' % sg_seed)
    # The tree file is given by bare name; the process runs with
    # cwd=temp_dir, where the file was written.
    command_line.append('simtree')
    
    seq_gen_proc = subprocess.Popen(command_line,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    cwd=temp_dir)
    
    # Only stdout is kept; it is the simulated data set.
    dataset = seq_gen_proc.communicate()[0]
    
    
    # seq-gen does not exit with an error code when it fails.  I don't know why!!
    # Hence empty output is also treated as a failure.
    if seq_gen_proc.returncode != 0 or len(dataset) == 0:
        sys.exit('seq-gen failed!\n')
    sd = os.path.join(temp_dir, 'simdata.nex')
    d = open(sd, 'w')
    d.write(dataset)
    # CLOSING THE FILE IS IMPORTANT!  This flushes buffers, assuring that the data
    #  will be written to the filesystem before PAUP is invoked.
    d.close()
    
    ################################################################################
    # PAUP
    # Write a small command file that makes PAUP read the simulated data,
    # run a heuristic search and save the resulting tree.
    pcf = os.path.join(temp_dir, 'execute_paup.nex')
    pc = open(pcf, 'w')
    pc.write('''execute simdata.nex ; 
    hsearch nomultrees ; 
    savetree file=inferred.tre format = NEXUS;
    quit;
    ''')
    pc.close()
    paup_proc = subprocess.Popen(['paup', '-n', pcf], 
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 cwd=temp_dir)
    (o, e) = paup_proc.communicate()
    
    paup_output = os.path.join(temp_dir, 'inferred.tre')
    # PAUP is checked the same defensive way as seq-gen above: besides the
    # exit code, the expected output file must exist.  On failure PAUP's
    # stderr is used as the exit message.
    if paup_proc.returncode != 0 or not os.path.exists(paup_output):
        sys.exit(e)
    
    
    # read true tree with the inferred tree (because it is nexus)
    # The model tree's taxon_set is reused so both trees share taxa.
    inf_tree_list = TreeList.get_from_path(paup_output, 
                                           "NEXUS",
                                           taxon_set=model_tree.taxon_set)
    assert len(inf_tree_list) == 1
    inferred_tree = inf_tree_list[0]
    
    # determine which splits were missed
    treesplit.encode_splits(inferred_tree)
    treesplit.encode_splits(model_tree)
    missing = model_tree.find_missing_splits(inferred_tree)
    # sort the nodes of the true tree by depth and ask whether or not they were recovered
    node_depth_TF_list = []
    # Postorder guarantees children get their depth before their parent.
    for node in model_tree.postorder_node_iter():
        children = node.child_nodes()
        if children and node.parent_node:
            # Depth is taken along the first child only.
            # NOTE(review): this equals the depth along every child only
            # if the model tree is ultrametric -- confirm.
            first_child = children[0]
            node.depth = first_child.depth + first_child.edge.length
            if node.edge.split_bitmask in missing:
                recovered = 0
            else:
                recovered = 1
            node_depth_TF_list.append((node.depth, node.edge.length, recovered))
        else:
            # Leaves and the root get depth 0.0 and are not reported.
            node.depth = 0.0
    
    node_depth_TF_list.sort()
    
    # Clean up.  os.rmdir only succeeds if nothing else was left in the
    # directory.
    os.remove(pcf)
    os.remove(paup_output)
    os.remove(sd)
    os.remove(mtf)
    os.rmdir(temp_dir)
    
    return node_depth_TF_list
Beispiel #51
0
def process_sources_serial(
    support_filepaths,
    schema,
    is_rooted,
    ignore_node_ages,
    ultrametricity_precision,
    calc_tree_probs,
    weighted_trees,
    tree_offset,
    log_frequency,
    messenger,
):
    """
    Reads every tree found in `support_filepaths` one source at a time and
    returns a 2-tuple ``(split_distribution, topology_counter)`` summarizing
    them.

    If `support_filepaths` is None or empty, trees are read from standard
    input instead. Sources that are not already file objects are treated as
    paths and opened one at a time, so that only one input file is open at
    any moment. Each source is closed when it has been processed, including
    when reading it raises.

    `ignore_node_ages`, `is_rooted` and `ultrametricity_precision` are
    stored on the split distribution before any tree is counted. Within
    each source, trees whose index is below `tree_offset` are skipped. All
    trees share a single TaxonSet created here. Progress is reported
    through ``messenger.send_info``; with `log_frequency` equal to 1 every
    tree is reported, otherwise every `log_frequency`-th tree (never the
    first one) while `log_frequency` is positive.

    `calc_tree_probs` is accepted but not used by this function.
    """
    messenger.send_info("Running in serial mode.")
    taxon_set = dendropy.TaxonSet()
    split_distribution = treesplit.SplitDistribution(taxon_set=taxon_set)
    split_distribution.ignore_node_ages = ignore_node_ages
    split_distribution.is_rooted = is_rooted
    split_distribution.ultrametricity_precision = ultrametricity_precision
    topology_counter = treesum.TopologyCounter()

    if not support_filepaths:
        messenger.send_info("Reading trees from standard input.")
        srcs = [sys.stdin]
    else:
        messenger.send_info("%d source(s) to be processed." % len(support_filepaths))
        # store filepaths, to open individually in loop: we do not want to
        # have all files open at the same time
        srcs = support_filepaths

    for sidx, src in enumerate(srcs):

        # hack needed because we do not want to open all input files at the
        # same time; if not a file object, assume it is a file path and create
        # corresponding file object
        if not isinstance(src, file):
            src = open(src, "rU")

        # try/finally so the source is closed even when parsing fails
        # part-way through; previously the handle leaked in that case.
        try:
            name = getattr(src, "name", "<stdin>")
            messenger.send_info("Processing %d of %d: '%s'" % (sidx + 1, len(srcs), name), wrap=False)
            tree_iter = tree_source_iter(
                src, schema=schema, taxon_set=taxon_set, store_tree_weights=weighted_trees, as_rooted=is_rooted
            )
            for tidx, tree in enumerate(tree_iter):
                # the same reporting rule applies to counted and skipped trees
                should_log = (log_frequency == 1) or (
                    tidx > 0 and log_frequency > 0 and tidx % log_frequency == 0
                )
                if tidx >= tree_offset:
                    if should_log:
                        messenger.send_info("(processing) '%s': tree at offset %d" % (name, tidx), wrap=False)
                    treesplit.encode_splits(tree)
                    split_distribution.count_splits_on_tree(tree)
                    topology_counter.count(tree, tree_splits_encoded=True)
                elif should_log:
                    messenger.send_info("(processing) '%s': tree at offset %d (skipping)" % (name, tidx), wrap=False)
        finally:
            try:
                src.close()
            except ValueError:
                # "I/O operation on closed file" if we try to close sys.stdin
                pass

    messenger.send_info("Serial processing of %d source(s) completed." % len(srcs))
    return split_distribution, topology_counter
Beispiel #52
0
def get_length_diffs(tree1,
                     tree2,
                     edge_length_attr="length",
                     value_type=float,
                     split_length_diff_map=False):
    """
    Returns a list of tuples, with the first element of each tuple representing
    the length of the branch subtending a particular split on ``tree1``, and
    the second element the length of the same branch on ``tree2``. If a
    particular split is found on one tree but not in the other, a value of zero
    is used for the missing split.

    ``edge_length_attr`` names the edge attribute that is read as the length,
    and every length is passed through ``value_type`` before being stored.
    If ``split_length_diff_map`` is true, a 2-tuple is returned instead: the
    list described above plus a dict mapping each split to its tuple.

    Lengths of ``None`` are handled asymmetrically, as in the original code:
    on ``tree1`` they always count as 0; on ``tree2`` they count as 0 for
    splits missing from ``tree1`` and for an edge without a tail node, but
    raise for any other edge whose split is also on ``tree1``.

    Raises ``TypeError`` if the trees do not share the same TaxonSet object,
    and ``ValueError`` for the ``None`` length case described above.
    """
    if tree1.taxon_set is not tree2.taxon_set:
        raise TypeError("Trees have different TaxonSet objects: %s vs. %s" \
                % (hex(id(tree1.taxon_set)), hex(id(tree2.taxon_set))))
    if not hasattr(tree1, "split_edges"):
        treesplit.encode_splits(tree1)
    if not hasattr(tree2, "split_edges"):
        treesplit.encode_splits(tree2)

    length_diffs = []
    split_length_diffs = {}
    # Splits of tree2 not yet paired with tree1; matched entries are popped,
    # so what is left after the first loop exists on tree2 only.
    remaining2 = dict(tree2.split_edges)

    # ``items()`` rather than ``iteritems()``: the latter exists only on
    # Python 2 dicts, the former works on both major versions.
    for split, edge in tree1.split_edges.items():
        elen1 = getattr(edge, edge_length_attr)
        if elen1 is None:
            elen1 = 0
        value1 = value_type(elen1)
        # Only the pop is guarded, so a KeyError raised while reading the
        # edge or building the error message is no longer mistaken for
        # "split missing from tree2".
        try:
            e2 = remaining2.pop(split)
        except KeyError:
            elen2 = 0.0
        else:
            elen2 = getattr(e2, edge_length_attr)
            if elen2 is None:
                # allow root edge to have split with no value: raise error if not root edge
                if e2.tail_node is None:
                    elen2 = 0.0
                else:
                    raise ValueError(
                        "Edge length attribute is 'None': Tree: %s ('%s'), Split: %s"
                        % (tree2.oid, tree2.label,
                           tree2.taxon_set.split_as_newick_string(split)))
        pair = (value1, value_type(elen2))
        length_diffs.append(pair)
        split_length_diffs[split] = pair

    # Splits found only on tree2.  Every split shared with tree1 was popped
    # above, so tree1 never needs to be consulted here and its side of the
    # pair is always zero.
    for split, edge in remaining2.items():
        elen2 = getattr(edge, edge_length_attr)
        if elen2 is None:
            elen2 = 0
        value2 = value_type(elen2)
        pair = (value_type(0.0), value2)
        length_diffs.append(pair)
        split_length_diffs[split] = pair

    # Cost: one dict copy of tree2's splits, then one pop per tree1 split.
    # The second loop does no work when both trees carry the same splits and
    # visits every tree2 split when they share none.
    if split_length_diff_map:
        return length_diffs, split_length_diffs
    else:
        return length_diffs