Example #1
        def countSplits(self, tc, is_rooted):
            _LOG.info(tc[0] + "; " + tc[1])
            tree_filepaths = [pathmap.tree_source_path(tc[0])]
            taxa_filepath = pathmap.tree_source_path(tc[1])
            paup_sd = paup.get_split_distribution(tree_filepaths,
                                                  taxa_filepath,
                                                  is_rooted=is_rooted,
                                                  burnin=0)
            taxon_set = paup_sd.taxon_set
            dp_sd = treesplit.SplitDistribution(taxon_set=taxon_set)
            dp_sd.ignore_edge_lengths = True
            dp_sd.ignore_node_ages = True
            dp_sd.is_rooted = is_rooted

            _LOG.debug("Taxon set: %s" % [t.label for t in taxon_set])
            taxa_mask = taxon_set.all_taxa_bitmask()
            taxon_set.lock()
            for tree_filepath in tree_filepaths:
                for tree in dataio.tree_source_iter(
                        stream=open(tree_filepath, "rU"),
                        schema='nexus',
                        taxon_set=taxon_set,
                        as_rooted=is_rooted):
                    self.assertIs(tree.taxon_set, dp_sd.taxon_set)
                    self.assertIs(tree.taxon_set, taxon_set)
                    treesplit.encode_splits(tree)
                    dp_sd.count_splits_on_tree(tree)

            self.assertEqual(dp_sd.total_trees_counted,
                             paup_sd.total_trees_counted)

            # SplitDistribution counts trivial splits, whereas PAUP*
            # contree does not, so the following will not work
            #            assert len(dp_sd.splits) == len(paup_sd.splits),\
            #                 "dp = %d, sd = %d" % (len(dp_sd.splits), len(paup_sd.splits))

            taxa_mask = taxon_set.all_taxa_bitmask()
            for split in dp_sd.splits:
                if not treesplit.is_trivial_split(split, taxa_mask):
                    self.assertIn(split, paup_sd.splits)
                    self.assertEqual(dp_sd.split_counts[split],
                                     paup_sd.split_counts[split])
                    paup_sd.splits.remove(split)

            # if any splits remain, they were not
            # in dp_sd or were trivial
            remaining_splits = list(paup_sd.splits)
            for split in remaining_splits:
                if treesplit.is_trivial_split(split, taxa_mask):
                    paup_sd.splits.remove(split)
            self.assertEqual(len(paup_sd.splits), 0)
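
The same counting path can be driven without the PAUP* comparison. A minimal sketch, assuming the same legacy DendroPy modules (dendropy, dataio, treesplit) used above and a hypothetical NEXUS tree file "trees.nex":

    taxon_set = dendropy.TaxonSet()
    sd = treesplit.SplitDistribution(taxon_set=taxon_set)
    sd.is_rooted = False
    for tree in dataio.tree_source_iter(
            stream=open("trees.nex", "rU"),
            schema='nexus',
            taxon_set=taxon_set):
        treesplit.encode_splits(tree)    # splits must be encoded before counting
        sd.count_splits_on_tree(tree)
    print("%d trees, %d distinct splits" % (sd.total_trees_counted, len(sd.splits)))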
Example #2
 def setUp(self):
     self.taxon_set = dendropy.TaxonSet()
     self.support_trees = dendropy.TreeList.get_from_path(
         pathmap.tree_source_path("primates.beast.mcmc.trees"),
         "nexus",
         taxon_set=self.taxon_set,
         tree_offset=40)
     self.split_distribution = treesplit.SplitDistribution(
         taxon_set=self.taxon_set)
     self.split_distribution.is_rooted = True
     self.split_distribution.ignore_node_ages = False
     for tree in self.support_trees:
         tree.update_splits()
         self.split_distribution.count_splits_on_tree(tree)
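
A hedged sketch of a test body that could follow this fixture; it assumes only that split_counts behaves like a mapping and that count_splits_on_tree has incremented total_trees_counted, as in Example #1:

    def test_split_supports(self):
        sd = self.split_distribution
        n_trees = sd.total_trees_counted
        # proportion of post-burn-in trees in which each split appears
        for split, count in sd.split_counts.items():
            frequency = float(count) / n_trees
            self.assertTrue(0.0 < frequency <= 1.0)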
Example #3
 def build_split_distribution(bipartition_counts,
                              tree_count,
                              taxon_set,
                              is_rooted=False):
     """
     Returns a populated SplitDistribution object based on the given
     bipartition info, ``bipartition_counts``.
     ``bipartition_counts`` is a dictionary, where the keys are PAUP split
     info (e.g. '.*****.*.*.**************************************') and the
     values are the counts of the corresponding splits.
     """
     sd = treesplit.SplitDistribution(taxon_set=taxon_set)
     sd.is_rooted = is_rooted
     sd.total_trees_counted = tree_count
     for g in bipartition_counts:
         sd.add_split_count(paup_group_to_mask(g, normalized=not is_rooted),
                            bipartition_counts[g])
     return sd
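
A usage sketch under stated assumptions: the taxon labels, group strings, and counts below are invented for illustration; only build_split_distribution itself comes from the example above.

    taxon_set = dendropy.TaxonSet(["A", "B", "C", "D", "E"])
    bipartition_counts = {
        ".**..": 80,    # hypothetical split seen in 80 of 100 trees
        ".***.": 95,
    }
    sd = build_split_distribution(bipartition_counts,
                                  tree_count=100,
                                  taxon_set=taxon_set,
                                  is_rooted=False)
    print(sd.total_trees_counted)    # 100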
Example #4
 def count_splits_on_trees(self, tree_iterator, split_distribution=None, trees_splits_encoded=False):
     """
     Given an iterator over trees, a SplitDistribution object (a new one, or
     the one passed as an argument) is returned, collating the split data
     from the trees.
     """
     if split_distribution is None:
         split_distribution = treesplit.SplitDistribution()
     taxon_set = split_distribution.taxon_set
     for tree_idx, tree in enumerate(tree_iterator):
         self.total_trees_counted += 1
         if taxon_set is None:
             assert(split_distribution.taxon_set is None)
             split_distribution.taxon_set = tree.taxon_set
             taxon_set = tree.taxon_set
         else:
             assert(taxon_set is tree.taxon_set)
         if not trees_splits_encoded:
             treesplit.encode_splits(tree)
         split_distribution.count_splits_on_tree(tree)
     return split_distribution
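
The enclosing class is not shown here; the sketch below assumes "summarizer" is an instance of it with a total_trees_counted attribute already initialized, and uses a hypothetical tree file:

    # any tree iterator works; a TreeList as in Example #2 is one option
    trees = dendropy.TreeList.get_from_path("trees.nex", "nexus")
    sd = summarizer.count_splits_on_trees(iter(trees),
                                          trees_splits_encoded=False)
    print("%d trees counted" % sd.total_trees_counted)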
Example #5
 def __init__(self,
              work_queue,
              result_split_dist_queue,
              result_topology_hash_map_queue,
              schema,
              taxon_labels,
              is_rooted,
              ignore_node_ages,
              calc_tree_probs,
              weighted_trees,
              tree_offset,
              process_idx,
              messenger,
              messenger_lock,
              log_frequency=1000):
     multiprocessing.Process.__init__(self)
     self.work_queue = work_queue
     self.result_split_dist_queue = result_split_dist_queue
     self.result_topology_hash_map_queue = result_topology_hash_map_queue
     self.schema = schema
     self.taxon_labels = list(taxon_labels)
     self.taxon_set = dendropy.TaxonSet(self.taxon_labels)
     self.split_distribution = treesplit.SplitDistribution(
         taxon_set=self.taxon_set)
     self.split_distribution.is_rooted = is_rooted
     self.split_distribution.ignore_node_ages = ignore_node_ages
     self.is_rooted = is_rooted
     self.calc_tree_probs = calc_tree_probs
     self.topology_counter = treesum.TopologyCounter()
     self.weighted_trees = weighted_trees
     self.tree_offset = tree_offset
     self.process_idx = process_idx
     self.messenger = messenger
     self.messenger_lock = messenger_lock
     self.log_frequency = log_frequency
     self.kill_received = False
Example #6
def process_sources_serial(support_filepaths, schema, is_rooted,
                           ignore_node_ages, calc_tree_probs, weighted_trees,
                           tree_offset, log_frequency, messenger):
    """
    Returns a SplitDistribution object summarizing all trees found in
    `support_filepaths`.
    """
    messenger.send_info("Running in serial mode.")
    taxon_set = dendropy.TaxonSet()
    split_distribution = treesplit.SplitDistribution(taxon_set=taxon_set)
    split_distribution.ignore_node_ages = ignore_node_ages
    split_distribution.is_rooted = is_rooted
    topology_counter = treesum.TopologyCounter()

    if support_filepaths is None or len(support_filepaths) == 0:
        messenger.send_info("Reading trees from standard input.")
        srcs = [sys.stdin]
    else:
        messenger.send_info("%d source(s) to be processed." %
                            len(support_filepaths))

        # do not want to have all files open at the same time
        #srcs = [open(f, "rU") for f in support_filepaths]

        # store filepaths, to open individually in loop
        srcs = support_filepaths

    for sidx, src in enumerate(srcs):

        # hack needed because we do not want to open all input files at the
        # same time; if not a file object, assume it is a file path and create
        # corresponding file object
        if not isinstance(src, file):
            src = open(src, "rU")

        name = getattr(src, "name", "<stdin>")
        messenger.send_info("Processing %d of %d: '%s'" %
                            (sidx + 1, len(srcs), name),
                            wrap=False)
        for tidx, tree in enumerate(
                tree_source_iter(src,
                                 schema=schema,
                                 taxon_set=taxon_set,
                                 store_tree_weights=weighted_trees,
                                 as_rooted=is_rooted)):
            if tidx >= tree_offset:
                if (log_frequency == 1) or (tidx > 0 and log_frequency > 0
                                            and tidx % log_frequency == 0):
                    messenger.send_info(
                        "(processing) '%s': tree at offset %d" % (name, tidx),
                        wrap=False)
                treesplit.encode_splits(tree)
                split_distribution.count_splits_on_tree(tree)
                topology_counter.count(tree, tree_splits_encoded=True)
            else:
                if (log_frequency == 1) or (tidx > 0 and log_frequency > 0
                                            and tidx % log_frequency == 0):
                    messenger.send_info(
                        "(processing) '%s': tree at offset %d (skipping)" %
                        (name, tidx),
                        wrap=False)
        try:
            src.close()
        except ValueError:
            # "I/O operation on closed file" if we try to close sys.stdin
            pass

    messenger.send_info("Serial processing of %d source(s) completed." %
                        len(srcs))
    return split_distribution, topology_counter
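
A sketch of how this function might be invoked; the messenger class below is a minimal stand-in (only send_info is exercised here), and the file names and burn-in offset are hypothetical:

    class _StubMessenger(object):
        # minimal stand-in for the real messenger object
        def send_info(self, msg, wrap=True):
            print(msg)

    split_dist, topo_counter = process_sources_serial(
        support_filepaths=["run1.t", "run2.t"],
        schema="nexus",
        is_rooted=False,
        ignore_node_ages=True,
        calc_tree_probs=False,
        weighted_trees=False,
        tree_offset=200,          # burn-in: skip the first 200 trees per source
        log_frequency=100,
        messenger=_StubMessenger())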
Example #7
def process_sources_parallel(num_processes, support_filepaths, schema,
                             is_rooted, ignore_node_ages, calc_tree_probs,
                             weighted_trees, tree_offset, log_frequency,
                             messenger):
    """
    Returns a SplitDistribution object summarizing all trees found in
    `support_filepaths`.
    """

    # describe
    messenger.send_info(
        "Running in multiprocessing mode (up to %d processes)." %
        num_processes)
    messenger.send_info("%d sources to be processed." %
                        (len(support_filepaths)))

    # pre-discover taxa
    tdfpath = support_filepaths[0]
    messenger.send_info("Pre-loading taxa based on '%s' ..." % tdfpath)
    taxon_set = discover_taxa(tdfpath, schema)
    taxon_labels = [str(t) for t in taxon_set]
    messenger.send_info("Found %d taxa: [%s]" %
                        (len(taxon_labels),
                         (', '.join(["'%s'" % t for t in taxon_labels]))))

    # load up queue
    messenger.send_info("Creating work queue ...")
    work_queue = multiprocessing.Queue()
    for f in support_filepaths:
        work_queue.put(f)

    # launch processes
    messenger.send_info("Launching worker processes ...")
    result_split_dist_queue = multiprocessing.Queue()
    result_topology_hash_map_queue = multiprocessing.Queue()
    messenger_lock = multiprocessing.Lock()
    for idx in range(num_processes):
        sct = SplitCountingWorker(
            work_queue,
            result_split_dist_queue=result_split_dist_queue,
            result_topology_hash_map_queue=result_topology_hash_map_queue,
            schema=schema,
            taxon_labels=taxon_labels,
            is_rooted=is_rooted,
            ignore_node_ages=ignore_node_ages,
            calc_tree_probs=calc_tree_probs,
            weighted_trees=weighted_trees,
            tree_offset=tree_offset,
            process_idx=idx,
            messenger=messenger,
            messenger_lock=messenger_lock,
            log_frequency=log_frequency)
        sct.start()

    # collate results
    result_count = 0
    split_distribution = treesplit.SplitDistribution(taxon_set=taxon_set)
    split_distribution.is_rooted = is_rooted
    split_distribution.ignore_node_ages = ignore_node_ages
    topology_counter = treesum.TopologyCounter()
    while result_count < num_processes:
        result_split_dist = result_split_dist_queue.get()
        split_distribution.update(result_split_dist)
        result_topology_hash_map = result_topology_hash_map_queue.get()
        topology_counter.update_topology_hash_map(result_topology_hash_map)
        result_count += 1
    messenger.send_info("Recovered results from all worker processes.")
    return split_distribution, topology_counter
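
The collation loop above leans on SplitDistribution.update() to pool per-worker counts; a minimal sketch of that merge step in isolation, with hypothetical taxon labels and empty stand-in worker results:

    taxon_set = dendropy.TaxonSet(["A", "B", "C", "D"])
    worker_results = [treesplit.SplitDistribution(taxon_set=taxon_set)
                      for _ in range(2)]
    merged = treesplit.SplitDistribution(taxon_set=taxon_set)
    for worker_sd in worker_results:
        merged.update(worker_sd)   # pool split counts across workers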