def countSplits(self, tc, is_rooted):
    # Cross-validates DendroPy's split counting against PAUP*: both count
    # splits over the same tree file(s), and the resulting distributions are
    # then compared split-by-split.
    # `tc` is a (trees filename, taxa filename) pair, resolved relative to
    # the test-data tree source directory; `is_rooted` controls the rooting
    # interpretation on both sides.
    _LOG.info(tc[0] + "; " + tc[1])
    tree_filepaths = [pathmap.tree_source_path(tc[0])]
    taxa_filepath = pathmap.tree_source_path(tc[1])
    # Reference split distribution, computed via PAUP*.
    paup_sd = paup.get_split_distribution(
            tree_filepaths,
            taxa_filepath,
            is_rooted=is_rooted,
            burnin=0)
    taxon_set = paup_sd.taxon_set
    # Split distribution to be populated by DendroPy over the same trees.
    dp_sd = treesplit.SplitDistribution(taxon_set=taxon_set)
    dp_sd.ignore_edge_lengths = True
    dp_sd.ignore_node_ages = True
    dp_sd.is_rooted = is_rooted
    _LOG.debug("Taxon set: %s" % [t.label for t in taxon_set])
    taxa_mask = taxon_set.all_taxa_bitmask()
    # Lock the taxon set so that reading the trees cannot add new taxa.
    taxon_set.lock()
    for tree_filepath in tree_filepaths:
        for tree in dataio.tree_source_iter(
                stream=open(tree_filepath, "rU"),
                schema='nexus',
                taxon_set=taxon_set,
                as_rooted=is_rooted):
            # Every tree must share the single locked taxon set.
            self.assertIs(tree.taxon_set, dp_sd.taxon_set)
            self.assertIs(tree.taxon_set, taxon_set)
            treesplit.encode_splits(tree)
            dp_sd.count_splits_on_tree(tree)
    self.assertEqual(dp_sd.total_trees_counted, paup_sd.total_trees_counted)
    # SplitsDistribution counts trivial splits, whereas PAUP*
    # contree does not, so the following will not work
    # assert len(dp_sd.splits) == len(paup_sd.splits),\
    #         "dp = %d, sd = %d" % (len(dp_sd.splits), len(paup_sd.splits))
    taxa_mask = taxon_set.all_taxa_bitmask()
    # Every non-trivial split found by DendroPy must appear in the PAUP*
    # distribution with an identical count; matched splits are removed from
    # the PAUP* side as they are verified.
    for split in dp_sd.splits:
        if not treesplit.is_trivial_split(split, taxa_mask):
            self.assertIn(split, paup_sd.splits)
            self.assertEqual(dp_sd.split_counts[split], paup_sd.split_counts[split])
            paup_sd.splits.remove(split)
    # if any splits remain, they were not
    # in dp_sd or were trivial
    remaining_splits = list(paup_sd.splits)
    for split in remaining_splits:
        if treesplit.is_trivial_split(split, taxa_mask):
            paup_sd.splits.remove(split)
    # Anything left over is a split PAUP* found that DendroPy did not.
    self.assertEqual(len(paup_sd.splits), 0)
def setUp(self):
    """
    Load the post-burn-in primate BEAST MCMC trees and tally their splits
    into a rooted SplitDistribution that tracks node ages.
    """
    taxa = dendropy.TaxonSet()
    trees = dendropy.TreeList.get_from_path(
            pathmap.tree_source_path("primates.beast.mcmc.trees"),
            "nexus",
            taxon_set=taxa,
            tree_offset=40)
    dist = treesplit.SplitDistribution(taxon_set=taxa)
    dist.is_rooted = True
    dist.ignore_node_ages = False
    for support_tree in trees:
        support_tree.update_splits()
        dist.count_splits_on_tree(support_tree)
    self.taxon_set = taxa
    self.support_trees = trees
    self.split_distribution = dist
def build_split_distribution(bipartition_counts, tree_count, taxon_set, is_rooted=False):
    """
    Construct and return a SplitDistribution populated from PAUP*
    bipartition data.

    ``bipartition_counts`` maps PAUP* split-info strings (e.g.
    '.*****.*.*.**************************************') to the number of
    times the corresponding split was observed; ``tree_count`` is the total
    number of trees over which those counts were taken.
    """
    split_dist = treesplit.SplitDistribution(taxon_set=taxon_set)
    split_dist.is_rooted = is_rooted
    split_dist.total_trees_counted = tree_count
    # Unrooted splits are normalized when converted to bitmasks.
    normalize = not is_rooted
    for group, count in bipartition_counts.items():
        mask = paup_group_to_mask(group, normalized=normalize)
        split_dist.add_split_count(mask, count)
    return split_dist
def count_splits_on_trees(self, tree_iterator, split_distribution=None, trees_splits_encoded=False):
    """
    Count splits over all trees yielded by `tree_iterator`.

    If `split_distribution` is given, the split data is collated into it
    and it is returned; otherwise a new SplitDistribution object is
    created, populated, and returned. If `trees_splits_encoded` is True,
    the trees are assumed to already have their splits encoded and
    re-encoding is skipped. Also increments `self.total_trees_counted`
    once per tree seen.
    """
    if split_distribution is None:
        split_distribution = treesplit.SplitDistribution()
    taxon_set = split_distribution.taxon_set
    # (fixed: dropped a needless `enumerate` whose index was never used)
    for tree in tree_iterator:
        self.total_trees_counted += 1
        if taxon_set is None:
            # Adopt the taxon set of the first tree seen.
            assert(split_distribution.taxon_set is None)
            split_distribution.taxon_set = tree.taxon_set
            taxon_set = tree.taxon_set
        else:
            # All subsequent trees must share the same taxon set object.
            assert(taxon_set is tree.taxon_set)
        if not trees_splits_encoded:
            treesplit.encode_splits(tree)
        split_distribution.count_splits_on_tree(tree)
    return split_distribution
def __init__(self,
        work_queue,
        result_split_dist_queue,
        result_topology_hash_map_queue,
        schema,
        taxon_labels,
        is_rooted,
        ignore_node_ages,
        calc_tree_probs,
        weighted_trees,
        tree_offset,
        process_idx,
        messenger,
        messenger_lock,
        log_frequency=1000):
    """
    Set up a split-counting worker process.

    `work_queue` supplies work items; `result_split_dist_queue` and
    `result_topology_hash_map_queue` carry results back to the parent
    process. The worker builds its own private TaxonSet from
    `taxon_labels` so that split bitmasks agree across processes.
    """
    multiprocessing.Process.__init__(self)
    # Inter-process communication channels.
    self.work_queue = work_queue
    self.result_split_dist_queue = result_split_dist_queue
    self.result_topology_hash_map_queue = result_topology_hash_map_queue
    self.schema = schema
    # Own copy of the labels, and a process-private taxon set built from them.
    self.taxon_labels = list(taxon_labels)
    self.taxon_set = dendropy.TaxonSet(self.taxon_labels)
    # Per-worker split distribution, merged by the parent afterwards.
    self.split_distribution = treesplit.SplitDistribution(taxon_set=self.taxon_set)
    self.split_distribution.is_rooted = is_rooted
    self.split_distribution.ignore_node_ages = ignore_node_ages
    self.is_rooted = is_rooted
    self.calc_tree_probs = calc_tree_probs
    self.topology_counter = treesum.TopologyCounter()
    self.weighted_trees = weighted_trees
    # Number of leading trees per source to skip (burn-in).
    self.tree_offset = tree_offset
    self.process_idx = process_idx
    # Messenger is shared across workers; the lock serializes its use.
    self.messenger = messenger
    self.messenger_lock = messenger_lock
    self.log_frequency = log_frequency
    # Cooperative-shutdown flag; presumably polled in run() — not visible here.
    self.kill_received = False
def process_sources_serial(support_filepaths,
        schema,
        is_rooted,
        ignore_node_ages,
        calc_tree_probs,
        weighted_trees,
        tree_offset,
        log_frequency,
        messenger):
    """
    Processes all tree sources one after another in the current process,
    tallying splits and topologies as it goes.

    Returns a (SplitDistribution, TopologyCounter) tuple summarizing all
    trees found in `support_filepaths`. If `support_filepaths` is None or
    empty, trees are read from standard input instead. The first
    `tree_offset` trees of each source are treated as burn-in and skipped.

    NOTE(review): `calc_tree_probs` is accepted but not referenced in this
    function's body.
    """
    messenger.send_info("Running in serial mode.")
    taxon_set = dendropy.TaxonSet()
    split_distribution = treesplit.SplitDistribution(taxon_set=taxon_set)
    split_distribution.ignore_node_ages = ignore_node_ages
    split_distribution.is_rooted = is_rooted
    topology_counter = treesum.TopologyCounter()
    if support_filepaths is None or len(support_filepaths) == 0:
        messenger.send_info("Reading trees from standard input.")
        srcs = [sys.stdin]
    else:
        messenger.send_info("%d source(s) to be processed." % len(support_filepaths))
        # do not want to have all files open at the same time
        #srcs = [open(f, "rU") for f in support_filepaths]
        # store filepaths, to open individually in loop
        srcs = support_filepaths
    for sidx, src in enumerate(srcs):
        # hack needed because we do not want to open all input files at the
        # same time; if not a file object, assume it is a file path and create
        # corresponding file object
        # NOTE(review): the `file` built-in is Python 2 only.
        if not isinstance(src, file):
            src = open(src, "rU")
        name = getattr(src, "name", "<stdin>")
        messenger.send_info("Processing %d of %d: '%s'" % (sidx + 1, len(srcs), name), wrap=False)
        for tidx, tree in enumerate(tree_source_iter(src,
                schema=schema,
                taxon_set=taxon_set,
                store_tree_weights=weighted_trees,
                as_rooted=is_rooted)):
            if tidx >= tree_offset:
                # Log progress every `log_frequency` trees (or every tree
                # when log_frequency == 1).
                if (log_frequency == 1) or (tidx > 0 and log_frequency > 0 and tidx % log_frequency == 0):
                    messenger.send_info("(processing) '%s': tree at offset %d" % (name, tidx), wrap=False)
                treesplit.encode_splits(tree)
                split_distribution.count_splits_on_tree(tree)
                topology_counter.count(tree, tree_splits_encoded=True)
            else:
                # Still inside the burn-in window: skip, but keep logging.
                if (log_frequency == 1) or (tidx > 0 and log_frequency > 0 and tidx % log_frequency == 0):
                    messenger.send_info("(processing) '%s': tree at offset %d (skipping)" % (name, tidx), wrap=False)
        try:
            src.close()
        except ValueError:
            # "I/O operation on closed file" if we try to close sys.stdin
            pass
    messenger.send_info("Serial processing of %d source(s) completed." % len(srcs))
    return split_distribution, topology_counter
def process_sources_parallel(num_processes,
        support_filepaths,
        schema,
        is_rooted,
        ignore_node_ages,
        calc_tree_probs,
        weighted_trees,
        tree_offset,
        log_frequency,
        messenger):
    """
    Fans the tree sources in `support_filepaths` out over up to
    `num_processes` worker processes and collates their results.

    Returns a (SplitDistribution, TopologyCounter) tuple summarizing all
    trees found in `support_filepaths`.
    """
    # describe
    messenger.send_info("Running in multiprocessing mode (up to %d processes)." % num_processes)
    messenger.send_info("%d sources to be processed." % (len(support_filepaths)))
    # pre-discover taxa
    # All workers must agree on the taxon set, so taxa are discovered up
    # front from the first source and handed to each worker as labels.
    tdfpath = support_filepaths[0]
    messenger.send_info("Pre-loading taxa based on '%s' ..." % tdfpath)
    taxon_set = discover_taxa(tdfpath, schema)
    taxon_labels = [str(t) for t in taxon_set]
    messenger.send_info("Found %d taxa: [%s]" % (len(taxon_labels),
            (', '.join(["'%s'" % t for t in taxon_labels]))))
    # load up queue
    messenger.send_info("Creating work queue ...")
    work_queue = multiprocessing.Queue()
    for f in support_filepaths:
        work_queue.put(f)
    # launch processes
    messenger.send_info("Launching worker processes ...")
    result_split_dist_queue = multiprocessing.Queue()
    result_topology_hash_map_queue = multiprocessing.Queue()
    # Single lock shared by all workers to serialize messenger output.
    messenger_lock = multiprocessing.Lock()
    for idx in range(num_processes):
        sct = SplitCountingWorker(work_queue,
                result_split_dist_queue=result_split_dist_queue,
                result_topology_hash_map_queue=result_topology_hash_map_queue,
                schema=schema,
                taxon_labels=taxon_labels,
                is_rooted=is_rooted,
                ignore_node_ages=ignore_node_ages,
                calc_tree_probs=calc_tree_probs,
                weighted_trees=weighted_trees,
                tree_offset=tree_offset,
                process_idx=idx,
                messenger=messenger,
                messenger_lock=messenger_lock,
                log_frequency=log_frequency)
        sct.start()
    # collate results
    # Each worker presumably posts exactly one result on each queue, so we
    # block until `num_processes` pairs have been merged in.
    result_count = 0
    split_distribution = treesplit.SplitDistribution(taxon_set=taxon_set)
    split_distribution.is_rooted = is_rooted
    split_distribution.ignore_node_ages = ignore_node_ages
    topology_counter = treesum.TopologyCounter()
    while result_count < num_processes:
        result_split_dist = result_split_dist_queue.get()
        split_distribution.update(result_split_dist)
        result_topology_hash_map = result_topology_hash_map_queue.get()
        topology_counter.update_topology_hash_map(result_topology_hash_map)
        result_count += 1
    messenger.send_info("Recovered results from all worker processes.")
    return split_distribution, topology_counter