def main():
    """
    Read the command line arguments and run the requested functionality.

    The argument object comes from process_arg() and the tree data from
    getInput(args.filename). args.functionality selects which tool is run;
    when it matches none of the known names, nothing further happens.
    :return: nothing.
    """
    args = process_arg()
    newick_data = getInput(args.filename)

    # Each entry is a thunk so that only the attributes needed by the
    # selected functionality are read from args.
    handlers = {
        "costscape": lambda: costscape.solve(
            newick_data, args.dl, args.dh, args.tl, args.th, args),
        "reconcile": lambda: DTLReconGraph.reconcile_noninter(
            newick_data, args.d, args.t, args.l),
        "histogram": lambda: HistogramMain.compute_pdv(
            args.filename, newick_data, args.d, args.t, args.l, args),
        "clumpr": lambda: ClusterMain.perform_clustering(
            newick_data, args.d, args.t, args.l, args.k, args),
    }

    run = handlers.get(args.functionality)
    if run is not None:
        run()
def test_topological_order(self):
    """
    Test topological_order by generating host and parasite trees of different sizes
    and going through each reconciliation in the reconciliation graph to see if the
    order generated by topological_order is a topological order.
    We skip the reconciliations that are temporally inconsistent.

    Stops as soon as self.num_examples_to_test orderings have been checked.
    """
    count = 0
    for tree_size in self.size_range:
        tree_size_folder = '%s/size%d' % (self.generated_dir_path, tree_size)
        for newick in os.listdir(tree_size_folder):
            # Enough examples were checked: leave the whole test instead of
            # only this folder, so the remaining folders are not listed.
            if count >= self.num_examples_to_test:
                return
            # Skip hidden files
            if newick.startswith('.'):
                continue
            recon_input = newickFormatReader.getInput(
                os.path.join(tree_size_folder, newick))
            host_tree = recon_input.host_tree
            parasite_tree = recon_input.parasite_tree
            # Try every D, T, L cost combination in {1, 2, 3, 4}
            for d, t, l in itertools.product(range(1, 5), repeat=3):
                recon_graph, _, _, best_roots = DTLReconGraph.DP(
                    recon_input, d, t, l)
                for reconciliation, _ in HistogramAlgTools.BF_enumerate_MPRs(
                        recon_graph, best_roots):
                    temporal_graph = recon_builder.build_temporal_graph(
                        host_tree, parasite_tree, reconciliation)
                    ordering_dict = recon_builder.topological_order(
                        temporal_graph)
                    # None signals a temporal inconsistency; only consistent
                    # reconciliations are checked and counted
                    if ordering_dict is not None:
                        self.check_topological_order(
                            temporal_graph, ordering_dict)
                        count += 1
def calc_histogram(tree_data, d, t, l, time_it, normalize=False, zero_loss=False):
    """
    Build the reconciliation graph for the given trees and compute its PDV.
    :param tree_data <ReconInput> - Output of newickFormatReader.getInput()
    :param d <float> - the cost of a duplication
    :param t <float> - the cost of a transfer
    :param l <float> - the cost of a loss
    :param time_it <bool> - measure how long the histogram algorithm takes
    :param normalize <bool> - rescale the histogram's x axis by the size
        of the gene tree
    :param zero_loss <bool> - ignore loss events
    :return histogram <Histogram> - the PDV for the given input
    :return elapsed <float> - seconds spent in the histogram algorithm,
        or None if time_it is False
    """
    # Reconcile first; the MPR count and the best roots are not needed here
    edge_species_tree, edge_gene_tree, recon_graph, _mpr_count, _best_roots = \
        DTLReconGraph.reconcile(tree_data, d, t, l)

    # Convert both trees into the format the histogram algorithm expects
    gene_tree, gene_tree_root, _ = Diameter.reformat_tree(edge_gene_tree, "pTop")
    species_tree, _, _ = Diameter.reformat_tree(edge_species_tree, "hTop")

    # Only the histogram algorithm itself is timed
    started_at = time.time() if time_it else None
    histogram = HistogramAlg.diameter_algorithm(
        species_tree, gene_tree, gene_tree_root,
        recon_graph, recon_graph, False, zero_loss)
    elapsed = (time.time() - started_at) if time_it else None

    if not normalize:
        return histogram, elapsed

    # Half the gene tree size, rounded up (the internal-node count according
    # to the original author's comment)
    gene_tree_nodes = int(math.ceil(len(gene_tree) / 2.0))
    return histogram.xscale(1.0 / (2 * gene_tree_nodes)), elapsed
def get_tree_info(newick, d,t,l):
    """
    Reconcile the trees and collect everything callers need afterwards.
    :param newick <ReconInput>: Output of newickFormatReader.getInput()
    :params d,t,l <float> - the relative DTL costs
    :return gene_tree <tree> - gene tree as returned by Diameter.reformat_tree
    :return species_tree <tree> - species tree as returned by Diameter.reformat_tree
    :return gene_root <node>
    :return dtl_recon_graph <recon_graph>
    :return mpr_count <int> - the number of MPRs for the recon graph
    :return best_roots [<mapping_node>] - the sources of the recon graph
    """
    # Build the reconciliation graph from the input trees
    species_edges, gene_edges, recon_graph, n_mprs, roots = \
        DTLReconGraph.reconcile(newick, d, t, l)

    # Reformat both trees; only the gene tree's root is kept, the node
    # counts and the species root are discarded
    gene_tree, gene_root, _ = Diameter.reformat_tree(gene_edges, "pTop")
    species_tree, _, _ = Diameter.reformat_tree(species_edges, "hTop")

    return gene_tree, species_tree, gene_root, recon_graph, n_mprs, roots
def main():
    """
    Run the command line interface for computing a median reconciliation.

    Expects exactly four positional arguments (file name, duplication cost,
    transfer cost, loss cost) plus the optional -r/--random and -c/--count
    flags. Prints the median reconciliation graph, followed (when requested)
    by the number of medians and by a random median, with a blank line
    between consecutive items. Prints the usage message instead when the
    number of positional arguments is wrong or a ValueError is raised while
    processing (e.g. a cost that cannot be parsed as a float).
    :return: nothing.
    """
    parser = optparse.OptionParser(usage=usage())
    parser.add_option('-r', '--random', dest='random',
                      help='Add a random median reconciliation from the full median'
                           ' reconciliation graph of the given file to the output',
                      action='store_true', default=False)
    # The second fragment needs its leading space, otherwise the help text
    # reads "...reconciliations tothe output"
    parser.add_option('-c', '--count', dest='count',
                      help='Add the number of median reconciliations to'
                           ' the output',
                      action='store_true', default=False)
    options, args = parser.parse_args()

    if len(args) != 4:
        print(usage())
        return

    # NOTE: the try block intentionally still spans the whole computation, so
    # any ValueError raised along the way results in the usage message, as
    # before.
    try:
        # These will be the outputs we eventually print
        output = []

        # Save arg values
        filename = args[0]
        dup = float(args[1])
        transfer = float(args[2])
        loss = float(args[3])

        # Get basic info just about the dtl recon graph
        species_tree, gene_tree, dtl_recon_graph, _mpr_count, best_roots = \
            DTLReconGraph.reconcile(filename, dup, transfer, loss)

        # Reformat the gene tree and the species tree; only the gene tree
        # root is needed besides the trees themselves
        postorder_gene_tree, gene_tree_root, _ = Diameter.reformat_tree(
            gene_tree, "pTop")
        postorder_species_tree, _, _ = Diameter.reformat_tree(
            species_tree, "hTop")

        # Compute the median reconciliation graph
        median_reconciliation, n_meds, roots_for_median = get_median_graph(
            dtl_recon_graph, postorder_gene_tree, postorder_species_tree,
            gene_tree_root, best_roots)

        # We'll always want to output the median
        output.append(median_reconciliation)

        # Check if the user wants the number of medians
        if options.count:
            output.append(n_meds)

        # Check if the user wants a random median
        if options.random:
            med_counts = get_med_counts(median_reconciliation, roots_for_median)
            # Pick a random, uniformly sampled single-path median from the
            # median reconciliation graph
            random_median = choose_random_median_wrapper(
                median_reconciliation, roots_for_median, med_counts)
            output.append(random_median)

        # Print everything that was requested, one blank line between items
        print('\n\n'.join(str(item) for item in output))
    except ValueError:
        print(usage())
def compute_median(dtl_recon_graph, event_scores, postorder_mapping_nodes, mpr_roots):
    """
    Build the median reconciliation graph of a DTL reconciliation graph.

    :param dtl_recon_graph: A dictionary representing a DTL Recon Graph
        (mapping node -> list of event nodes).
    :param event_scores: A dictionary with event nodes as keys and, as values,
        the frequency of each event in MPR space for the recon graph
    :param postorder_mapping_nodes: A list of the mapping nodes of the graph,
        ordered so that the children of every event come before the mapping
        node the event belongs to
    :param mpr_roots: A list of mapping nodes that could act as roots to an MPR
        for the species and gene trees in question, output from the
        findBestRoots function in DTLReconGraph.py
    :return: A dictionary of the same form as a DTL reconciliation graph that
        keeps, for every mapping node, only the events maximizing the summed
        (frequency - 0.5) score; the number of median reconciliations in that
        graph; and the list of roots of the median graph.
    """
    # For a symmetric median reconciliation every event contributes its
    # frequency minus 0.5 to the score.
    contemporaneous = ('C', (None, None), (None, None))

    # Per mapping node: the events achieving the best score below that node,
    # and that best score itself.
    best_events_at = dict()
    best_score_at = dict()

    for mapping_node in postorder_mapping_nodes:
        node_events = dtl_recon_graph[mapping_node]

        # A contemporaneous event is the only event of its mapping node and
        # has frequency 1, hence a score of 1 - 0.5 = 0.5
        if node_events == [contemporaneous]:
            best_events_at[mapping_node] = [contemporaneous]
            best_score_at[mapping_node] = 0.5
            continue

        # Score every event: events look like ('event ID', child 1, child 2)
        scored = []
        for candidate in node_events:
            below = best_score_at[candidate[1]]
            if candidate[0] != 'L':
                # T, S and D events have two children; losses only one
                below = below + best_score_at[candidate[2]]
            scored.append((candidate, below + event_scores[candidate] - 0.5))

        # Keep every event tied for the highest score
        top_score = max(score for _, score in scored)
        best_events_at[mapping_node] = [
            candidate for candidate, score in scored if score == top_score]
        best_score_at[mapping_node] = top_score

    # Among the candidate roots, keep those with the highest total score
    root_totals = [(root, best_score_at[root]) for root in mpr_roots]
    best_sum = max(total for _, total in root_totals)
    best_roots = [root for root, total in root_totals if total == best_sum]

    # build_dtl_recon_graph takes the roots, the viable events of each mapping
    # node, and an empty dictionary that it populates as the result
    med_recon_graph = DTLReconGraph.build_dtl_recon_graph(
        best_roots, best_events_at, {})

    # The median must be a subgraph of the DTL reconciliation graph
    assert check_subgraph(
        dtl_recon_graph,
        med_recon_graph), 'Median is not a subgraph of the recon graph!'

    # Count the medians on the finished median recon graph
    n_med_recons = DTLReconGraph.count_mprs_wrapper(best_roots, med_recon_graph)

    return med_recon_graph, n_med_recons, best_roots
if __name__ == '__main__':
    # Collect the sample tree files to process
    tree_dir = "newickSample"
    tree_paths = get_tree_paths(tree_dir, min_size)
    n_trees = len(tree_paths)

    # Stays 0 when there are no trees; otherwise enumerate() numbers them from 1
    tree_index = 0
    for tree_index, (tree_file, tree_size, tree_id) in enumerate(tree_paths, start=1):
        print(tree_file)
        progress = "Processing tree: %d / %d\r" % (tree_index, n_trees)
        stdout.write(progress)
        stdout.flush()

        # Try every combination of D, T, L costs in {1, 2, 3, 4}
        for D, T, L in itertools.product([1, 2, 3, 4], repeat=3):
            # Build the reconciliation graph for this tree file and these costs
            reconciliation = DTLReconGraph.reconcile(tree_file, D, T, L)
            edge_species_tree, edge_gene_tree, dtl_recon_graph, mpr_count, best_roots = \
                reconciliation

            # Sanity check: the reported MPR count must match a brute-force
            # enumeration of the MPRs
            assert (mpr_count == sum(
                1 for _ in HistogramAlgTools.BF_enumerate_MPRs(
                    dtl_recon_graph, best_roots)))

            # Reference histogram, computed by brute force
            brute_force_hist = HistogramAlgTools.BF_find_histogram(
                dtl_recon_graph, best_roots)

            # Reformat both trees for use with the histogram algorithm
            gene_tree, gene_tree_root, gene_node_count = \
                Diameter.reformat_tree(edge_gene_tree, "pTop")
            species_tree, species_tree_root, species_node_count = \
                Diameter.reformat_tree(edge_species_tree, "hTop")
'''
Demo of the reconciliation graph visualizer: build a reconciliation graph
from a sample newick file and save a picture of it.
'''

from empress.clumpr import DTLReconGraph  # for creating a reconciliation graph
from empress.clumpr import ReconciliationVisualization  # for visualization

# Input file, event costs and output image used by this demo
NEWICK_PATH = "./newickSample/size5/test-size5-no700.newick"
DUP_COST, TRANSFER_COST, LOSS_COST = 2, 4, 2
OUTPUT_IMAGE = './sampleVis700.png'

# There is currently no way of storing a reconciliation graph in a file,
# so it is rebuilt from the newick file on every run.
result = DTLReconGraph.reconcile(
    NEWICK_PATH, DUP_COST, TRANSFER_COST, LOSS_COST)

# The result is a five-tuple; only its third item, the reconciliation
# graph, is needed for the visualization.
host, paras, graph, num_recon, best_roots = result

# Visualize the graph and save the picture at OUTPUT_IMAGE
ReconciliationVisualization.visualizeAndSave(graph, OUTPUT_IMAGE)
def test_postorder(self):
    """
    Check the sequence produced by DTLReconGraph.postorder for self.tree
    when started at the ('Top', 'A') edge.
    """
    expected_edges = [
        ('B', 'D'),
        ('B', 'E'),
        ('A', 'B'),
        ('A', 'C'),
        ('Top', 'A'),
    ]
    # Materialize whatever postorder yields so it can be compared to a list
    actual_edges = list(DTLReconGraph.postorder(self.tree, ('Top', 'A')))
    self.assertEqual(actual_edges, expected_edges)
def count_mprs(g):
    """
    Count the MPRs of the reconciliation graph g.
    :param g: reconciliation graph, a dictionary keyed by mapping nodes
    :return: the result of DTLReconGraph.count_mprs_wrapper for the mapping
        nodes of g whose first component is gene_root (gene_root is taken
        from the surrounding scope)
    """
    # The roots are the mapping nodes that involve the gene root
    gene_root_nodes = [node for node in g if node[0] == gene_root]
    return DTLReconGraph.count_mprs_wrapper(gene_root_nodes, g)