def BF_enter_hist(recongraph, uA, uB):
    '''
    Brute-force the histogram of pairwise differences between the partial MPRs
    enumerated for uA and those enumerated for uB.

    When uA == uB, the partial MPRs of that one mapping node are compared with
    each other: every unordered pair is tallied exactly once, and each tree is
    also paired with itself. Otherwise every (tree of uA, tree of uB)
    combination is tallied.

    :param recongraph - the reconciliation graph, handed to BF_enumerate_partial_MPRs
    :param uA - the first mapping node
    :param uB - the second mapping node
    :return <Histogram> - built from a dict mapping each value returned by
                          recon_trees_diff to the number of pairs producing it
    '''
    uA_recon_trees = list(BF_enumerate_partial_MPRs(recongraph, uA))
    if uA == uB:
        # Pair tree i with trees 0..i so that (i, j) and (j, i) are not both counted.
        pairs = (
            (recon_tree_A, recon_tree_B)
            for index, recon_tree_A in enumerate(uA_recon_trees)
            for recon_tree_B in uA_recon_trees[:index + 1]
        )
    else:
        uB_recon_trees = list(BF_enumerate_partial_MPRs(recongraph, uB))
        pairs = (
            (recon_tree_A, recon_tree_B)
            for recon_tree_A in uA_recon_trees
            for recon_tree_B in uB_recon_trees
        )

    # A single tally loop serves both cases.
    hist_dict = {}
    for recon_tree_A, recon_tree_B in pairs:
        diff_count = recon_trees_diff(recon_tree_A, recon_tree_B)
        hist_dict[diff_count] = hist_dict.get(diff_count, 0) + 1
    return Histogram(hist_dict)
def test_combine2(self):
    """Adding two populated histograms sums the counts per key and leaves both operands unchanged."""
    left = Histogram({0: 1, 2: 1, 3: 2})
    right = Histogram({2: 1, 3: 1, 10: 1})

    combined = left + right

    # The operands must not be modified by the addition.
    self.assertEqual(left.histogram_dict, {0: 1, 2: 1, 3: 2})
    self.assertEqual(right.histogram_dict, {2: 1, 3: 1, 10: 1})
    # Keys present in both operands have their counts added together.
    self.assertEqual(combined.histogram_dict, {0: 1, 2: 2, 3: 3, 10: 1})
def test_combine1(self):
    """Adding a histogram built from None to one built from 0 gives back the {0: 1} histogram."""
    single = Histogram(0)
    empty = Histogram(None)

    combined = single + empty

    # Histogram(0) holds one entry at key 0; Histogram(None) holds nothing.
    self.assertEqual(single.histogram_dict, {0: 1})
    self.assertEqual(empty.histogram_dict, {})
    self.assertEqual(combined.histogram_dict, {0: 1})
def test_product2(self):
    """product_combine of two populated histograms with n_choices = 0 leaves the operands unchanged."""
    # TODO: update test to reflect new product function
    left = Histogram({0: 1, 2: 1, 3: 2})
    right = Histogram({2: 1, 3: 1, 5: 1})
    n_choices = 0

    combined = left.product_combine(right, n_choices)

    # The operands must not be modified by the product.
    self.assertEqual(left.histogram_dict, {0: 1, 2: 1, 3: 2})
    self.assertEqual(right.histogram_dict, {2: 1, 3: 1, 5: 1})
    self.assertEqual(combined.histogram_dict,
                     {2: 1, 3: 1, 4: 1, 5: 4, 6: 2, 7: 1, 8: 2})
def test_product1(self):
    """product_combine with a histogram built from None yields an empty histogram."""
    # TODO: update test to reflect new product function
    single = Histogram(0)
    empty = Histogram(None)
    n_choices = 0

    combined = single.product_combine(empty, n_choices)

    self.assertEqual(single.histogram_dict, {0: 1})
    self.assertEqual(empty.histogram_dict, {})
    self.assertEqual(combined.histogram_dict, {})
def calculate_incomparable_enter_hist(zero_loss, enter_table, u, uA,
                                      uA_loss_events, uB, uB_loss_events,
                                      hist_both_exit):
    """
    Returns the enter table entry for [uA][uB] for the case where uA and uB lie
    on different parts of the species tree (neither is an ancestor of the other).

    The entry is the double-exit histogram plus one shifted entry per loss event
    on either node, minus the entries in which both nodes take a loss (those are
    included by both of the single-loss terms).

    :param zero_loss <bool>             - whether losses should not count
    :param enter_table <dict>           - the DP table we are computing part of
    :param u <str>                      - the gene node whose group we are in
    :param uA <str>                     - the first mapping node to compare
    :param uA_loss_events <list>        - the loss events on uA
    :param uB <str>                     - the second mapping node to compare
    :param uB_loss_events <list>        - the loss events on uB
    :param hist_both_exit <Histogram>   - the previously calculated double-exit histogram for uA and uB
    :return <Histogram>                 - the enter table entry for [uA][uB]
    """
    # Terms to add: the double exit, then every single loss on uA, then every single loss on uB.
    included = [hist_both_exit]
    included.extend(
        enter_table[u][(u, loss[1][1])][uB] << cost(loss, zero_loss)
        for loss in uA_loss_events)
    included.extend(
        enter_table[u][uA][(u, loss[1][1])] << cost(loss, zero_loss)
        for loss in uB_loss_events)

    # enter[u][(u, a_child)][(u, b_child)] is contained in both
    # enter[u][(u, a_child)][uB] and enter[u][uA][(u, b_child)], so it has been
    # added twice above; collect those entries so they can be subtracted once.
    double_counted = []
    for loss_a, loss_b in product(uA_loss_events, uB_loss_events):
        shift = cost(loss_a, zero_loss) + cost(loss_b, zero_loss)
        overlap = enter_table[u][(u, loss_a[1][1])][(u, loss_b[1][1])]
        double_counted.append(overlap << shift)

    return Histogram.sum(included) - Histogram.sum(double_counted)
def calculate_equal_enter_hist(zero_loss, enter_table, u, uA, uA_loss_events,
                               uB, uB_loss_events, hist_both_exit,
                               exit_table_a, exit_table_b):
    """
    Returns the enter table entry for [uA][uB] with the assumption that uA equals
    uB (but they might have different loss events leading from them!)

    :param zero_loss <bool>             - whether losses should not count
    :param enter_table <dict>           - the DP table we are computing part of
    :param u <str>                      - the gene node whose group we are in
    :param uA <str>                     - the first mapping node to compare
    :param uA_loss_events <list>        - a list of the loss events on that mapping node
    :param uB <str>                     - the second mapping node to compare
    :param uB_loss_events <list>        - a list of the loss events on that mapping node
    :param hist_both_exit <Histogram>   - the histogram of the double-exit that was previously
                                          calculated for uA and uB
    :param exit_table_a <dict>          - the a exit table (not read by this function; kept for a
                                          signature parallel to calculate_ancestral_enter_hist)
    :param exit_table_b <dict>          - the b exit table, which contains information about the
                                          single exit events for the mapping nodes' children
    :return <Histogram>                 - the enter table entry for [uA][uB]
    :raises AssertionError              - if uA != uB
    """
    # If uA does not equal uB, then something's gone horribly wrong.
    assert uA == uB, "calculate_equal_enter_hist called on values of uA and uB that are not equal"

    # Collect every histogram contributing to this pair of mapping nodes; they are summed at the end.
    hists = [hist_both_exit]

    for a_event in uA_loss_events:
        a_child = a_event[1][1]
        for b_event in uB_loss_events:
            b_child = b_event[1][1]
            # Only the first ordering matters ((a, b) and (b, a) will both appear, but we should
            # not treat them as distinct).
            if a_child < b_child:
                # Two different losses were taken, so shift by the cost of both. This used to be
                # a hard-coded 2, which ignored zero_loss; every other loss shift in this module
                # goes through cost(). Presumably cost() is 1 per loss when zero_loss is off, in
                # which case the default behaviour is unchanged - verify against cost().
                loss_cost = cost(a_event, zero_loss) + cost(b_event, zero_loss)
                hists.append(enter_table[u][(u, a_child)][(u, b_child)] << loss_cost)
            # If they are the same, then the same loss was used so there is no shift.
            elif a_child == b_child:
                hists.append(enter_table[u][(u, a_child)][(u, b_child)])

    # Only need to iterate through uA_loss_events since if they are equal then
    # uB_loss_events = uA_loss_events.
    for event in uA_loss_events:
        a_child = event[1][1]
        hists.append(exit_table_b[u][uB][(u, a_child)] << cost(event, zero_loss))

    return Histogram.sum(hists)
def BF_find_histogram(recongraph, roots):
    '''
    Given a reconciliation graph, find the histogram of the graph by enumerating
    all of its reconciliation trees.

    Every unordered pair of enumerated trees is tallied exactly once, and each
    tree is also paired with itself.

    :param recongraph - the reconciliation graph, handed to BF_enumerate_MPRs
    :param roots - the roots, handed to BF_enumerate_MPRs
    :return <Histogram> - built from a dict mapping each value returned by
                          recon_trees_diff to the number of pairs producing it
    '''
    # BF_enumerate_MPRs yields (tree, root) pairs; only the tree is needed here.
    recon_trees = [
        recon_tree for recon_tree, _ in BF_enumerate_MPRs(recongraph, roots)
    ]

    hist_dict = {}
    for index, recon_tree_A in enumerate(recon_trees):
        # Pair tree i with trees 0..i so that (i, j) and (j, i) are not both counted.
        for recon_tree_B in recon_trees[:index + 1]:
            diff_count = recon_trees_diff(recon_tree_A, recon_tree_B)
            hist_dict[diff_count] = hist_dict.get(diff_count, 0) + 1
    return Histogram(hist_dict)
def diameter_algorithm(species_tree, gene_tree, gene_tree_root,
                       dtl_recon_graph_a, dtl_recon_graph_b, debug, zero_loss,
                       verify=False):
    """
    Fills in the enter table for every gene node and every pair of mapping nodes
    (one from each reconciliation graph), then returns the sum of the entries in
    the gene tree root's group.

    While you can get the standard behaviour by making dtl_recon_graph_a equal
    dtl_recon_graph_b, arbitrary restrictions may be placed on which nodes are
    selected by choosing different graphs, for example by limiting one of the
    graphs to a single reconciliation tree.

    :param species_tree <dict>      - the species tree (in vertex form)
    :param gene_tree <dict>         - the gene tree (in vertex form)
    :param gene_tree_root <str>     - the root of the gene tree
    :param dtl_recon_graph_a <dict> - one of the two DTL reconciliation graphs
    :param dtl_recon_graph_b <dict> - the other reconciliation graph. Both must share the same
                                      species and gene trees.
    :param debug <bool>             - whether or not to print out pretty tables
    :param zero_loss <bool>         - whether losses should count at all
    :param verify <bool>            - whether to verify each enter table entry with BFVerifier
                                      (built from dtl_recon_graph_a only)
    :return <Histogram>             - the sum of enter_table[gene_tree_root][uA][uB] over the pairs
                                      that are not skipped by the uB > uA check
    :raises ValueError              - if the ancestral table holds a value other than
                                      'in', 'eq', 'des' or 'an'
    """
    # Use debugging
    # assert(dtl_recon_graph_a == dtl_recon_graph_b)
    if verify:
        verifier = BFVerifier(dtl_recon_graph_a)

    # NOTE(review): the names assume both dicts iterate in postorder - confirm with the callers.
    postorder_gene_nodes = list(gene_tree.keys())
    postorder_species_nodes = list(species_tree.keys())
    postorder_group_a = make_group_dict(gene_tree, dtl_recon_graph_a,
                                        postorder_species_nodes)
    postorder_group_b = make_group_dict(gene_tree, dtl_recon_graph_b,
                                        postorder_species_nodes)

    ancestral_table = calculate_ancestral_table(species_tree)
    if debug:
        print_table_nicely(ancestral_table, ", ", "Ancestral", "literal")

    exit_table_a = {}
    exit_table_b = {}
    enter_table = {}

    for u in postorder_gene_nodes:
        enter_table[u] = {}
        exit_table_a[u] = {}
        exit_table_b[u] = {}

        # Loop over every pair of mapping nodes in group(u)
        for uA in postorder_group_a[u]:
            enter_table[u][uA] = {}
            for uB in postorder_group_b[u]:
                hist_both_exit = calculate_hist_both_exit(
                    zero_loss, enter_table, u, gene_tree, uA,
                    dtl_recon_graph_a, uB, dtl_recon_graph_b)

                # Look up ancestry string in the precomputed table (indexed by the species nodes
                # of the mapping nodes)
                ancestry = ancestral_table[uA[1]][uB[1]]

                uA_loss_events = [
                    event for event in dtl_recon_graph_a[uA]
                    if isinstance(event, tuple) and event[0] == 'L'
                ]
                uB_loss_events = [
                    event for event in dtl_recon_graph_b[uB]
                    if isinstance(event, tuple) and event[0] == 'L'
                ]

                # To compute the proper single exit entry, we must know how the two nodes relate
                # to each other.
                if ancestry == 'in':
                    hist = calculate_incomparable_enter_hist(
                        zero_loss, enter_table, u, uA, uA_loss_events, uB,
                        uB_loss_events, hist_both_exit)
                elif ancestry == 'eq':
                    hist = calculate_equal_enter_hist(
                        zero_loss, enter_table, u, uA, uA_loss_events, uB,
                        uB_loss_events, hist_both_exit, exit_table_a,
                        exit_table_b)
                # The only difference between the 'des' and 'an' cases is whether the nodes
                # should be swapped
                elif ancestry == 'des':
                    hist = calculate_ancestral_enter_hist(
                        zero_loss, True, enter_table, u, uA, uA_loss_events,
                        uB, uB_loss_events, hist_both_exit, exit_table_a,
                        exit_table_b)
                elif ancestry == 'an':
                    hist = calculate_ancestral_enter_hist(
                        zero_loss, False, enter_table, u, uA, uA_loss_events,
                        uB, uB_loss_events, hist_both_exit, exit_table_a,
                        exit_table_b)
                else:
                    raise ValueError(
                        "Invalid ancestry type '{0}', check calculate_ancestral_table()."
                        .format(ancestry))

                if verify:
                    verifier.verify_enter(uA, uB, hist)
                enter_table[u][uA][uB] = hist

                if debug:
                    # The original literal contained "\H", an invalid escape sequence; a tab
                    # separator (as used before {3}) is what was evidently intended.
                    print("{0} -{1}-> {2}, Double-equal\t{3}\tHist:{4}".format(
                        uA, ancestry, uB, hist_both_exit, hist))

        if debug:
            print_table_nicely(enter_table[u], ", ",
                               "EnterTable({0})".format(u))

    if debug:
        print("Exit Table A: {0}".format(exit_table_a))
        print("")
        print("Exit Table B: {0}".format(exit_table_b))

    # The result is the sum of the root group's enter table entries. Pairs with uB > uA are
    # skipped so that only one ordering of each pair is added.
    result = Histogram(None)
    for uA in enter_table[gene_tree_root]:
        for uB in enter_table[gene_tree_root][uA]:
            if uB > uA:
                continue
            entry = enter_table[gene_tree_root][uA][uB]
            result = result + entry
    return result
def calculate_ancestral_enter_hist(zero_loss, is_swapped, enter_table, u, uA,
                                   uA_loss_events, uB, uB_loss_events,
                                   hist_both_exit, exit_table_a, exit_table_b):
    """
    Returns the enter table entry for [uA][uB] when one mapping node is an
    ancestor of the other: uA over uB if is_swapped is false, uB over uA if it
    is true. As a side effect, the single exit entry of the pair is stored in
    the ancestor's exit table (exit_table_a[u][uA][uB] or exit_table_b[u][uB][uA]).

    :param zero_loss <bool>             - whether losses should not count
    :param is_swapped <bool>            - whether B is an ancestor of A (instead of A over B)
    :param enter_table <dict>           - the DP table we are computing part of
    :param u <str>                      - the gene node whose group we are in
    :param uA <str>                     - the first mapping node to compare
    :param uA_loss_events <list>        - the loss events on uA
    :param uB <str>                     - the second mapping node to compare
    :param uB_loss_events <list>        - the loss events on uB
    :param hist_both_exit <Histogram>   - the previously calculated double-exit histogram
    :param exit_table_a <dict>          - the a exit table; read and written when uA is the ancestor
    :param exit_table_b <dict>          - the b exit table; read and written when uB is the ancestor
    :return <Histogram>                 - the enter table entry for [uA][uB]
    """
    # The arguments cannot simply be swapped for the second case, because
    # enter_table must always be indexed as [uA-side][uB-side].
    exit_terms = [hist_both_exit]

    if is_swapped:
        # uB is the ancestor, uA the descendant.
        # Single exit: the double exit plus every loss the descendant (uA) can take.
        for loss in uA_loss_events:
            lost_node = (u, loss[1][1])
            exit_terms.append(
                exit_table_b[u][uB][lost_node] << cost(loss, zero_loss))

        # Create the ancestor's row in the exit table on first use, then store the entry.
        ancestor_row = exit_table_b[u].setdefault(uB, {})
        single_exit = Histogram.sum(exit_terms)
        ancestor_row[uA] = single_exit

        # Enter: the single exit plus every loss the ancestor (uB) can take.
        enter_terms = [single_exit]
        for loss in uB_loss_events:
            lost_node = (u, loss[1][1])
            reaches_descendant = uA == lost_node
            through_loss = enter_table[u][uA][lost_node]
            if reaches_descendant:
                # The loss leads straight to the other node; see the note in the branch below.
                through_loss = through_loss.double_nonzero_entry()
            enter_terms.append(through_loss << cost(loss, zero_loss))
        return Histogram.sum(enter_terms)

    # uA is the ancestor, uB the descendant.
    # Single exit: the double exit plus every loss the descendant (uB) can take.
    for loss in uB_loss_events:
        lost_node = (u, loss[1][1])
        exit_terms.append(
            exit_table_a[u][uA][lost_node] << cost(loss, zero_loss))

    # Create the ancestor's row in the exit table on first use, then store the entry.
    ancestor_row = exit_table_a[u].setdefault(uA, {})
    single_exit = Histogram.sum(exit_terms)
    ancestor_row[uB] = single_exit

    # Enter: the single exit plus every loss the ancestor (uA) can take.
    enter_terms = [single_exit]
    for loss in uA_loss_events:
        lost_node = (u, loss[1][1])
        reaches_descendant = lost_node == uB
        through_loss = enter_table[u][lost_node][uB]
        if reaches_descendant:
            # Double the nonzero entries if one node is the direct child of the other through
            # the loss event: order matters for a pair of sub-reconciliations rooted at the
            # child, since either of them may be the one given the loss event.
            through_loss = through_loss.double_nonzero_entry()
        enter_terms.append(through_loss << cost(loss, zero_loss))
    return Histogram.sum(enter_terms)
def calculate_hist_both_exit(zero_loss, enter_table, u, gene_tree, uA,
                             dtl_recon_graph_a, uB, dtl_recon_graph_b):
    """
    Computes the histogram of a 'double exit', where both mapping nodes exit
    immediately.

    :param zero_loss <bool>             - whether loss events should not count for distance
    :param enter_table <dict>           - the enter table; the entries of u's children are read
    :param u <str>                      - the gene node whose group we're in
    :param gene_tree <dict>             - the gene tree in vertex format
    :param uA <str>                     - the 'a' mapping node
    :param dtl_recon_graph_a <dict>     - the 'a' DTL reconciliation graph
    :param uB <str>                     - the 'b' mapping node
    :param dtl_recon_graph_b <dict>     - the 'b' DTL reconciliation graph
    :return <Histogram>                 - the histogram of both mapping nodes exiting
    """
    total = Histogram(None)

    # Leaves have no children to recurse into: the only non-empty case is the same
    # mapping node on both sides carrying the contemporaneous ('C') event.
    if is_leaf(u, gene_tree):
        if uA == uB and ('C', (None, None), (None, None)) in dtl_recon_graph_a[uA]:
            total = Histogram(0)
        return total

    exits_a = [
        candidate for candidate in dtl_recon_graph_a[uA]
        if isinstance(candidate, tuple) and is_exit_event(candidate)
    ]
    exits_b = [
        candidate for candidate in dtl_recon_graph_b[uB]
        if isinstance(candidate, tuple) and is_exit_event(candidate)
    ]

    for e_a in exits_a:
        # Gene children and species nodes of the two mapping nodes e_a leads to.
        child1, species_a1 = e_a[1][0], e_a[1][1]
        child2, species_a2 = e_a[2][0], e_a[2][1]
        u1A = (child1, species_a1)
        u2A = (child2, species_a2)

        for e_b in exits_b:
            # If the events are shared, only the first ordering is needed (the second would
            # overcount).
            if uA == uB and e_b > e_a:
                continue

            # e_b may list the children of u in the opposite order from e_a; line them up.
            if child1 == e_b[1][0]:
                species_b1, species_b2 = e_b[1][1], e_b[2][1]
            else:
                species_b1, species_b2 = e_b[2][1], e_b[1][1]
            u1B = (child1, species_b1)
            u2B = (child2, species_b2)

            left_entry = enter_table[child1][u1A][u1B]
            right_entry = enter_table[child2][u2A][u2B]

            events_differ = e_a != e_b
            left_same = u1A == u1B
            right_same = u2A == u2B

            # Technically n_choices encodes the number of choices beyond the first.
            if left_same and right_same and events_differ:
                # 2 choices means a choice about both children AND the event.
                n_choices = 2
            elif (uA == uB and not events_differ) or left_same or right_same:
                # 1 choice means either a choice about both children but not the event, or
                # about the event and only one child.
                n_choices = 1
            else:
                n_choices = 0

            # Convolve left with right, then shift based on the difference of the events.
            pair_hist = left_entry.product_combine(right_entry, n_choices)
            if events_differ:
                shift = cost(e_a, zero_loss) + cost(e_b, zero_loss)
            else:
                shift = intersect_cost(0)
            pair_hist = pair_hist << shift

            # Final histogram is the sum over all event pairs.
            total = total + pair_hist

    return total
def test_shift1(self):
    """Shifting a histogram built from None still gives an empty histogram."""
    empty = Histogram(None)

    shifted = empty << 1

    self.assertEqual(shifted.histogram_dict, {})
def test_shift3(self):
    """Shifting by 1 moves every key up by one, keeps the counts, and leaves the operand unchanged."""
    original = Histogram({0: 1, 3: 5})

    shifted = original << 1

    # The operand must not be modified by the shift.
    self.assertEqual(original.histogram_dict, {0: 1, 3: 5})
    self.assertEqual(shifted.histogram_dict, {1: 1, 4: 5})
def test_shift2(self):
    """Shifting Histogram(0) by 3 moves its single entry from key 0 to key 3."""
    original = Histogram(0)

    shifted = original << 3

    # The operand must not be modified by the shift.
    self.assertEqual(original.histogram_dict, {0: 1})
    self.assertEqual(shifted.histogram_dict, {3: 1})