Exemple #1
0
def BF_enter_hist(recongraph, uA, uB):
    '''
    Brute-force the histogram of pairwise differences between the partial
    MPRs enumerated from two mapping nodes.

    Each pair of trees is scored with recon_trees_diff and the number of
    pairs producing each score is tallied. When uA == uB only unordered
    pairs are counted (every tree against itself and against the trees
    enumerated before it); otherwise every (uA tree, uB tree) combination
    is counted.
    :param recongraph   - the reconciliation graph to enumerate from
    :param uA           - the first mapping node
    :param uB           - the second mapping node
    :return             - a Histogram built from {difference: number of pairs}
    '''
    if uA == uB:
        trees = list(BF_enumerate_partial_MPRs(recongraph, uA))
        # Lower triangle including the diagonal: (i, j) for all j <= i.
        pairs = ((later, earlier)
                 for idx, later in enumerate(trees)
                 for earlier in trees[:idx + 1])
    else:
        trees_a = list(BF_enumerate_partial_MPRs(recongraph, uA))
        trees_b = list(BF_enumerate_partial_MPRs(recongraph, uB))
        # Full cross product, uA trees on the left.
        pairs = ((tree_a, tree_b)
                 for tree_a in trees_a
                 for tree_b in trees_b)

    tally = {}
    for tree_a, tree_b in pairs:
        difference = recon_trees_diff(tree_a, tree_b)
        tally[difference] = tally.get(difference, 0) + 1
    return Histogram(tally)
Exemple #2
0
    def test_combine2(self):
        # Adding two populated histograms: counts of shared keys are summed,
        # keys present on one side only are carried over.
        left = Histogram({0:1, 2:1, 3:2})
        right = Histogram({2:1, 3:1, 10:1})
        combined = left + right

        # The first two checks confirm the operands were not mutated.
        expectations = (
            (left.histogram_dict, {0:1, 2:1, 3:2}),
            (right.histogram_dict, {2:1, 3:1, 10:1}),
            (combined.histogram_dict, {0:1, 2:2, 3:3, 10:1}),
        )
        for actual, expected in expectations:
            self.assertEqual(actual, expected)
Exemple #3
0
    def test_combine1(self):
        # Adding an empty histogram (built from None) to a single-entry
        # histogram (built from 0) leaves the single entry unchanged.
        single = Histogram(0)
        empty = Histogram(None)
        combined = single + empty

        # The first two checks confirm the operands were not mutated.
        expectations = (
            (single.histogram_dict, {0:1}),
            (empty.histogram_dict, {}),
            (combined.histogram_dict, {0:1}),
        )
        for actual, expected in expectations:
            self.assertEqual(actual, expected)
Exemple #4
0
    def test_product2(self):
        # TODO: update test to reflect new product function
        # product_combine of two populated histograms with n_choices == 0.
        left = Histogram({0:1, 2:1, 3:2})
        right = Histogram({2:1, 3:1, 5:1})
        combined = left.product_combine(right, 0)

        # The first two checks confirm the operands were not mutated.
        expectations = (
            (left.histogram_dict, {0:1, 2:1, 3:2}),
            (right.histogram_dict, {2:1, 3:1, 5:1}),
            (combined.histogram_dict,
             {2:1, 3:1, 4:1, 5:4, 6:2, 7:1, 8:2}),
        )
        for actual, expected in expectations:
            self.assertEqual(actual, expected)
Exemple #5
0
    def test_product1(self):
        # TODO: update test to reflect new product function
        # product_combine with an empty histogram yields an empty histogram.
        single = Histogram(0)
        empty = Histogram(None)
        combined = single.product_combine(empty, 0)

        # The first two checks confirm the operands were not mutated.
        expectations = (
            (single.histogram_dict, {0:1}),
            (empty.histogram_dict, {}),
            (combined.histogram_dict, {}),
        )
        for actual, expected in expectations:
            self.assertEqual(actual, expected)
Exemple #6
0
def calculate_incomparable_enter_hist(zero_loss, enter_table, u, uA,
                                      uA_loss_events, uB, uB_loss_events,
                                      hist_both_exit):
    """
    Computes the enter table entry for [uA][uB] for the case where the species nodes of uA and uB lie on different
    parts of the species tree.
    :param zero_loss <bool>             - whether losses should not count
    :param enter_table <dict>           - the DP table we are computing part of
    :param u <str>                      - the gene node whose group we are in
    :param uA <str>                     - the first mapping node to compare
    :param uA_loss_events <list>        - a list of the loss events on that mapping node
    :param uB <str>                     - the second mapping node to compare
    :param uB_loss_events <list>        - a list of the loss events on that mapping node
    :param hist_both_exit <Histogram>   - the histogram of the double-exit that was previously calculated for uA and uB
    :return <Histogram>                 - the enter table entry for [uA][uB]
    """
    # One term per loss on uA: follow the loss on the A side only, shifted by that loss's cost.
    a_side_terms = [
        enter_table[u][(u, loss[1][1])][uB] << cost(loss, zero_loss)
        for loss in uA_loss_events
    ]
    # One term per loss on uB: follow the loss on the B side only.
    b_side_terms = [
        enter_table[u][uA][(u, loss[1][1])] << cost(loss, zero_loss)
        for loss in uB_loss_events
    ]
    counted = [hist_both_exit] + a_side_terms + b_side_terms

    # enter[u][(u, a_child)][(u, b_child)] is included in both the A-side and the B-side terms above, so taking a
    # loss on both sides would be counted twice. Build those entries once per pair of losses so that they can be
    # subtracted off.
    double_counted = [
        enter_table[u][(u, loss_a[1][1])][(u, loss_b[1][1])]
        << (cost(loss_a, zero_loss) + cost(loss_b, zero_loss))
        for loss_a, loss_b in product(uA_loss_events, uB_loss_events)
    ]
    return Histogram.sum(counted) - Histogram.sum(double_counted)
Exemple #7
0
def calculate_equal_enter_hist(zero_loss, enter_table, u, uA, uA_loss_events,
                               uB, uB_loss_events, hist_both_exit,
                               exit_table_a, exit_table_b):
    """
    Computes the enter table entry for [uA][uB] for the case where uA and uB are the same mapping node (the loss
    events passed for each side may still differ).
    :param zero_loss <bool>             - whether losses should not count
    :param enter_table <dict>           - the DP table we are computing part of
    :param u <str>                      - the gene node whose group we are in
    :param uA <str>                     - the first mapping node to compare
    :param uA_loss_events <list>        - a list of the loss events on that mapping node
    :param uB <str>                     - the second mapping node to compare
    :param uB_loss_events <list>        - a list of the loss events on that mapping node
    :param hist_both_exit <Histogram>   - the histogram of the double-exit that was previously calculated for uA and uB
    :param exit_table_a <dict>          - the a exit table (not read by this function)
    :param exit_table_b <dict>          - the b exit table, read at [u][uB][(u, child)] for each loss child of uA
    :return <Histogram>                 - the enter table entry for [uA][uB]
    """
    # This function is only meaningful when both mapping nodes coincide.
    assert uA == uB, "calculate_equal_enter_hist called on values of uA and uB that are not equal"

    # Every histogram collected here is summed into the final entry.
    terms = [hist_both_exit]

    # Both sides take a loss.
    for loss_a in uA_loss_events:
        child_a = loss_a[1][1]
        for loss_b in uB_loss_events:
            child_b = loss_b[1][1]
            if child_a == child_b:
                # The same loss was taken on both sides, so nothing is shifted.
                terms.append(enter_table[u][(u, child_a)][(u, child_b)])
            elif child_a < child_b:
                # (a, b) and (b, a) both show up in this double loop; keep only the first ordering.
                # NOTE(review): the shift is the literal 2 rather than the summed cost() of the two losses, so it
                # does not depend on zero_loss -- confirm this is intended.
                terms.append(enter_table[u][(u, child_a)][(u, child_b)] << 2)

    # Only one side takes a loss. Iterating uA_loss_events is enough because the two event lists are expected to
    # be the same when the nodes are equal.
    terms += [
        exit_table_b[u][uB][(u, loss[1][1])] << cost(loss, zero_loss)
        for loss in uA_loss_events
    ]
    return Histogram.sum(terms)
Exemple #8
0
def BF_find_histogram(recongraph, roots):
    '''
    Brute-force the histogram of a reconciliation graph by enumerating all
    of its reconciliation trees.

    Every unordered pair of enumerated trees (each tree paired with itself
    and with every tree enumerated before it) is scored with
    recon_trees_diff, and the number of pairs producing each score is
    tallied.
    :param recongraph   - the reconciliation graph to enumerate from
    :param roots        - the roots handed to BF_enumerate_MPRs
    :return             - a Histogram built from {difference: number of pairs}
    '''
    # BF_enumerate_MPRs yields (tree, root) pairs; only the trees are needed.
    trees = [tree for tree, _root in BF_enumerate_MPRs(recongraph, roots)]

    tally = {}
    for idx, later_tree in enumerate(trees):
        # Lower triangle including the diagonal: compare against trees[0..idx].
        for earlier_tree in trees[:idx + 1]:
            difference = recon_trees_diff(later_tree, earlier_tree)
            tally[difference] = tally.get(difference, 0) + 1

    return Histogram(tally)
Exemple #9
0
def diameter_algorithm(species_tree,
                       gene_tree,
                       gene_tree_root,
                       dtl_recon_graph_a,
                       dtl_recon_graph_b,
                       debug,
                       zero_loss,
                       verify=False):
    """
    This function fills the enter table for every gene node and every pair of mapping nodes (one taken from each
     reconciliation graph) and returns the sum of the root's enter table entries as a Histogram. While you can get
     the standard behaviour by making dtl_recon_graph_a equal dtl_recon_graph_b, arbitrary restrictions may be
     placed on which nodes are selected by choosing different graphs, for example by limiting one of the graphs to
     a single reconciliation tree.
    :param species_tree <dict>        - the species tree (in vertex form)
    :param gene_tree <dict>           - the gene tree (in vertex form)
    :param gene_tree_root <str>       - the root of the gene tree
    :param dtl_recon_graph_a <dict>   - one of the two DTL reconciliation graphs to compare
    :param dtl_recon_graph_b <dict>   - the other reconciliation graph. Both must share the same species and gene trees.
    :param debug <bool>               - whether or not to print out pretty tables
    :param zero_loss <bool>           - whether losses should count at all
    :param verify <bool>              - whether to verify the calculations using brute force
    :return <Histogram>               - the sum of the enter table entries [uA][uB] of gene_tree_root, skipping the
                                        pairs where uB > uA
    :raises ValueError                - if the ancestral table holds a relation other than 'in', 'eq', 'des' or 'an'
    """

    # The brute-force verifier is built from graph a only.
    if verify:
        verifier = BFVerifier(dtl_recon_graph_a)

    # NOTE(review): the names assume the dict key order of both trees is a postorder -- confirm with the caller.
    postorder_gene_nodes = list(gene_tree.keys())
    postorder_species_nodes = list(species_tree.keys())
    postorder_group_a = make_group_dict(gene_tree, dtl_recon_graph_a,
                                        postorder_species_nodes)
    postorder_group_b = make_group_dict(gene_tree, dtl_recon_graph_b,
                                        postorder_species_nodes)

    ancestral_table = calculate_ancestral_table(species_tree)

    if debug:
        print_table_nicely(ancestral_table, ", ", "Ancestral", "literal")

    exit_table_a = {}
    exit_table_b = {}

    enter_table = {}

    for u in postorder_gene_nodes:
        enter_table[u] = {}
        exit_table_a[u] = {}
        exit_table_b[u] = {}

        # Loop over every pair of mapping nodes in group(u)
        for uA in postorder_group_a[u]:
            enter_table[u][uA] = {}
            for uB in postorder_group_b[u]:

                hist_both_exit = calculate_hist_both_exit(
                    zero_loss, enter_table, u, gene_tree, uA,
                    dtl_recon_graph_a, uB, dtl_recon_graph_b)

                # Look up ancestry string in the precomputed table (indexed by the species nodes of the mapping nodes)
                ancestry = ancestral_table[uA[1]][uB[1]]

                # Loss events are the tuple entries of a mapping node whose first element is 'L'.
                uA_loss_events = [
                    event for event in dtl_recon_graph_a[uA]
                    if isinstance(event, tuple) and event[0] == 'L'
                ]
                uB_loss_events = [
                    event for event in dtl_recon_graph_b[uB]
                    if isinstance(event, tuple) and event[0] == 'L'
                ]

                # To compute the proper enter entry, we must know how the two nodes relate to each other.
                if ancestry == 'in':
                    hist = calculate_incomparable_enter_hist(
                        zero_loss, enter_table, u, uA, uA_loss_events, uB,
                        uB_loss_events, hist_both_exit)
                elif ancestry == 'eq':
                    hist = calculate_equal_enter_hist(zero_loss, enter_table,
                                                      u, uA, uA_loss_events,
                                                      uB, uB_loss_events,
                                                      hist_both_exit,
                                                      exit_table_a,
                                                      exit_table_b)
                # The only difference between the 'des' and 'an' cases is whether the nodes should be swapped
                elif ancestry == 'des':
                    hist = calculate_ancestral_enter_hist(
                        zero_loss, True, enter_table, u, uA, uA_loss_events,
                        uB, uB_loss_events, hist_both_exit, exit_table_a,
                        exit_table_b)
                elif ancestry == 'an':
                    hist = calculate_ancestral_enter_hist(
                        zero_loss, False, enter_table, u, uA, uA_loss_events,
                        uB, uB_loss_events, hist_both_exit, exit_table_a,
                        exit_table_b)
                else:
                    raise ValueError(
                        "Invalid ancestry type '{0}', check calculate_ancestral_table()."
                        .format(ancestry))
                if verify:
                    verifier.verify_enter(uA, uB, hist)
                enter_table[u][uA][uB] = hist
                if debug:
                    # The separator before "Hist:" used to be the invalid escape "\H"; a tab was intended.
                    print("{0} -{1}-> {2}, Double-equal\t{3}\tHist:{4}".format(
                        uA, ancestry, uB, hist_both_exit, hist))

        if debug:
            print_table_nicely(enter_table[u], ", ",
                               "EnterTable({0})".format(u))

    if debug:
        print("Exit Table A: {0}".format(exit_table_a))
        print("")
        print("Exit Table B: {0}".format(exit_table_b))
    # The result is the sum of the root's enter table entries. Pairs with uB > uA are skipped so that only one
    # ordering of each pair of distinct mapping nodes contributes.
    result = Histogram(None)
    for uA in enter_table[gene_tree_root]:
        for uB in enter_table[gene_tree_root][uA]:
            if uB > uA:
                continue
            entry = enter_table[gene_tree_root][uA][uB]
            result = result + entry
    return result
Exemple #10
0
def calculate_ancestral_enter_hist(zero_loss, is_swapped, enter_table, u, uA,
                                   uA_loss_events, uB, uB_loss_events,
                                   hist_both_exit, exit_table_a, exit_table_b):
    """
    Computes the enter table entry for [uA][uB] when one mapping node is an ancestor of the other: A is the
    ancestor when is_swapped is false, B is the ancestor when is_swapped is true. As a side effect the single-exit
    entry of the pair is stored in the ancestor's exit table (exit_table_a[u][uA][uB] or exit_table_b[u][uB][uA]).
    :param zero_loss <bool>             - whether losses should not count
    :param is_swapped <bool>            - whether B is an ancestor of A (instead of the assumed A is an ancestor of B)
    :param enter_table <dict>           - the DP table we are computing part of
    :param u <str>                      - the gene node whose group we are in
    :param uA <str>                     - the first mapping node to compare
    :param uA_loss_events <list>        - a list of the loss events on that mapping node
    :param uB <str>                     - the second mapping node to compare
    :param uB_loss_events <list>        - a list of the loss events on that mapping node
    :param hist_both_exit <Histogram>   - the histogram of the double-exit that was previously calculated for uA and uB
    :param exit_table_a <dict>          - the a exit table, read and updated when A is the ancestor
    :param exit_table_b <dict>          - the b exit table, read and updated when B is the ancestor
    :return <Histogram>                 - the enter table entry for [uA][uB]
    """
    # The two branches mirror each other. They cannot be merged by swapping the arguments, because enter_table is
    # always indexed as [A-side node][B-side node].
    if is_swapped:
        # uB is the ancestor, uA the descendant.
        # Single exit: the double exit plus every loss the descendant (uA) can take, each shifted by its cost.
        exit_terms = [hist_both_exit] + [
            exit_table_b[u][uB][(u, loss[1][1])] << cost(loss, zero_loss)
            for loss in uA_loss_events
        ]

        # Create the ancestor's row on first use, then record the single-exit entry.
        ancestor_row = exit_table_b[u].setdefault(uB, {})
        ancestor_row[uA] = Histogram.sum(exit_terms)

        enter_terms = [ancestor_row[uA]]
        for loss in uB_loss_events:
            loss_target = (u, loss[1][1])
            below = enter_table[u][uA][loss_target]
            # Double the nonzero entries when the loss leads straight to the other node (see the other branch).
            if uA == loss_target:
                below = below.double_nonzero_entry()
            enter_terms.append(below << cost(loss, zero_loss))
        return Histogram.sum(enter_terms)

    # uA is the ancestor, uB the descendant.
    # Single exit: the double exit plus every loss the descendant (uB) can take, each shifted by its cost.
    exit_terms = [hist_both_exit] + [
        exit_table_a[u][uA][(u, loss[1][1])] << cost(loss, zero_loss)
        for loss in uB_loss_events
    ]

    # Create the ancestor's row on first use, then record the single-exit entry.
    ancestor_row = exit_table_a[u].setdefault(uA, {})
    ancestor_row[uB] = Histogram.sum(exit_terms)

    enter_terms = [ancestor_row[uB]]
    for loss in uA_loss_events:
        loss_target = (u, loss[1][1])
        below = enter_table[u][loss_target][uB]
        # Double the nonzero entries if one node is the direct child of the other through the loss event.
        # Order matters for a pair of sub-reconciliations rooted at the child: either of them may be given the
        # loss event and used as the sub-reconciliation rooted at the parent.
        if loss_target == uB:
            below = below.double_nonzero_entry()
        enter_terms.append(below << cost(loss, zero_loss))
    return Histogram.sum(enter_terms)
Exemple #11
0
def calculate_hist_both_exit(zero_loss, enter_table, u, gene_tree, uA,
                             dtl_recon_graph_a, uB, dtl_recon_graph_b):
    """
    Computes the histogram of a 'double exit', where both mapping nodes exit immediately.
    :param zero_loss <bool>           - a boolean value representing whether loss events should count for distance
    :param enter_table <dict>         - the enter table; the entries of u's two children are read from it
    :param u <str>                    - the gene node whose group we're in
    :param gene_tree <dict>           - the gene tree in vertex format
    :param uA <str>                   - the 'a' mapping node
    :param dtl_recon_graph_a <dict>   - the 'a' DTL reconciliation graph
    :param uB <str>                   - the 'b' mapping node
    :param dtl_recon_graph_b <dict>   - the 'b' DTL reconciliation graph
    :return <Histogram>               - the Histogram object of both mapping nodes exiting
    """
    # Leaf case: the only double exit is both sides sitting on the same node with a 'C' event.
    if is_leaf(u, gene_tree):
        leaf_event = ('C', (None, None), (None, None))
        if uA == uB and leaf_event in dtl_recon_graph_a[uA]:
            return Histogram(0)
        return Histogram(None)

    exits_a = [
        candidate for candidate in dtl_recon_graph_a[uA]
        if isinstance(candidate, tuple) and is_exit_event(candidate)
    ]
    exits_b = [
        candidate for candidate in dtl_recon_graph_b[uB]
        if isinstance(candidate, tuple) and is_exit_event(candidate)
    ]
    same_node = uA == uB

    # The final histogram is the sum over all pairs of exit events.
    total = Histogram(None)
    for e_a in exits_a:
        # The two mapping nodes that e_a leads to: (child gene node, species node).
        child1 = e_a[1][0]
        child2 = e_a[2][0]
        u1A = (child1, e_a[1][1])
        u2A = (child2, e_a[2][1])
        for e_b in exits_b:
            # If the events are shared, only the first ordering is needed (the second would overcount).
            if same_node and e_b > e_a:
                continue
            # e_b may list the children of u in the opposite order; line its species nodes up with e_a's.
            if child1 == e_b[1][0]:
                species_b1, species_b2 = e_b[1][1], e_b[2][1]
            else:
                species_b1, species_b2 = e_b[2][1], e_b[1][1]
            u1B = (child1, species_b1)
            u2B = (child2, species_b2)

            left_entry = enter_table[child1][u1A][u1B]
            right_entry = enter_table[child2][u2A][u2B]

            same_left = u1A == u1B
            same_right = u2A == u2B
            same_event = e_a == e_b
            # n_choices encodes the number of choices beyond the first.
            if same_left and same_right and not same_event:
                # 2 choices means a choice about both children AND the event.
                n_choices = 2
            elif (same_node and same_event) or same_left or same_right:
                # 1 choice means either a choice about both children but not the event, or about the event and
                # only one child.
                n_choices = 1
            else:
                n_choices = 0

            # Convolve left and right, then shift based on the difference of the events.
            pair_hist = left_entry.product_combine(right_entry, n_choices)
            if same_event:
                shift = intersect_cost(0)
            else:
                shift = cost(e_a, zero_loss) + cost(e_b, zero_loss)
            total = total + (pair_hist << shift)
    return total
Exemple #12
0
    def test_shift1(self):
        # Shifting an empty histogram leaves it empty.
        empty = Histogram(None)
        shifted = empty << 1

        self.assertEqual(shifted.histogram_dict, {})
Exemple #13
0
    def test_shift3(self):
        # Shifting by 1 adds 1 to every key and keeps the counts.
        original = Histogram({0:1, 3:5})
        shifted = original << 1

        # The first check confirms the operand was not mutated.
        expectations = (
            (original.histogram_dict, {0:1, 3:5}),
            (shifted.histogram_dict, {1:1, 4:5}),
        )
        for actual, expected in expectations:
            self.assertEqual(actual, expected)
Exemple #14
0
    def test_shift2(self):
        # Shifting the single-entry histogram {0: 1} by 3 moves the entry to key 3.
        original = Histogram(0)
        shifted = original << 3

        # The first check confirms the operand was not mutated.
        expectations = (
            (original.histogram_dict, {0:1}),
            (shifted.histogram_dict, {3:1}),
        )
        for actual, expected in expectations:
            self.assertEqual(actual, expected)