Esempio n. 1
0
def dup_loss_topology_prior(tree, stree, recon, birth, death, maxdoom=20, events=None):
    """
    Compute the log prior of a gene tree topology under the dup-loss model.

    tree    -- gene tree; implied speciation nodes are added to it while the
               prior is computed and single-child nodes are removed before
               returning
    stree   -- species tree
    recon   -- reconciliation mapping gene tree nodes to species tree nodes
    birth   -- birth rate handed to the birth-death helpers
    death   -- death rate handed to the birth-death helpers
    maxdoom -- upper limit of the summation index (also passed to
               calc_doom_table)
    events  -- events dict for the gene tree; labeled from recon when None
    """

    def gene2species(gene):
        # species name of the gene tree node named `gene`
        return recon[tree.nodes[gene]].name

    if events is None:
        events = phylo.label_events(tree, recon)

    # the leaf set is taken before implied speciation nodes are inserted
    orig_leaves = set(tree.leaves())
    phylo.add_implied_spec_nodes(tree, stree, recon, events)

    _pstree, _snodes, snode_index = spidir.make_ptree(stree)

    # get doomtable
    doomtable = calc_doom_table(stree, birth, death, maxdoom)

    logp = 0.0
    for node in tree:
        if events[node] != "spec":
            continue

        for schild in recon[node].children:
            matching = [child for child in node.children if recon[child] == schild]

            if matching:
                top = matching[0]
                subleaves = get_sub_tree(top, schild, recon, events)
                nhist = birthdeath.num_topology_histories(top, subleaves)
                s = len(subleaves)
                thist = stats.factorial(s) * stats.factorial(s - 1) / 2 ** (s - 1)

                # True ("internal") when none of the subleaves is a leaf of
                # the tree, False ("leaves") otherwise
                all_internal = len(set(subleaves) & orig_leaves) == 0
                logp += log(num_redundant_topology(top, gene2species, subleaves, all_internal))
            else:
                nhist = 1.0
                thist = 1.0
                s = 0

            # accumulate the terms for i = 0 .. maxdoom
            t = 0
            for i in range(maxdoom + 1):
                t += (
                    stats.choose(s + i, i)
                    * birthdeath.prob_birth_death1(s + i, schild.dist, birth, death)
                    * exp(doomtable[snode_index[schild]]) ** i
                )

            logp += log(nhist) - log(thist) + log(t)

    # correct for renumbering
    logp -= log(num_redundant_topology(tree.root, gene2species))

    # phylo.removeImpliedSpecNodes(tree, recon, events)
    treelib.remove_single_children(tree)

    return logp
Esempio n. 2
0
def count_dup_loss_coal_tree(coal_tree, extra, stree, gene2species,
                             implied=True, locus_mpr=True):
    """
    Count duplications, losses, coalescences and appearances.

    coal_tree    -- coalescent tree
    extra        -- dict providing at least "locus_tree" and "coal_recon"
    stree        -- species tree
    gene2species -- mapping function used to reconcile the locus tree
    implied      -- if True, implied speciation nodes are temporarily added
                    to the locus tree while coalescences are counted
    locus_mpr    -- must be True; any other value is not implemented

    Returns the tuple (ndup, nloss, ncoal, nappear).

    Side effect: for every locus node, the per-branch coalescence count is
    added to the 'coal' entry of the `data` dict of the node that the locus
    reconciliation maps it to.

    Raises NotImplementedError if locus_mpr is False.
    """

    if not locus_mpr:
        # NotImplementedError derives from Exception, so handlers written
        # for the previous bare Exception still catch it
        raise NotImplementedError("not implemented")

    # TODO: use locus_recon and locus_events rather than MPR
    #       (currently, phylo.py reconciliation functions fail for non-MPR)
    locus_tree = extra["locus_tree"]
    locus_recon = phylo.reconcile(locus_tree, stree, gene2species)
    locus_events = phylo.label_events(locus_tree, locus_recon)
    coal_recon = extra["coal_recon"]

    ndup, nloss, nappear = phylo.count_dup_loss_tree(locus_tree, stree, gene2species,
                                                     locus_recon, locus_events)

    # add implied speciation nodes if desired
    # this must be added AFTER counting dups and losses since it affects loss inference
    if implied:
        added = phylo.add_implied_spec_nodes(locus_tree, stree, locus_recon, locus_events)

    try:
        # count coals
        ncoal = 0
        counts = coal.count_lineages_per_branch(coal_tree, coal_recon, locus_tree)
        # items() rather than iteritems() so the loop runs on Python 2 and 3
        for lnode, (_count_bot, count_top) in counts.items():
            n = max(count_top - 1, 0)
            locus_recon[lnode].data['coal'] += n
            ncoal += n
    finally:
        # locus_tree belongs to the caller (it comes from `extra`), so the
        # temporary nodes are removed even when counting fails
        if implied:
            phylo.remove_implied_spec_nodes(locus_tree, added, locus_recon, locus_events)

    return ndup, nloss, ncoal, nappear
Esempio n. 3
0
    def eval_proposal(self, proposal):
        """
        Compute the cost of a proposal and record the details on it.

        The cost is the sum of the duplication, loss and coalescence costs.
        Each is the event count times its weight (self.dupcost,
        self.losscost, self.coalcost); a weight of 0 skips the count, which
        is then reported as None. If the locus events and daughters do not
        match, all three costs are util.INF and all counts are None.

        A dict with keys "ndup", "nloss", "ncoal" and "cost" is stored in
        proposal.data. Returns the total cost.
        """

        ndup = nloss = ncoal = None

        # ensure locus events (duplications) and daughters match
        if phyloDLC.assert_daughters(proposal.locus_events, proposal.daughters):
            dupcost = losscost = coalcost = 0

            # find dup cost
            if self.dupcost != 0:
                ndup = phylo.count_dup(proposal.locus_tree, proposal.locus_events)
                dupcost = ndup * self.dupcost

            # find loss cost
            if self.losscost != 0:
                nloss = phylo.count_loss(proposal.locus_tree, self.stree, proposal.locus_recon)
                losscost = nloss * self.losscost

            # first ensure bounded coalescent is satisfied
            # (should always be true based on how daughters are proposed)
            phyloDLC.assert_bounded_coal(self.coal_tree, proposal.coal_recon,
                                         proposal.locus_tree, proposal.daughters)

            # find coal cost
            if self.coalcost != 0:
                # add implied speciation nodes if desired
                # this must be added AFTER counting dups and losses since it affects loss inference
                if self.implied:
                    added = phylo.add_implied_spec_nodes(
                        proposal.locus_tree, self.stree,
                        proposal.locus_recon, proposal.locus_events)

                ncoal = phyloDLC.count_coal(self.coal_tree, proposal.coal_recon,
                                            proposal.locus_tree)
                coalcost = ncoal * self.coalcost

                if self.implied:
                    phylo.remove_implied_spec_nodes(
                        proposal.locus_tree, added,
                        proposal.locus_recon, proposal.locus_events)
        else:
            dupcost = losscost = coalcost = util.INF

        # total cost
        cost = dupcost + losscost + coalcost

        # logging info
        proposal.data = {
            "ndup": ndup,
            "nloss": nloss,
            "ncoal": ncoal,
            "cost": cost,
        }

        return cost
Esempio n. 4
0
    def eval_proposal(self, proposal):
        """
        Compute probability of proposal.

        Implied speciation nodes are added to the proposal's locus tree
        before the value is computed; afterwards single-child nodes are
        removed from the tree and the locus reconciliation is passed to
        phylo.subset_recon together with the tree.

        Returns the value computed by prob_dlcoal_recon_topology.
        """

        locus_tree = proposal["locus_tree"]
        locus_recon = proposal["locus_recon"]
        locus_events = proposal["locus_events"]

        # compute recon probability
        # (add_spec=False below because the implied nodes are added here)
        phylo.add_implied_spec_nodes(locus_tree, self.stree,
                                     locus_recon, locus_events)
        p_recon = prob_dlcoal_recon_topology(
            self.coal_tree, proposal["coal_recon"],
            locus_tree, locus_recon, locus_events,
            proposal["daughters"],
            self.stree, self.n,
            self.duprate, self.lossrate,
            self.pretime, self.premean,
            maxdoom=self.maxdoom,
            nsamples=self.nsamples,
            add_spec=False)

        # undo the temporary changes to the locus tree
        treelib.remove_single_children(locus_tree)
        phylo.subset_recon(locus_tree, locus_recon)

        return p_recon
Esempio n. 5
0
def add_implied_spec_nodes(tree, stree, recon, events):
    """
    Add speciation nodes to tree that are implied but are not present because of gene losses.

    Extends phylo.add_implied_spec_nodes to handle non-MPR.
    Only guaranteed to work for binary trees.

    Returns the pair (added_spec, added_dup): the result of
    phylo.add_implied_spec_nodes and the list of everything returned by
    add_spec_from_dup_nodes.
    """

    added_spec = phylo.add_implied_spec_nodes(tree, stree, recon, events)

    added_dup = []
    # iterate over a snapshot of the nodes
    # (add_spec_from_dup_nodes receives the tree and presumably modifies it)
    for node in list(tree):
        child_species = [recon[child] for child in node.children]

        # need at least two children ...
        if len(child_species) <= 1:
            continue
        # ... that all map to the same species node ...
        if len(set(child_species)) != 1:
            continue
        # ... on a node that is not labeled as a duplication
        if events[node] == "dup":
            continue

        added_dup.extend(add_spec_from_dup_nodes(node, tree, recon, events))

    assert is_full_tree(tree, stree, recon, events)

    return added_spec, added_dup
Esempio n. 6
0
def count_dup_loss_coal(coal_tree, extra, stree, implied=True):
    """
    Returns the number of duplications + losses + coalescences.

    `extra` supplies "locus_tree", "locus_recon", "locus_events" and
    "coal_recon". When `implied` is true, implied speciation nodes are
    added to the locus tree for the coalescence count and removed again.
    """

    locus_tree = extra["locus_tree"]
    locus_recon = extra["locus_recon"]
    locus_events = extra["locus_events"]
    coal_recon = extra["coal_recon"]

    total = count_dup(locus_tree, locus_events)
    total_loss = count_loss(locus_tree, stree, locus_recon)

    if not implied:
        ncoal = count_coal(coal_tree, coal_recon, locus_tree)
    else:
        # add implied speciation nodes if desired
        # this must be added AFTER counting dups and losses since it affects loss inference
        added = phylo.add_implied_spec_nodes(locus_tree, stree, locus_recon, locus_events)
        ncoal = count_coal(coal_tree, coal_recon, locus_tree)
        phylo.remove_implied_spec_nodes(locus_tree, added, locus_recon, locus_events)

    return total + total_loss + ncoal
Esempio n. 7
0
def count_dup_loss_coal_tree(coal_tree,
                             extra,
                             stree,
                             gene2species,
                             implied=True,
                             locus_mpr=True):
    """
    Count duplications, losses, coalescences and appearances.

    coal_tree    -- coalescent tree
    extra        -- dict providing at least "locus_tree" and "coal_recon"
    stree        -- species tree
    gene2species -- mapping function used to reconcile the locus tree
    implied      -- if True, implied speciation nodes are temporarily added
                    to the locus tree while coalescences are counted
    locus_mpr    -- must be True; any other value is not implemented

    Returns the tuple (ndup, nloss, ncoal, nappear).

    Side effect: for every locus node, the per-branch coalescence count is
    added to the 'coal' entry of the `data` dict of the node that the locus
    reconciliation maps it to.

    Raises NotImplementedError if locus_mpr is False.
    """

    if not locus_mpr:
        # NotImplementedError derives from Exception, so handlers written
        # for the previous bare Exception still catch it
        raise NotImplementedError("not implemented")

    # TODO: use locus_recon and locus_events rather than MPR
    #       (currently, phylo.py reconciliation functions fail for non-MPR)
    locus_tree = extra["locus_tree"]
    locus_recon = phylo.reconcile(locus_tree, stree, gene2species)
    locus_events = phylo.label_events(locus_tree, locus_recon)
    coal_recon = extra["coal_recon"]

    ndup, nloss, nappear = phylo.count_dup_loss_tree(locus_tree, stree,
                                                     gene2species, locus_recon,
                                                     locus_events)

    # add implied speciation nodes if desired
    # this must be added AFTER counting dups and losses since it affects loss inference
    if implied:
        added = phylo.add_implied_spec_nodes(locus_tree, stree, locus_recon,
                                             locus_events)

    try:
        # count coals
        ncoal = 0
        counts = coal.count_lineages_per_branch(coal_tree, coal_recon,
                                                locus_tree)
        # items() rather than iteritems() so the loop runs on Python 2 and 3
        for lnode, (_count_bot, count_top) in counts.items():
            n = max(count_top - 1, 0)
            locus_recon[lnode].data['coal'] += n
            ncoal += n
    finally:
        # locus_tree belongs to the caller (it comes from `extra`), so the
        # temporary nodes are removed even when counting fails
        if implied:
            phylo.remove_implied_spec_nodes(locus_tree, added, locus_recon,
                                            locus_events)

    return ndup, nloss, ncoal, nappear
Esempio n. 8
0
def add_implied_spec_nodes(tree, stree, recon, events):
    """
    Add speciation nodes to tree that are implied but are not present because of gene losses.

    Extends phylo.add_implied_spec_nodes to handle non-MPR.
    Only guaranteed to work for binary trees.

    Returns the pair (added_spec, added_dup): the result of
    phylo.add_implied_spec_nodes and the list of everything returned by
    add_spec_from_dup_nodes.
    """

    added_spec = phylo.add_implied_spec_nodes(tree, stree, recon, events)

    added_dup = []
    # iterate over a snapshot of the nodes
    # (add_spec_from_dup_nodes receives the tree and presumably modifies it)
    for node in list(tree):
        child_species = [recon[child] for child in node.children]

        # need at least two children ...
        if len(child_species) <= 1:
            continue
        # ... that all map to the same species node ...
        if len(set(child_species)) != 1:
            continue
        # ... on a node that is not labeled as a duplication
        if events[node] == "dup":
            continue

        added_dup.extend(
            add_spec_from_dup_nodes(node, tree, recon, events))

    assert is_full_tree(tree, stree, recon, events)

    return added_spec, added_dup
Esempio n. 9
0
def count_dup_loss_coal(coal_tree, extra, stree, implied=True):
    """
    Returns the number of duplications + losses + coalescences.

    `extra` supplies "locus_tree", "locus_recon", "locus_events" and
    "coal_recon". When `implied` is true, implied speciation nodes are
    added to the locus tree for the coalescence count and removed again.
    """

    locus_tree = extra["locus_tree"]
    locus_recon = extra["locus_recon"]
    locus_events = extra["locus_events"]
    coal_recon = extra["coal_recon"]

    total = count_dup(locus_tree, locus_events)
    total_loss = count_loss(locus_tree, stree, locus_recon)

    if not implied:
        ncoal = count_coal(coal_tree, coal_recon, locus_tree)
    else:
        # add implied speciation nodes if desired
        # this must be added AFTER counting dups and losses since it affects loss inference
        added = phylo.add_implied_spec_nodes(locus_tree, stree, locus_recon,
                                             locus_events)
        ncoal = count_coal(coal_tree, coal_recon, locus_tree)
        phylo.remove_implied_spec_nodes(locus_tree, added, locus_recon,
                                        locus_events)

    return total + total_loss + ncoal
Esempio n. 10
0
    def eval_proposal(self, proposal):
        """
        Compute the cost of a proposal and record the details on it.

        The cost is the sum of the duplication, loss and coalescence costs.
        Each is the event count times its weight (self.dupcost,
        self.losscost, self.coalcost); a weight of 0 skips the count, which
        is then reported as None. If the locus events and daughters do not
        match, all three costs are util.INF and all counts are None.

        A dict with keys "ndup", "nloss", "ncoal" and "cost" is stored in
        proposal.data. Returns the total cost.
        """

        ndup = nloss = ncoal = None

        # ensure locus events (duplications) and daughters match
        if phyloDLC.assert_daughters(proposal.locus_events,
                                     proposal.daughters):
            dupcost = losscost = coalcost = 0

            # find dup cost
            if self.dupcost != 0:
                ndup = phylo.count_dup(proposal.locus_tree,
                                       proposal.locus_events)
                dupcost = ndup * self.dupcost

            # find loss cost
            if self.losscost != 0:
                nloss = phylo.count_loss(proposal.locus_tree, self.stree,
                                         proposal.locus_recon)
                losscost = nloss * self.losscost

            # first ensure bounded coalescent is satisfied
            # (should always be true based on how daughters are proposed)
            phyloDLC.assert_bounded_coal(self.coal_tree, proposal.coal_recon,
                                         proposal.locus_tree,
                                         proposal.daughters)

            # find coal cost
            if self.coalcost != 0:
                # add implied speciation nodes if desired
                # this must be added AFTER counting dups and losses since it affects loss inference
                if self.implied:
                    added = phylo.add_implied_spec_nodes(
                        proposal.locus_tree, self.stree,
                        proposal.locus_recon, proposal.locus_events)

                ncoal = phyloDLC.count_coal(self.coal_tree,
                                            proposal.coal_recon,
                                            proposal.locus_tree)
                coalcost = ncoal * self.coalcost

                if self.implied:
                    phylo.remove_implied_spec_nodes(
                        proposal.locus_tree, added,
                        proposal.locus_recon, proposal.locus_events)
        else:
            dupcost = losscost = coalcost = util.INF

        # total cost
        cost = dupcost + losscost + coalcost

        # logging info
        proposal.data = {
            "ndup": ndup,
            "nloss": nloss,
            "ncoal": ncoal,
            "cost": cost,
        }

        return cost
Esempio n. 11
0
def dlcoal_recon_old(tree, stree, gene2species,
                     n, duprate, lossrate,
                     pretime=None, premean=None,
                     nsearch=1000,
                     maxdoom=20, nsamples=100,
                     search=phylo.TreeSearchNni):
    """
    Perform reconciliation using the DLCoal model

    Runs `nsearch` rounds of local search over locus trees and keeps the
    best-scoring reconciliation seen.

    Returns (maxp, maxrecon) where 'maxp' is the probability of the
    MAP reconciliation 'maxrecon' which is further defined as

    maxrecon = {'coal_recon': coal_recon,
                'locus_tree': locus_tree,
                'locus_recon': locus_recon,
                'locus_events': locus_events,
                'daughters': daughters}

    """

    # the coal tree is the input tree; the locus tree starts out congruent
    # to it, which is equivalent to assuming no ILS
    coal_tree = tree
    current_tree = coal_tree.copy()

    best_p = - util.INF
    best_recon = None

    # init search
    searcher = search(current_tree)

    for _ in xrange(nsearch):
        # TODO: propose other reconciliations beside LCA
        candidate = current_tree.copy()
        phylo.recon_root(candidate, stree, gene2species, newCopy=False)
        cand_recon = phylo.reconcile(candidate, stree, gene2species)
        cand_events = phylo.label_events(candidate, cand_recon)

        # propose daughters (TODO)
        cand_daughters = set()

        # propose coal recon (TODO: propose others beside LCA)
        cand_coal_recon = phylo.reconcile(coal_tree, candidate, lambda x: x)

        # compute recon probability; implied speciation nodes are added
        # here, hence add_spec=False
        phylo.add_implied_spec_nodes(candidate, stree,
                                     cand_recon, cand_events)
        p = prob_dlcoal_recon_topology(coal_tree, cand_coal_recon,
                                       candidate, cand_recon, cand_events,
                                       cand_daughters,
                                       stree, n, duprate, lossrate,
                                       pretime, premean,
                                       maxdoom=maxdoom, nsamples=nsamples,
                                       add_spec=False)
        treelib.remove_single_children(candidate)

        if p > best_p:
            # new best: remember it and continue the search from a copy
            best_p = p
            best_recon = {"coal_recon": cand_coal_recon,
                          "locus_tree": candidate,
                          "locus_recon": cand_recon,
                          "locus_events": cand_events,
                          "daughters": cand_daughters}
            current_tree = candidate.copy()
            searcher.set_tree(current_tree)
        else:
            searcher.revert()

        # perform local rearrangement to locus tree
        searcher.propose()

    return best_p, best_recon
Esempio n. 12
0
def prob_dlcoal_recon_topology(coal_tree, coal_recon,
                               locus_tree, locus_recon, locus_events,
                               daughters,
                               stree, n, duprate, lossrate,
                               pretime=None, premean=None,
                               maxdoom=20, nsamples=100,
                               add_spec=True):
    """
    Log probability of a reconciled gene tree in the DLCoal model.

    coal_tree    -- coalescent tree
    coal_recon   -- reconciliation of coalescent tree to locus tree
    locus_tree   -- locus tree (has dup-loss)
    locus_recon  -- reconciliation of locus tree to species tree
    locus_events -- events dict for locus tree
    daughters    -- daughters passed on to prob_coal_recon_topology
    stree        -- species tree
    n            -- population sizes in species tree
    duprate      -- duplication rate
    lossrate     -- loss rate
    maxdoom      -- passed on to spidir.calc_birth_death_prior
    nsamples     -- number of duplication-time samples to average over
    add_spec     -- if True, implied speciation nodes are added to the locus
                    tree here (they are not removed afterwards)

    You must also specify one of the following
    pretime      -- starting time before species tree
    premean      -- mean starting time before species tree

    Note: if add_spec is False, the locus tree must already have implied
    speciation nodes present.

    Side effect: the branch lengths of locus_tree are overwritten with the
    sampled duplication times on every sample.
    """

    # duplications are counted before implied speciation nodes are added
    dups = phylo.count_dup(locus_tree, locus_events)

    # ensure implicit speciations are present
    if add_spec:
        phylo.add_implied_spec_nodes(locus_tree, stree,
                                     locus_recon, locus_events)

    # init popsizes for locus tree: each locus node takes the population
    # size of the species node it reconciles to
    stree_popsizes = coal.init_popsizes(stree, n)
    popsizes = {}
    for node in locus_tree:
        popsizes[node.name] = stree_popsizes[locus_recon[node].name]

    # duploss probability
    util.tic("top")
    dl_prob = spidir.calc_birth_death_prior(locus_tree, stree, locus_recon,
                                            duprate, lossrate,
                                            maxdoom=maxdoom)
    util.toc()

    # daughters probability: log(.5) per duplication
    d_prob = dups * log(.5)

    # integrate over duplication times using sampling
    prob = 0.0
    for _ in range(nsamples):
        # sample duplication times
        locus_times = spidir.topology_prior.sample_dup_times(
            locus_tree, stree, locus_recon, duprate, lossrate, pretime,
            premean,
            events=locus_events)
        assert len(locus_times) == len(locus_tree.nodes), (
            len(locus_times), len(locus_tree.nodes))
        treelib.set_dists_from_timestamps(locus_tree, locus_times)

        # coal topology probability
        coal_prob = prob_coal_recon_topology(coal_tree, coal_recon,
                                             locus_tree, popsizes, daughters)

        # (the per-sample debug print of coal_prob was removed: it flooded
        # stdout and was Python-2-only syntax)
        prob += exp(coal_prob)

    return dl_prob + d_prob + util.safelog(prob / nsamples)
Esempio n. 13
0
def dup_loss_topology_prior(tree,
                            stree,
                            recon,
                            birth,
                            death,
                            maxdoom=20,
                            events=None):
    """
    Compute the log prior of a gene tree topology under the dup-loss model.

    tree    -- gene tree; implied speciation nodes are added to it while the
               prior is computed and single-child nodes are removed before
               returning
    stree   -- species tree
    recon   -- reconciliation mapping gene tree nodes to species tree nodes
    birth   -- birth rate handed to the birth-death helpers
    death   -- death rate handed to the birth-death helpers
    maxdoom -- upper limit of the summation index (also passed to
               calc_doom_table)
    events  -- events dict for the gene tree; labeled from recon when None
    """
    def gene2species(gene):
        # species name of the gene tree node named `gene`
        return recon[tree.nodes[gene]].name

    if events is None:
        events = phylo.label_events(tree, recon)

    # the leaf set is taken before implied speciation nodes are inserted
    orig_leaves = set(tree.leaves())
    phylo.add_implied_spec_nodes(tree, stree, recon, events)

    _pstree, _snodes, snode_index = spidir.make_ptree(stree)

    # get doomtable
    doomtable = calc_doom_table(stree, birth, death, maxdoom)

    logp = 0.0
    for node in tree:
        if events[node] != "spec":
            continue

        for schild in recon[node].children:
            matching = [
                child for child in node.children if recon[child] == schild
            ]

            if matching:
                top = matching[0]
                subleaves = get_sub_tree(top, schild, recon, events)
                nhist = birthdeath.num_topology_histories(top, subleaves)
                s = len(subleaves)
                thist = (stats.factorial(s) * stats.factorial(s - 1) /
                         2**(s - 1))

                # True ("internal") when none of the subleaves is a leaf of
                # the tree, False ("leaves") otherwise
                all_internal = len(set(subleaves) & orig_leaves) == 0
                logp += log(
                    num_redundant_topology(top, gene2species, subleaves,
                                           all_internal))
            else:
                nhist = 1.0
                thist = 1.0
                s = 0

            # accumulate the terms for i = 0 .. maxdoom
            t = 0
            for i in range(maxdoom + 1):
                t += (stats.choose(s + i, i) *
                      birthdeath.prob_birth_death1(s + i, schild.dist, birth,
                                                   death) *
                      exp(doomtable[snode_index[schild]])**i)

            logp += log(nhist) - log(thist) + log(t)

    # correct for renumbering
    logp -= log(num_redundant_topology(tree.root, gene2species))

    #phylo.removeImpliedSpecNodes(tree, recon, events)
    treelib.remove_single_children(tree)

    return logp