Example #1
0
def prob_locus_coal_recon_topology_samples(
        coal_tree, coal_recon,
        locus_tree, locus_recon, locus_events, popsizes,
        stree, stimes,
        daughters, duprate, lossrate, nsamples,
        pretime=None, premean=None):
    """
    Log probability of a coalescent tree topology and its reconciliation
    within a locus tree, averaged over sampled locus-tree duplication times.

    If `dlcoalc` is truthy the work is delegated to
    coal.prob_locus_coal_recon_topology_samples (the C implementation).
    One set of duplication times is still sampled and written onto
    `locus_tree` beforehand so that it carries reasonable branch lengths
    for logging.

    Otherwise the pure-Python fallback is used: `nsamples` times, sample
    duplication times with duploss.sample_dup_times, write them onto
    `locus_tree` as branch lengths, and evaluate
    prob_locus_coal_recon_topology.  The result is util.safelog of the mean
    of the exponentiated per-sample values.

    `stimes` is only forwarded to the C implementation; the Python fallback
    does not read it.  `locus_tree` is modified in place (its branch lengths
    are overwritten by every sample).  The Python fallback divides by
    `nsamples`, so it must be non-zero there.
    """

    def _resample_locus_branch_lengths():
        # draw duplication times and store them on the locus tree
        sampled_times = duploss.sample_dup_times(
            locus_tree, stree, locus_recon, duprate, lossrate, pretime,
            premean,
            events=locus_events)
        treelib.set_dists_from_timestamps(locus_tree, sampled_times)

    if dlcoalc:
        # sample some reasonable branch lengths just for logging purposes
        _resample_locus_branch_lengths()

        # hand the actual computation to the C code
        return coal.prob_locus_coal_recon_topology_samples(
            coal_tree, coal_recon,
            locus_tree, locus_recon, locus_events, popsizes,
            stree, stimes,
            daughters, duprate, lossrate, nsamples, pretime, premean)

    # python backup: Monte Carlo average over sampled duplication times
    total = 0.0
    for _ in xrange(nsamples):
        _resample_locus_branch_lengths()

        # log probability of the coal topology for this sample
        sample_logp = prob_locus_coal_recon_topology(
            coal_tree, coal_recon, locus_tree, popsizes, daughters)
        total += exp(sample_logp)

    return util.safelog(total / nsamples)
Example #2
0
def prob_dlcoal_recon_topology(coal_tree, coal_recon,
                               locus_tree, locus_recon, locus_events,
                               daughters,
                               stree, n, duprate, lossrate,
                               pretime=None, premean=None,
                               maxdoom=20, nsamples=100,
                               add_spec=True):
    """
    Log probability of a reconciled gene tree in the DLCoal model.

    coal_tree    -- coalescent tree
    coal_recon   -- reconciliation of coalescent tree to locus tree
    locus_tree   -- locus tree (has dup-loss); modified in place
    locus_recon  -- reconciliation of locus tree to species tree
    locus_events -- events dict for locus tree
    daughters    -- daughter nodes, forwarded to prob_coal_recon_topology
    stree        -- species tree
    n            -- population sizes in species tree
                    (expanded with coal.init_popsizes)
    duprate      -- duplication rate
    lossrate     -- loss rate
    maxdoom      -- forwarded to spidir.calc_birth_death_prior
    nsamples     -- number of duplication-time samples to average over
    add_spec     -- if true, call phylo.add_implied_spec_nodes on the
                    locus tree before computing anything else

    You must also specify one of the following
    pretime      -- starting time before species tree
    premean      -- mean starting time before species tree

    The returned value is the sum of three log terms: the birth-death prior
    of the locus tree, dups * log(.5) for the daughter choices, and the
    safelog of the sampled mean of the coalescent topology probability.

    Note: the locus tree must have implied speciation nodes present; they
    are inserted here when add_spec is true.  Each sampled coal log
    probability is printed to stdout.
    """

    # duplications are counted before implied speciation nodes are inserted
    ndups = phylo.count_dup(locus_tree, locus_events)

    if add_spec:
        # ensure implicit speciations are present
        phylo.add_implied_spec_nodes(locus_tree, stree,
                                     locus_recon, locus_events)

    # each locus-tree branch gets the popsize of the species it maps to
    species_popsizes = coal.init_popsizes(stree, n)
    popsizes = dict(
        (lnode.name, species_popsizes[locus_recon[lnode].name])
        for lnode in locus_tree)

    # duploss probability (timed)
    util.tic("top")
    dl_prob = spidir.calc_birth_death_prior(locus_tree, stree, locus_recon,
                                            duprate, lossrate,
                                            maxdoom=maxdoom)
    util.toc()

    # daughters probability
    d_prob = ndups * log(.5)

    # integrate over duplication times using sampling
    total = 0.0
    for _ in xrange(nsamples):
        locus_times = spidir.topology_prior.sample_dup_times(
            locus_tree, stree, locus_recon, duprate, lossrate, pretime,
            premean,
            events=locus_events)

        # every locus-tree node must have received a timestamp
        ntimes = len(locus_times)
        nnodes = len(locus_tree.nodes)
        assert ntimes == nnodes, (ntimes, nnodes)
        treelib.set_dists_from_timestamps(locus_tree, locus_times)

        # coal topology log probability for this sample
        coal_prob = prob_coal_recon_topology(coal_tree, coal_recon,
                                             locus_tree, popsizes, daughters)
        total += exp(coal_prob)
        print(coal_prob)

    return dl_prob + d_prob + util.safelog(total / nsamples)
Example #3
0
def prob_gene_species_alignment_recon(alnfile,
                                      partfile,
                                      stree,
                                      popsizes,
                                      duprate,
                                      lossrate,
                                      subrate,
                                      beta,
                                      pretime,
                                      premean,
                                      coal_tree,
                                      coal_recon,
                                      nsamples_coal,
                                      locus_tree,
                                      locus_recon,
                                      nsamples_locus,
                                      daughters,
                                      rates,
                                      freqs,
                                      alphas,
                                      threads=1,
                                      seed=ALIGNMENT_SEED,
                                      eps=0.1,
                                      info=None):
    """
    Evaluate terms that depend on T^G and R^G.

    That is, fix T^L, R^L, and daughters and evaluate the double integral:
    int int P(t^L | T^L, R^L, S, theta)
            * P(T^G, R^G, t^G | t^L, T^L, daughters, R^L, theta)
            * P(A | T^G, t^G) dt^L dt^G

    by nested Monte Carlo sampling: the outer loop samples locus-tree times
    t^L, the inner loop samples coalescent times t^G.  This is the
    probability used in the searching process.

    alnfile           -- alignment file
    partfile          -- partition file
    stree             -- species tree
    popsizes          -- population sizes in species tree
    duprate           -- duplication rate
    lossrate          -- loss rate
    subrate           -- substitution rate; coal tree branch lengths are
                         multiplied by it before the alignment likelihood
    beta              -- regularization parameter; multiplies the log
                         alignment probability
    pretime           -- starting time before species tree
    premean           -- mean starting time before species tree

    coal_tree         -- coalescent tree (branch lengths overwritten)
    coal_recon        -- reconciliation of coalescent tree to locus tree
    nsamples_coal     -- number of times to sample coal times t^G
    locus_tree        -- locus tree (has dup-loss; branch lengths overwritten)
    locus_recon       -- reconciliation of locus tree to species tree
    nsamples_locus    -- number of times to sample the locus tree times t^L
    daughters         -- daughter nodes

    rates, freqs, alphas  -- substitution-model parameters supplied by the
                             caller and forwarded to prob_alignment
    threads, seed, eps    -- forwarded unchanged to prob_alignment
    info              -- optional dict; when given and at least one locus
                         sample was drawn, receives "topology_prob" and
                         "alignment_prob" of the last t^L sample and
                         "coal_prob" (the returned value)

    Returns the log of the Monte Carlo estimate.  If nsamples_locus is 0 no
    sample is drawn and 0.0 is returned.
    """

    locus_events = phylo.label_events(locus_tree, locus_recon)

    # log value of the inner integral, one entry per sampled t^L
    double_integral_list = []
    double_integral = 0.0

    util.tic("recon prob")
    for _ in xrange(nsamples_locus):

        # sample t^L (original notes: the unit should be myr)
        locus_times = duploss.sample_dup_times(locus_tree,
                                               stree,
                                               locus_recon,
                                               duprate,
                                               lossrate,
                                               pretime,
                                               premean,
                                               events=locus_events)
        treelib.set_dists_from_timestamps(locus_tree, locus_times)

        # calculate P(T^G, R^G | T^L, t^L, daughters, theta)
        topology_prob = prob_locus_coal_recon_topology(coal_tree, coal_recon,
                                                       locus_tree, popsizes,
                                                       daughters)

        coal_prob = 0.0
        alignment_prob_MonteCarlo = 0.0

        # Check the probability of the lineage counts on every locus branch.
        # The counts do not depend on the node being inspected, so they are
        # computed once per locus sample rather than once per node.
        lineages = coal.count_lineages_per_branch(coal_tree, coal_recon,
                                                  locus_tree)
        zero_lineage_prob = False
        for lnode in locus_tree:
            bottom_num, top_num = lineages[lnode]
            # the root branch has no parent and is treated as infinitely long
            T = lnode.dist if lnode.parent else util.INF

            if prob_coal_counts(bottom_num, top_num, T, popsizes) == 0.0:
                # one impossible branch makes the whole sample impossible
                zero_lineage_prob = True
                break

        if zero_lineage_prob:
            # if any lineage probability is zero, coal_prob is zero
            coal_prob = -float("inf")
        else:
            # For this fixed t^L, sample t^G and integrate the alignment
            # probability by Monte Carlo.
            alignment_prob_list = []
            for _ in xrange(nsamples_coal):

                # Sample coal times and set the coal_tree accordingly.
                # Original notes: locus tree branch lengths are in myr, so
                # popsizes must be scaled to the same time unit.
                try:
                    sample_coal_times_topology(coal_tree, coal_recon,
                                               locus_tree, popsizes)
                except (ZeroDivisionError, ValueError):
                    # bad sample: contributes nothing to the sum but still
                    # counts in the nsamples_coal denominator below
                    util.log("bad sample")
                    continue

                # convert branch lengths from myr to sub/site
                for node in coal_tree:
                    node.dist *= subrate

                # NOTE(review): debug output kept from the original; it
                # fires once per coal sample -- consider util.log instead.
                print(beta)

                # regularized log probability of observing the alignment
                alignment_prob = beta * prob_alignment(alnfile,
                                                       partfile,
                                                       coal_tree,
                                                       rates,
                                                       freqs,
                                                       alphas,
                                                       threads=threads,
                                                       seed=seed,
                                                       eps=eps)
                alignment_prob_list.append(alignment_prob)

            # log of the mean of exp(alignment_prob) over the coal samples
            if not alignment_prob_list:
                # all bad samples
                alignment_prob_MonteCarlo = -util.INF
            else:
                alignment_prob_MonteCarlo = log_sum_exp(
                    alignment_prob_list) - log(nsamples_coal)

            # P(T^G, R^G | T^L, t^L, daughters, theta)
            #   * int P(t^G | ~) * P(A | T^G, t^G) dt^G   (log scale)
            coal_prob += topology_prob + alignment_prob_MonteCarlo

        double_integral_list.append(coal_prob)

    if double_integral_list:
        # Log of the mean over the t^L samples.  Computed once after the
        # loop; earlier per-iteration values were only ever overwritten.
        double_integral = log_sum_exp(double_integral_list) - log(
            nsamples_locus)

        # logging info
        if info is not None:
            info["topology_prob"] = topology_prob  # last sample of t^L
            # last sample of t^L, averaged over t^G
            info["alignment_prob"] = alignment_prob_MonteCarlo
            info["coal_prob"] = double_integral

    util.toc()
    return double_integral