Exemple #1
0
def prob_locus_coal_recon_topology(tree, recon, locus_tree, n, daughters):
    """
    Log probability of the reconciled gene tree ('tree', 'recon') under the
    coalescent model on 'locus_tree'.

    tree       -- gene tree
    recon      -- reconciliation of 'tree' within 'locus_tree'
    locus_tree -- locus tree
    n          -- population sizes (handed to coal.init_popsizes)
    daughters  -- locus tree nodes in the daughters set

    The value starts from coal.pmrt(...) and, for every daughter, the
    bounded-MRCA log CDF of the subtree below that daughter is subtracted.
    Returns -util.INF as soon as one of those CDF terms is -util.INF.
    """

    # per-branch inputs shared by every term below
    branch_popsizes = coal.init_popsizes(locus_tree, n)
    branch_lineages = coal.count_lineages_per_branch(tree, recon, locus_tree)
    timestamps = treelib.get_tree_timestamps(locus_tree)

    def subtree_boundary(top):
        """Return (gene counts, boundary nodes) of the subtree under 'top',
        cutting the descent at true leaves and at other daughters."""
        counts = {}
        boundary = set()

        def descend(cur):
            if cur.is_leaf():
                counts[cur.name] = branch_lineages[cur][0]
                boundary.add(cur)
                return
            for kid in cur.children:
                if kid in daughters:
                    # a daughter branch contributes exactly one lineage
                    counts[kid.name] = 1
                    boundary.add(kid)
                else:
                    descend(kid)

        descend(top)
        return counts, boundary

    # topology term
    total = coal.pmrt(
        tree, recon, locus_tree, branch_popsizes, lineages=branch_lineages)

    # condition each daughter subtree on coalescing below its parent's time
    for dnode in daughters:
        counts, boundary = subtree_boundary(dnode)

        bounded = coal.cdf_mrca_bounded_multicoal(
            counts, timestamps[dnode.parent], locus_tree, branch_popsizes,
            sroot=dnode, sleaves=boundary, stimes=timestamps)

        if bounded == -util.INF:
            return -util.INF

        total -= bounded

    return total
def show_coal_track2(tree_track):
    """
    Draw a track of coalescence ages in a new summon window.

    tree_track -- iterable of ((start, end), tree) items

    For each tree, every node with more than one child adds a horizontal
    line from 'start' to 'end' at that node's timestamp.  The line color is
    looked up in a rainbow color map by the fraction of the tree's leaves
    found under the node.  A filled box in the background color, as tall as
    the oldest such node, is added with the lines.  A "click" hotspot over
    the whole track prints the mouse position and age.

    An empty 'tree_track' yields a window with a zero-sized hotspot rather
    than failing on an unbound 'end'.

    Returns the summon window.
    """

    win = summon.Window()

    bgcolor = (1, 1, 1, .1)
    cmap = util.rainbow_color_map(low=0.0, high=1.0)

    maxage = 0
    # right edge of the last region seen; stays 0 when the track is empty so
    # that the hotspot below can still be created
    track_end = 0
    for (start, end), tree in tree_track:
        print(start)
        track_end = end

        segments = []
        times = treelib.get_tree_timestamps(tree)
        nleaves = len(tree.leaves())
        region_maxage = 0
        for node in tree:
            if len(node.children) > 1:
                age = times[node]
                freq = len(node.leaves()) / float(nleaves)
                segments.extend(
                    [color(*cmap.get(freq)), start, age, end, age])
                region_maxage = max(region_maxage, age)
        win.add_group(group(lines(*segments), color(*bgcolor),
                            box(start, 0, end, region_maxage, fill=True)))
        maxage = max(maxage, region_maxage)

    def func():
        x, y = win.get_mouse_pos()
        print("pos=%s age=%f" % (util.int2pretty(int(x)), y))
    win.add_group(hotspot("click", 0, 0, track_end, maxage,
                          func))

    win.home("exact")

    return win
Exemple #3
0
def show_coal_track2(tree_track):
    """
    Draw a track of coalescence ages in a new summon window.

    tree_track -- iterable of ((start, end), tree) items

    For each tree, every node with more than one child adds a horizontal
    line from 'start' to 'end' at that node's timestamp.  The line color is
    looked up in a rainbow color map by the fraction of the tree's leaves
    found under the node.  A filled box in the background color, as tall as
    the oldest such node, is added with the lines.  A "click" hotspot over
    the whole track prints the mouse position and age.

    An empty 'tree_track' yields a window with a zero-sized hotspot rather
    than failing on an unbound 'end'.

    Returns the summon window.
    """

    win = summon.Window()

    bgcolor = (1, 1, 1, .1)
    cmap = util.rainbow_color_map(low=0.0, high=1.0)

    maxage = 0
    # right edge of the last region seen; stays 0 when the track is empty so
    # that the hotspot below can still be created
    track_end = 0
    for (start, end), tree in tree_track:
        print(start)
        track_end = end

        segments = []
        times = treelib.get_tree_timestamps(tree)
        nleaves = len(tree.leaves())
        region_maxage = 0
        for node in tree:
            if len(node.children) > 1:
                age = times[node]
                freq = len(node.leaves()) / float(nleaves)
                segments.extend(
                    [color(*cmap.get(freq)), start, age, end, age])
                region_maxage = max(region_maxage, age)
        win.add_group(group(lines(*segments), color(*bgcolor),
                            box(start, 0, end, region_maxage, fill=True)))
        maxage = max(maxage, region_maxage)

    def func():
        x, y = win.get_mouse_pos()
        print("pos=%s age=%f" % (util.int2pretty(int(x)), y))
    win.add_group(hotspot("click", 0, 0, track_end, maxage,
                          func))

    win.home("exact")

    return win
Exemple #4
0
    def test_cdf_bmc(self):

        # Check the bounded-multicoalescent MRCA CDF against simulation:
        # exp(cdf) should match the fraction of sampled multicoalescent
        # trees whose root timestamp falls below the bound.
        species_tree = treelib.parse_newick(
            "((A:1000, B:1000):500, (C:700, D:700):800);")
        popsize = 1000
        bound = 2000
        nsamples = 5000

        # one gene sampled in every leaf of the species tree
        one_per_leaf = dict.fromkeys(species_tree.leaf_names(), 1)
        log_cdf = coal.cdf_mrca_bounded_multicoal(
            one_per_leaf, bound, species_tree, popsize)
        expected = exp(log_cdf)

        # empirical estimate from sampled trees
        hits = 0
        for _ in xrange(nsamples):
            sampled, _recon = coal.sample_multicoal_tree(species_tree, popsize)
            root_time = treelib.get_tree_timestamps(sampled)[sampled.root]
            if root_time < bound:
                hits += 1
        observed = hits / float(nsamples)

        fequal(expected, observed, .05)
Exemple #5
0
def prob_locus_coal_recon_topology(tree, recon, locus_tree, n, daughters):
    """
    Evaluate the locus-coalescent topology probability through
    dlcoal.dlcoalc.prob_locus_coal_recon_topology.

    The gene tree, locus tree, reconciliation, population sizes, locus
    timestamps, and daughters are flattened into arrays that follow the node
    ordering returned by dlcoal.make_ptree, then marshalled with c_list.

    Returns whatever the dlcoalc routine returns.
    """

    # array forms of the two trees and of the reconciliation between them
    gene_ptree, gene_nodes, _gene_lookup = dlcoal.make_ptree(tree)
    locus_ptree, locus_nodes, locus_lookup = dlcoal.make_ptree(locus_tree)
    recon_array = dlcoal.make_recon_array(
        tree, recon, gene_nodes, locus_lookup)

    # population size of every locus node, in locus_nodes order
    popsize_by_name = compbio.coal.init_popsizes(locus_tree, n)
    popsize_array = []
    for lnode in locus_nodes:
        popsize_array.append(popsize_by_name[lnode.name])

    # timestamp of every locus node, in locus_nodes order
    locus_times = treelib.get_tree_timestamps(locus_tree)
    time_array = []
    for lnode in locus_nodes:
        time_array.append(locus_times[lnode])

    # daughters expressed as indices into the locus node array
    daughter_ids = [locus_lookup[dnode] for dnode in daughters]

    # NOTE(review): the literal 0 after the locus ptree is dictated by the
    # dlcoalc signature, which is not visible here -- confirm its meaning
    return dlcoal.dlcoalc.prob_locus_coal_recon_topology(
        c_list(c_int, gene_ptree), len(gene_nodes),
        c_list(c_int, recon_array),
        c_list(c_int, locus_ptree), 0, len(locus_nodes),
        c_list(c_double, popsize_array), c_list(c_double, time_array),
        c_list(c_int, daughter_ids), len(daughters))
Exemple #6
0
def dlcoal_sims(outdir, nsims, stree, n, duprate, lossrate,
                start=0,
                freq=1.0, freqdup=.05, freqloss=.05, steptime=None,
                nsteps=100,
                full_log=False,
                **options):
    """
    Run DLCoal simulations numbered 'start' .. 'nsims' - 1 and write each
    one under 'outdir'.

    outdir    -- output directory (file names come from phylo.phylofile)
    nsims     -- exclusive upper bound of the simulation index
    stree, n, duprate, lossrate, freq, freqdup, freqloss
              -- forwarded unchanged to sample_dlcoal_hem
    start     -- first simulation index
    steptime  -- forwarded to sample_dlcoal_hem; when None it defaults to
                 the timestamp of the species tree root divided by 'nsteps'
    nsteps    -- only used to derive the default 'steptime'
    full_log  -- when true, extinct lineages are kept (keep_extinct) and a
                 ".locus.info" file with a "hem" flag is written per
                 simulation
    options   -- extra keyword arguments forwarded to sample_dlcoal_hem
    """

    if steptime is None:
        stimes = treelib.get_tree_timestamps(stree)
        steptime = stimes[stree.root] / float(nsteps)

    for i in xrange(start, nsims):
        outfile = phylo.phylofile(outdir, str(i), "")
        util.makedirs(os.path.dirname(outfile))
        print("simulating %s" % (outfile,))

        # sample a new tree from DLCoal model
        coal_tree, ex = sample_dlcoal_hem(
            stree, n, duprate, lossrate,
            freq, freqdup, freqloss, steptime,
            keep_extinct=full_log,
            **options)

        # write datastructures
        dlcoal.write_dlcoal_recon(outfile, coal_tree, ex)
        if full_log:
            full_logfile = phylo.phylofile(outdir, str(i), ".locus.info")

            full_locus_tree = ex["full_locus_tree"]
            daughters = generate_extras(stree, full_locus_tree)["daughters"]

            # compute the flag before opening the file so that a failure
            # here does not leave an empty, unclosed log file behind
            hem = int(is_locus_tree_hemiplasy(full_locus_tree, daughters))

            with open(full_logfile, "w") as out:
                out.write("hem\t%d\n" % hem)
Exemple #7
0
def sample_multilocus_tree(stree, n, leaf_counts=None,
                           daughters=None,
                           namefunc=None):
    """
    Returns a gene tree from a multilocus coalescent process

    stree       -- tree within which the lineages coalesce
    n           -- population size (int or dict)
                   If n is a dict it must map from species name to
                   population size
    leaf_counts -- dict mapping leaf name to the number of genes in that
                   leaf (default: one gene per leaf of 'stree')
    daughters   -- collection of 'stree' nodes that each start their own
                   coalescent subtree and hand exactly one lineage to the
                   branch above them (default: no daughters)
    namefunc    -- function mapping a species name to a new gene name
                   (default: "<species>_<k>" with k counting from 1)

    Returns (tree, recon) as produced by coal.coal_cond_lineage_counts.
    """

    # a fresh set per call instead of a shared mutable default
    if daughters is None:
        daughters = set()

    # initialize vector for how many genes per extant species
    if leaf_counts is None:
        leaf_counts = dict((l, 1) for l in stree.leaf_names())

    # initialize function for generating new gene names
    if namefunc is None:
        spcounts = dict((l, 1) for l in stree.leaf_names())

        def namefunc(sp):
            name = sp + "_" + str(spcounts[sp])
            spcounts[sp] += 1
            return name

    stimes = treelib.get_tree_timestamps(stree)

    # initialize population sizes
    popsizes = coal.init_popsizes(stree, n)

    # init lineage counts: each entry is a two-item list; the first slot is
    # seeded with the leaf count for leaves and the second with 1 for
    # daughters, the remaining slots are filled in by the sampler below
    lineages = {stree.root: [None, None]}
    for node in stree.leaves():
        lineages[node] = [leaf_counts[node.name], None]
    for node in daughters:
        if node not in lineages:
            lineages[node] = [None, 1]
        else:
            lineages[node][1] = 1

    def get_subtree(node, leaves, leaf_counts2):
        """collects info of subtree rooted at node"""
        if node.is_leaf():
            leaves.add(node)
            leaf_counts2[node.name] = leaf_counts[node.name]
        else:
            for child in node.children:
                if child in daughters:
                    # a daughter closes off this subtree with one lineage
                    leaves.add(child)
                    leaf_counts2[child.name] = 1
                else:
                    get_subtree(child, leaves, leaf_counts2)

    # loop through subtrees
    for snode in chain(daughters, [stree.root]):
        # determine leaves of the coal subtree
        leaves = set()
        leaf_counts2 = {}
        get_subtree(snode, leaves, leaf_counts2)

        # subtrees below a daughter are bounded by the parent's time; the
        # root subtree is unbounded
        if snode.parent:
            T = stimes[snode.parent]
        else:
            T = None

        # calc table
        prob_counts = coal.calc_prob_counts_table(
            leaf_counts2, T, stree, popsizes,
            sroot=snode, sleaves=leaves, stimes=stimes)

        # sample lineage counts
        try:
            coal.sample_lineage_counts(snode, leaves, popsizes, stimes, T,
                                       lineages, prob_counts)
        except Exception:
            # dump the state that led to the failure, then propagate it
            print(snode.name)
            treelib.draw_tree_names(stree, maxlen=8)
            util.print_dict(lineages, key=lambda x: x.name)
            raise

    # sample coal times
    tree, recon = coal.coal_cond_lineage_counts(
        lineages, stree.root, set(stree.leaves()),
        popsizes, stimes, None, namefunc)

    return tree, recon
Exemple #8
0
def prob_dlcoal_recon_topology(coal_tree, coal_recon,
                               locus_tree, locus_recon, locus_events,
                               daughters,
                               stree, n, duprate, lossrate,
                               pretime=None, premean=None,
                               nsamples=100,
                               add_spec=True, info=None):
    """
    Log probability of a reconciled gene tree topology in the DLCoal model.

    coal_tree    -- coalescent tree
    coal_recon   -- reconciliation of coalescent tree to locus tree
    locus_tree   -- locus tree (has dup-loss)
    locus_recon  -- reconciliation of locus tree to species tree
    locus_events -- events dict for locus tree
    daughters    -- daughters set, forwarded to the sampling routine
    stree        -- species tree
    n            -- population sizes in species tree
    duprate      -- duplication rate
    lossrate     -- loss rate
    nsamples     -- number of samples used for the duplication-time
                    integration; log(nsamples) is subtracted from the sum
    add_spec     -- accepted but not used by this function
    info         -- optional dict; when given it receives the keys
                    "duploss_prob", "daughters_prob", "coal_prob", "prob"

    You must also specify one of the following
    pretime      -- starting time before species tree
    premean      -- mean starting time before species tree

    Returns duploss term + daughters term + sampled coalescent term
    - log(nsamples).
    """

    # each locus tree branch inherits the population size of the species
    # branch it reconciles to
    species_popsizes = coal.init_popsizes(stree, n)
    popsizes = dict(
        (lnode.name, species_popsizes[locus_recon[lnode].name])
        for lnode in locus_tree)

    # duplication-loss term
    dl_prob = duploss.prob_dup_loss(
        locus_tree, stree, locus_recon, locus_events,
        duprate, lossrate)

    # daughters term: a factor of one half per duplication
    d_prob = phylo.count_dup(locus_tree, locus_events) * log(.5)

    # coalescent term, integrating over duplication times by sampling
    stimes = treelib.get_tree_timestamps(stree)
    prob = prob_locus_coal_recon_topology_samples(
        coal_tree, coal_recon,
        locus_tree, locus_recon, locus_events, popsizes,
        stree, stimes,
        daughters, duprate, lossrate, nsamples,
        pretime, premean)

    total = dl_prob + d_prob + prob - log(nsamples)

    # expose the individual terms to the caller on request
    if info is not None:
        info["duploss_prob"] = dl_prob
        info["daughters_prob"] = d_prob
        info["coal_prob"] = prob
        info["prob"] = total

    return total
Exemple #9
0
def sample_dup_times(tree, stree, recon, birth, death,
                     pretime=None, premean=None, events=None):
    """
    Sample duplication times for a gene tree in the dup-loss model

    tree    -- gene tree
    stree   -- species tree
    recon   -- reconciliation mapping nodes of 'tree' to nodes of 'stree'
    birth   -- forwarded to sample_dup_times_subtree (presumably the
               duplication rate)
    death   -- forwarded to sample_dup_times_subtree (presumably the
               loss rate)
    pretime -- time above the species tree root at which a pre-speciation
               root duplication subtree starts
    premean -- mean of the exponential used to draw 'pretime' when
               'pretime' is None
    events  -- dict of node events ("spec", "dup", "gene"); computed with
               phylo.label_events when None

    Returns the dict of node times.  Speciation and gene nodes are set
    here; duplication subtrees are delegated to sample_dup_times_subtree,
    which receives the same dict.

    Raises Exception when the root is a pre-speciation duplication and
    neither 'pretime' nor 'premean' is given.
    """

    if events is None:
        events = phylo.label_events(tree, recon)

    # get species tree timestamps
    stimes = treelib.get_tree_timestamps(stree)

    # init timestamps for gene tree
    times = {}

    # set pretimes
    if events[tree.root] != "spec":
        if recon[tree.root] != stree.root:
            # tree root is a dup within species tree
            snode = recon[tree.root]
            start_time = stimes[snode.parent]
            time_span = start_time - stimes[snode]
        else:
            # tree root is a pre-spec dup
            if pretime is None:
                if premean is None:
                    raise Exception("must set pre-mean")

                # resample until the drawn pretime is nonzero; 1.0 keeps
                # the rate a true division even when premean is an int
                # (Python 2 integer division would yield a rate of 0)
                pretime = 0.0
                while pretime == 0.0:
                    pretime = random.expovariate(1.0 / premean)
            start_time = stimes[stree.root] + pretime
            time_span = pretime

        sample_dup_times_subtree(times, start_time, time_span, tree.root,
                                 recon, events,
                                 stree, birth, death)

    # set times
    for node in tree.preorder():
        if events[node] == "spec":
            # set speciation time
            times[node] = stimes[recon[node]]

        elif (events[node] == "dup" and
              node.parent is not None and
              recon[node] != recon[node.parent]):
            # set duplication times within duplication subtree
            # node is duproot: the first dup on a new species branch
            snode = recon[node]
            start_time = stimes[snode.parent]
            time_span = start_time - stimes[snode]
            sample_dup_times_subtree(times, start_time, time_span,
                                     node,
                                     recon, events,
                                     stree, birth, death)
        elif events[node] == "gene":
            times[node] = 0.0

    return times
Exemple #10
0
def sample_dup_times(tree, stree, recon, birth, death, pretime=None, premean=None, events=None):
    """
    Sample duplication times for a gene tree in the dup-loss model

    NOTE: Implied speciation nodes must be present

    tree    -- gene tree
    stree   -- species tree
    recon   -- reconciliation mapping nodes of 'tree' to nodes of 'stree'
    birth   -- forwarded to sample_dup_times_subtree (presumably the
               duplication rate)
    death   -- forwarded to sample_dup_times_subtree (presumably the
               loss rate)
    pretime -- time above the species tree root at which a pre-speciation
               root duplication subtree starts
    premean -- mean of the exponential used to draw 'pretime' when
               'pretime' is None
    events  -- dict of node events ("spec", "dup", "gene"); computed with
               phylo.label_events when None

    Returns the dict of node times.  Speciation and gene nodes are set
    here; duplication subtrees are delegated to sample_dup_times_subtree,
    which receives the same dict.

    Raises Exception when the root is a pre-speciation duplication and
    neither 'pretime' nor 'premean' is given.
    """

    if events is None:
        events = phylo.label_events(tree, recon)

    # get species tree timestamps
    stimes = treelib.get_tree_timestamps(stree)

    # init timestamps for gene tree
    times = {}

    # set pretimes
    if events[tree.root] != "spec":
        if recon[tree.root] != stree.root:
            # tree root is a dup within species tree
            snode = recon[tree.root]
            start_time = stimes[snode.parent]
            time_span = snode.dist
        else:
            # tree root is a pre-spec dup
            if pretime is None:
                if premean is None:
                    raise Exception("must set pre-mean")

                # resample until the drawn pretime is nonzero; 1.0 keeps
                # the rate a true division even when premean is an int
                # (Python 2 integer division would yield a rate of 0)
                pretime = 0.0
                while pretime == 0.0:
                    pretime = random.expovariate(1.0 / premean)
            start_time = stimes[stree.root] + pretime
            time_span = pretime

        sample_dup_times_subtree(times, start_time, time_span, tree.root, recon, events, stree, birth, death)

    # set times
    for node in tree.preorder():
        if events[node] == "spec":
            # set speciation time
            start_time = times[node] = stimes[recon[node]]
            if node.parent:
                # report (but do not fail on) a node timed above its parent
                if times[node] > times[node.parent]:
                    print("bad %s" % (node.name,))

            # set duplication times within duplication subtree
            for duproot in node.children:
                if events[duproot] == "dup":
                    snode = recon[duproot]
                    time_span = snode.dist

                    sample_dup_times_subtree(times, start_time, time_span, duproot, recon, events, stree, birth, death)
        elif events[node] == "gene":
            times[node] = 0.0

    return times
Exemple #11
0
def sample_dup_times(tree,
                     stree,
                     recon,
                     birth,
                     death,
                     pretime=None,
                     premean=None,
                     events=None):
    """
    Sample duplication times for a gene tree in the dup-loss model

    NOTE: Implied speciation nodes must be present

    tree    -- gene tree
    stree   -- species tree
    recon   -- reconciliation mapping nodes of 'tree' to nodes of 'stree'
    birth   -- forwarded to sample_dup_times_subtree (presumably the
               duplication rate)
    death   -- forwarded to sample_dup_times_subtree (presumably the
               loss rate)
    pretime -- time above the species tree root at which a pre-speciation
               root duplication subtree starts
    premean -- mean of the exponential used to draw 'pretime' when
               'pretime' is None
    events  -- dict of node events ("spec", "dup", "gene"); computed with
               phylo.label_events when None

    Returns the dict of node times.  Speciation and gene nodes are set
    here; duplication subtrees are delegated to sample_dup_times_subtree,
    which receives the same dict.

    Raises Exception when the root is a pre-speciation duplication and
    neither 'pretime' nor 'premean' is given.
    """
    if events is None:
        events = phylo.label_events(tree, recon)

    # get species tree timestamps
    stimes = treelib.get_tree_timestamps(stree)

    # init timestamps for gene tree
    times = {}

    # set pretimes
    if events[tree.root] != "spec":
        if recon[tree.root] != stree.root:
            # tree root is a dup within species tree
            snode = recon[tree.root]
            start_time = stimes[snode.parent]
            time_span = snode.dist
        else:
            # tree root is a pre-spec dup
            if pretime is None:
                if premean is None:
                    raise Exception("must set pre-mean")

                # resample until the drawn pretime is nonzero; 1.0 keeps
                # the rate a true division even when premean is an int
                # (Python 2 integer division would yield a rate of 0)
                pretime = 0.0
                while pretime == 0.0:
                    pretime = random.expovariate(1.0 / premean)
            start_time = stimes[stree.root] + pretime
            time_span = pretime

        sample_dup_times_subtree(times, start_time, time_span, tree.root,
                                 recon, events, stree, birth, death)

    # set times
    for node in tree.preorder():
        if events[node] == "spec":
            # set speciation time
            start_time = times[node] = stimes[recon[node]]
            if node.parent:
                # report (but do not fail on) a node timed above its parent
                if times[node] > times[node.parent]:
                    print("bad %s" % (node.name,))

            # set duplication times within duplication subtree
            for duproot in node.children:
                if events[duproot] == "dup":
                    snode = recon[duproot]
                    time_span = snode.dist

                    sample_dup_times_subtree(times, start_time, time_span,
                                             duproot, recon, events, stree,
                                             birth, death)
        elif events[node] == "gene":
            times[node] = 0.0

    return times
Exemple #12
0
        lossrate = 0.12  # events/lineages/myr
        gentime = 0.1  # yr / gen
        popsizes = 2 * 1e6 * gentime / 1e6  # "normalized popsize" = 2 (diploid) * Ne * yr/gen * myr/yr (first 1e6 is the pop size)
        subrate = 5e-9 / gentime * 1e6  # sub/site/myr = sub/site/gen * gen/yr * yr/myr\

        rates, freqs, alphas = pllprob.optimize_parameters(alnfile,
                                                           partfile,
                                                           coal_tree_treefix,
                                                           threads=1,
                                                           seed=ALIGNMENT_SEED,
                                                           eps=1)

        nsamples_coal = 1
        nsamples_locus = 1

        times = treelib.get_tree_timestamps(stree)
        pretime = None
        premean = 0.5 * times[stree.root]

        p_raxml = prob_locus_gene_species_alignment_recon(alnfile,
                                                          partfile,
                                                          stree,
                                                          popsizes,
                                                          duprate,
                                                          lossrate,
                                                          subrate,
                                                          pretime,
                                                          premean,
                                                          coal_tree_raxml,
                                                          coal_recon_raxml,
                                                          nsamples_coal,