Esempio n. 1
0
def sim_DLILS_gene_tree(stree, popsize, freq, dr, lr, freqdup, freqloss, forcetime):
    """
    Simulate a gene (locus) tree on a species tree under a relaxed fixation
    assumption, recording an event log on every gene-tree node.

    Some simplifying assumptions are made for this version of the simulator:
      1) All branches of the species tree have the same population size
      2) All branches of the species tree have the same duplication rate
      3) All branches of the species tree have the same loss rate
      4) All branches of the species tree have the same duplication effect
      5) All branches of the species tree have the same loss effect
      6) All branches of the species tree have the same time between forced
           frequency changes
      7) There is a single allele at the root of the species tree.
    A duplication/loss effect is the change in frequency for either event.
      Appropriate default values for these effects may need to be determined.
    Future iterations should remove these assumptions by incorporating
      dictionaries to allow values for each branch.

    Parameters:
      stree     -- the species tree to simulate on
      popsize   -- the population size (assmpt. 1); must be > 0
      freq      -- the allele frequency at the root (assmpt. 7); in [0, 1]
      dr        -- the duplication rate (in events/myr/indiv(?); assmpt. 2)
      lr        -- the loss rate (in events/myr/indiv(?); assmpt. 3)
      freqdup   -- the duplication effect: starting frequency of a daughter
                   lineage (assmpt. 4); in [0, 1]
      freqloss  -- the loss effect: frequency subtracted by each loss
                   (assmpt. 5); in [0, 1]
      forcetime -- the maximum time between frequency changes (assmpt. 6);
                   must be > 0 whenever dr + lr > 0

    Returns:
      (gtree, extras), where gtree is the simulated tree pruned to lineages
      whose leaf frequency is > 0 and extras is whatever
      generate_extras(stree, gtree) produces (per the original author: a
      reconciliation dictionary and an events dictionary).

      NOTE(review): when dr + lr <= 0 only a copy of stree is returned (no
      extras tuple).  This differs from the normal return shape and is kept
      for backward compatibility; callers must handle both.

    Raises:
      AssertionError if a parameter is out of range.
    """

    ## sanity checks before running the simulator; may be removed or relaxed
    treelib.assert_tree(stree)
    assert popsize > 0
    assert 0.0 <= freq <= 1.0
    assert dr >= 0.0
    assert lr >= 0.0
    assert 0.0 <= freqdup <= 1.0
    assert 0.0 <= freqloss <= 1.0
    assert forcetime >= 0.0

    if dr + lr <= 0.0:
        return stree.copy() # no duplications or losses => stree is final tree

    # A zero forcetime schedules a frequency update that never advances the
    #  walk, so the simulation could not terminate on any branch of positive
    #  length.  Raised explicitly (rather than with a bare assert) so that
    #  the guard survives `python -O`.
    if not forcetime > 0.0:
        raise AssertionError("forcetime must be positive when dr + lr > 0")

    # note: the use of < instead of <= is intentional
    #  if lr==0, duprate/fullrate==1, and random() returns from [0.0,1.0)
    def event_is_dup(duprate, fullrate):
        return random.random() < duprate / fullrate


    def sim_walk(gtree, snode, gnode, p, s_walk_time=0.0, g_walk_time=0.0,
                 time_until_force=forcetime, eventlog=None):
        """
        Walk down species branch snode, growing gtree below gnode.

        gtree            -- gene tree being built (mutated in place)
        snode            -- species-tree node whose branch is being walked
        gnode            -- gene-tree node the next created node attaches to
        p                -- current frequency of the lineage
        s_walk_time      -- time already walked along snode's branch
        g_walk_time      -- time walked since gnode (becomes the next node's
                            dist)
        time_until_force -- time left until the next forced frequency update
        eventlog         -- log of events on the current gene-tree branch; a
                            fresh list is used when omitted

        Each eventlog entry has the form
          (time_on_branch, event_type, frequency, species_node),
        where event_type is one of {'extinction', 'frequency', 'speciation',
        'duplication', 'daughter', 'loss', 'root', 'gene'} ('root' is only
        added by the caller, never by sim_walk), frequency is the branch
        frequency at the event time, and species_node is the name of the
        species-tree node on whose branch the event occurs.

        Frequency updates and losses stay on the same gene-tree branch, so
        they are handled by looping; only duplications and speciations
        recurse.  This keeps the recursion depth bounded by the tree depth
        instead of by the number of events.
        """
        if eventlog is None:
            # never use a mutable default: it would be shared between calls
            eventlog = []

        while True:
            if p <= 0.0:
                ## EXTINCTION EVENT
                # gnode is 'parent' of the extinct node created here
                new_gnode = treelib.TreeNode(gtree.new_name())
                new_gnode.dist = g_walk_time
                new_gnode.data['freq'] = 0.0
                gtree.add_child(gnode, new_gnode)
                eventlog.append((g_walk_time, 'extinction', 0.0, snode.name))
                new_gnode.data['log'] = eventlog
                return

            p = min(p, 1.0) # sanity check
            eff_dr = dr * p # * popsize #??
            eff_lr = lr * p # * popsize #??
            eff_bothr = eff_dr + eff_lr
            event_time = stats.exponentialvariate(eff_bothr)
            remaining_s_dist = snode.dist - s_walk_time

            if event_time >= min(time_until_force, remaining_s_dist):
                # no D/L event before the next forced update / end of branch
                if time_until_force < remaining_s_dist:
                    ## FREQUENCY UPDATE EVENT
                    # TODO: if we decide not to reset time_until_force at
                    #  speciation events, the frequency sampling will need
                    #  to be altered in some form
                    p = coal.sample_freq_CDF(p, popsize, forcetime)
                    s_walk_time += time_until_force
                    g_walk_time += time_until_force
                    eventlog.append((g_walk_time, 'frequency', p, snode.name))
                    # continue the walk with a reset forcetime
                    time_until_force = forcetime
                    continue

                ## SPECIATION EVENT
                # the root is recognised by its log still ending in 'root'
                if gnode.data['log'][-1][1] != 'root':
                    newp = coal.sample_freq_CDF(p, popsize, remaining_s_dist)
                    # create new_gnode for this event
                    new_gnode = treelib.TreeNode(gtree.new_name())
                    new_g_walk_time = g_walk_time + remaining_s_dist
                    new_gnode.dist = new_g_walk_time
                    new_gnode.data['freq'] = newp
                    gtree.add_child(gnode, new_gnode)
                    if snode.is_leaf():
                        # end of walk on this species lineage
                        eventlog.append(
                            (new_g_walk_time, 'gene', newp, snode.name))
                        new_gnode.data['log'] = eventlog
                    else:
                        eventlog.append(
                            (new_g_walk_time, 'speciation', newp, snode.name))
                        new_gnode.data['log'] = eventlog
                        # TODO: if we decide not to reset time_until_force
                        #  at speciation events, these calls need updating
                        for schild in snode.children:
                            sim_walk(gtree, schild, new_gnode, newp,
                                     eventlog=[])
                else:
                    # gnode is the root: no new node, the speciation is
                    #  appended to the root's own log.
                    # NOTE: events accumulated in eventlog on the root
                    #  branch so far are not copied into the root's log.
                    root_log = gnode.data['log']
                    root_log.append((0.0, 'speciation', p, snode.name))
                    for schild in snode.children:
                        sim_walk(gtree, schild, gnode, p, eventlog=[])
                return

            # process D/L event
            # no WF updates for these events (modelling decision)
            s_walk_time += event_time
            g_walk_time += event_time
            time_until_force -= event_time

            if event_is_dup(eff_dr, eff_bothr):
                ## DUPLICATION EVENT
                new_gnode = treelib.TreeNode(gtree.new_name())
                new_gnode.dist = g_walk_time
                new_gnode.data['freq'] = p
                gtree.add_child(gnode, new_gnode)
                eventlog.append((g_walk_time, 'duplication', p, snode.name))
                new_gnode.data['log'] = eventlog
                # recurse on remainder of original branch
                sim_walk(gtree, snode, new_gnode, p,
                         s_walk_time=s_walk_time,
                         time_until_force=time_until_force,
                         eventlog=[])
                # recurse on dup tree with correct starting frequency; the
                #  'daughter' entry allows daughter detection later
                sim_walk(gtree, snode, new_gnode, freqdup,
                         s_walk_time=s_walk_time,
                         time_until_force=time_until_force,
                         eventlog=[(0.0, 'daughter', freqdup, snode.name)])
                return

            ## LOSS EVENT
            p = max(p - freqloss, 0.0) # sanity check
            eventlog.append((g_walk_time, 'loss', p, snode.name))
            # loop: an extinction is detected at the top if p reached 0


    # main code

    # create new gene tree and simulate its evolution
    gtree = treelib.Tree()
    gtree.make_root()
    gtree.root.dist = 0.0
    gtree.root.data['freq'] = freq
    root_event = (0.0, 'root', freq, stree.root.name)
    gtree.root.data['log'] = [root_event]
    sim_walk(gtree, stree.root, gtree.root, freq) # should mutate gtree

    # remove dead branches (leaves with zero frequency), then single children
    extant_leaves = [leaf.name for leaf in gtree.leaves()
                     if leaf.data['freq'] > 0.0]
    gtree = treelib.subtree_by_leaf_names(gtree, extant_leaves,
                                          keep_single=True)
    remove_single_children(gtree) # allows for correct logging of events
    extras = generate_extras(stree, gtree)
    return gtree, extras
Esempio n. 2
0
def sim_tree(stree, popsize, freq, dr, lr, freqdup, freqloss, forcetime):
    """
    Simulate a gene tree on a species tree under a relaxed fixation
    assumption.

    Some simplifying assumptions are made for this version of the simulator:
      1) All branches of the species tree have the same population size
      2) All branches of the species tree have the same duplication rate
      3) All branches of the species tree have the same loss rate
      4) All branches of the species tree have the same duplication effect
      5) All branches of the species tree have the same loss effect
      6) All branches of the species tree have the same time between forced
           frequency changes
      7) There is a single allele at the root of the species tree.
    A duplication/loss effect is the change in frequency for either event.
      Appropriate default values for these effects may need to be determined.
    Future iterations should remove these assumptions by incorporating
      dictionaries to allow values for each branch.

    Parameters:
      stree     -- the species tree to simulate on
      popsize   -- the population size (assmpt. 1); must be > 0
      freq      -- the allele frequency at the root (assmpt. 7); in [0, 1]
      dr        -- the duplication rate (in events/myr/indiv(?); assmpt. 2)
      lr        -- the loss rate (in events/myr/indiv(?); assmpt. 3)
      freqdup   -- the duplication effect: starting frequency of a duplicate
                   lineage (assmpt. 4); in [0, 1]
      freqloss  -- the loss effect: frequency subtracted by each loss
                   (assmpt. 5); in [0, 1]
      forcetime -- the maximum time between frequency changes (assmpt. 6);
                   must be > 0 whenever dr + lr > 0

    Returns:
      The simulated gene tree restricted to leaves whose frequency is > 0,
      or a copy of stree when dr + lr <= 0.

    Raises:
      AssertionError if a parameter is out of range.
    """

    ## sanity checks before running the simulator; may be removed or relaxed
    treelib.assert_tree(stree)
    assert popsize > 0
    assert 0.0 <= freq <= 1.0
    assert dr >= 0.0
    assert lr >= 0.0
    assert 0.0 <= freqdup <= 1.0
    assert 0.0 <= freqloss <= 1.0
    assert forcetime >= 0.0

    if dr + lr <= 0.0:
        return stree.copy() # no duplications or losses => stree is final tree

    # A zero forcetime schedules a frequency update that never advances the
    #  walk, so the simulation could not terminate on any branch of positive
    #  length.  Raised explicitly (rather than with a bare assert) so that
    #  the guard survives `python -O`.
    if not forcetime > 0.0:
        raise AssertionError("forcetime must be positive when dr + lr > 0")

    # note: the use of < instead of <= is intentional
    #  if lr==0, duprate/fullrate==1, and random() returns from [0.0,1.0)
    def event_is_dup(duprate, fullrate):
        return random.random() < duprate / fullrate


    def sim_walk(gtree, snode, gnode, p, s_walk_time=0.0, g_walk_time=0.0,
                 time_until_force=forcetime):
        """
        Walk down species branch snode, growing gtree below gnode.

        gtree            -- gene tree being built (mutated in place)
        snode            -- species-tree node whose branch is being walked
        gnode            -- gene-tree node the next created node attaches to
        p                -- current frequency of the lineage
        s_walk_time      -- time already walked along snode's branch
        g_walk_time      -- time walked since gnode (becomes the next node's
                            dist)
        time_until_force -- time left until the next forced frequency update

        Forced frequency updates and losses do not create gene-tree nodes,
        so they are handled by looping; only duplications and speciations
        recurse.  This keeps the recursion depth bounded by the tree depth
        instead of by the number of events.
        """
        while True:
            if p <= 0.0:
                # extinction: gnode is 'parent' of the extinct node, which
                #  is marked with freq 0.0 and pruned at the end
                new_gnode = treelib.TreeNode(gtree.new_name())
                new_gnode.dist = g_walk_time
                new_gnode.data['freq'] = 0.0
                gtree.add_child(gnode, new_gnode)
                return

            p = min(p, 1.0) # sanity check
            eff_dr = dr * p # * popsize #??
            eff_lr = lr * p # * popsize #??
            eff_bothr = eff_dr + eff_lr
            event_time = stats.exponentialvariate(eff_bothr)
            remaining_s_dist = snode.dist - s_walk_time

            if event_time >= min(time_until_force, remaining_s_dist):
                # no D/L event before the next forced update / end of branch
                if time_until_force < remaining_s_dist:
                    # force new frequency; the 1e6 factor is the original
                    #  author's scaling of forcetime from myr to years
                    ## TODO: may wish to log the new frequency in node.data
                    p = coal.sample_freq_CDF(p, popsize, forcetime * 1e6)
                    s_walk_time += time_until_force
                    g_walk_time += time_until_force
                    # continue the walk with time_until_force reset
                    time_until_force = forcetime
                    continue

                # speciation event (remaining time scaled from myr to years)
                newp = coal.sample_freq_CDF(p, popsize,
                                            remaining_s_dist * 1e6)
                new_gnode = treelib.TreeNode(gtree.new_name())
                new_gnode.dist = g_walk_time + remaining_s_dist
                # stores frequency of allele at the speciation event
                new_gnode.data['freq'] = newp
                gtree.add_child(gnode, new_gnode)
                for schild in snode.children:
                    sim_walk(gtree, schild, new_gnode, newp)
                return

            # process D/L event
            # no WF updates for these events (modelling decision)
            s_walk_time += event_time
            g_walk_time += event_time
            time_until_force -= event_time

            if event_is_dup(eff_dr, eff_bothr):
                # duplication event: create a node at the duplication time
                new_gnode = treelib.TreeNode(gtree.new_name())
                new_gnode.dist = g_walk_time
                new_gnode.data['freq'] = p # frequency at dup event
                gtree.add_child(gnode, new_gnode)
                # recurse on remainder of original branch
                sim_walk(gtree, snode, new_gnode, p,
                         s_walk_time=s_walk_time,
                         time_until_force=time_until_force)
                # recurse on dup tree with correct starting frequency
                sim_walk(gtree, snode, new_gnode, freqdup,
                         s_walk_time=s_walk_time,
                         time_until_force=time_until_force)
                return

            # loss event: lower the frequency and keep walking; an
            #  extinction is detected at the top of the loop if p reached 0
            p = max(p - freqloss, 0.0) # sanity check


    # main code

    # create new gene tree and simulate its evolution
    gtree = treelib.Tree()
    gtree.make_root()
    gtree.root.dist = 0.0
    gtree.root.data['freq'] = freq
    sim_walk(gtree, stree.root, gtree.root, freq) # should mutate gtree

    # remove dead branches and single children (inside last method)
    # note that the simplifyRoot argument was added to the treelib methods
    #  so that gtree.root.dist is always equal to 0.0 (and this allows the
    #  root to have a single child)
    #  if this behavior is undesired later, we can simply remove the argument
    #  and the root will be collapsed (and have >0 dist)
    extant_leaves = [leaf.name for leaf in gtree.leaves()
                     if leaf.data['freq'] > 0.0]
    gtree = treelib.subtree_by_leaf_names(gtree, extant_leaves,
                                          simplifyRoot=False)

    return gtree
Esempio n. 3
0
def sample_locus_tree_hem(stree, popsize, duprate, lossrate,
                          freq=1.0, freqdup=.05, freqloss=.05,
                          steptime=1e6, keep_extinct=False):

    """
    Sample a locus tree with birth-death and hemiplasy

    Runs a relaxed fixation assumption simulation on a species tree.
    Some simplifying assumptions are made for this version of the simulator:
      1) All branches of the species tree have the same population size
      2) All branches of the species tree have the same duplication rate
      3) All branches of the species tree have the same loss rate
      4) All branches of the species tree have the same duplication effect
      5) All branches of the species tree have the same loss effect
      6) All branches of the species tree have the same time between forced
           frequency changes
      7) There is a single allele at the root of the species tree.

    A duplication/loss effect is the change in frequency for either event.
    Appropriate default values for these effects may need to be determined.
    Future iterations should remove these assumptions by incorporating
    dictionaries to allow values for each branch.

    parameters:
    stree is the species tree to simulate on
    popsize is the population size (assmpt. 1); must be > 0
    duprate is the duplication rate (in events/myr/indiv(?); assmpt. 2)
    lossrate is the loss rate (in events/myr/indiv(?); assmpt. 3)
    freq is the allele frequency at the root (assmpt. 7); in [0, 1]
    freqdup is the duplication effect (assmpt. 4); in [0, 1]
    freqloss is the loss effect (assmpt. 5); in [0, 1]
    steptime is the maximum time between frequency changes (assmpt. 6);
      must be > 0
    keep_extinct, when true, adds a copy of the unpruned tree (extinct
      lineages included) to the extras under the key "full_locus_tree"

    Returns (locus_tree, extras): the locus tree pruned to lineages with
    frequency > 0, and extra information which, per generate_extras and the
    no-dup/no-loss special case, includes a reconciliation ("recon"), an
    events dictionary ("events") and the daughter set ("daughters").

    Raises AssertionError if a parameter is out of range, and
    ZeroDivisionError if a positive rate is combined with a zero effect.
    """

    ## sanity checks before running the simulator; may be removed or relaxed
    treelib.assert_tree(stree)
    assert popsize > 0
    assert 0.0 <= freq <= 1.0
    assert duprate >= 0.0
    assert lossrate >= 0.0
    assert 0.0 <= freqdup <= 1.0
    assert 0.0 <= freqloss <= 1.0
    assert steptime > 0.0


    # special case: no duplications or losses
    if duprate == 0.0 and lossrate == 0.0:
        locus_tree = stree.copy()
        recon = phylo.reconcile(locus_tree, stree, lambda x: x)
        events = phylo.label_events(locus_tree, recon)

        return locus_tree, {"recon": recon,
                            "events": events,
                            "daughters": set()}


    # note: < (not <=) is required: random() returns from [0.0, 1.0), so
    #  with duprate == 0 a draw of exactly 0.0 must not count as a dup
    def event_is_dup(duprate, fullrate):
        return random.random() < duprate / fullrate


    def effective_rate(rate, p, effect):
        # per-lineage event rate scaled by frequency and inverse effect;
        #  a zero rate never fires, which also avoids 0/0 when the matching
        #  effect is zero as well
        if rate == 0.0:
            return 0.0
        return rate * p / effect


    def sim_walk(gtree, snode, gparent, p,
                 s_walk_time=0.0, remaining_steptime=steptime,
                 daughter=False):
        """
        Grow one new gene-tree branch below gparent along species branch
        snode, then recurse at the event that ends it.

        gtree is the gene tree being built (mutated in place)
        snode is the species-tree node whose branch is being walked
        gparent is the gene-tree node the new branch attaches to
        p is the starting frequency of the lineage
        s_walk_time is the time already walked along snode's branch
        remaining_steptime is the time left until the next frequency step
        daughter marks the branch as the daughter of a duplication

        The new node's data["log"] is a log of events along the branch.
        Each entry has the form
          (time_on_branch, event_type, frequency, species_node),

        where
           0.0 <= time_on_branch <= branch_node.dist

        event_type is one of
           {'extinction', 'frequency', 'speciation', 'duplication',
            'daughter', 'loss', 'gene'},

        frequency is the branch frequency at the event time

        species_node is the name of the node of the species tree branch in
        which the event occurs
        """

        # create new node
        gnode = treelib.TreeNode(gtree.new_name())
        gtree.add_child(gparent, gnode)
        gnode.data = {"freq": p,
                      "log": []}
        eventlog = gnode.data["log"]
        g_walk_time = 0.0
        if daughter:
            eventlog.append((0.0, 'daughter', freqdup, snode.name))


        # grow this branch, determine next event
        while True:
            # reset every iteration: a stale "loss" from an earlier
            #  iteration must not be re-applied at a plain time step
            event = None

            if p <= 0.0:
                event = "extinct"
                break

            # determine remaining time
            remaining_s_dist = snode.dist - s_walk_time
            remaining_time = min(remaining_steptime, remaining_s_dist)

            # sample next dup/loss event
            eff_duprate = effective_rate(duprate, p, freqdup)
            eff_lossrate = effective_rate(lossrate, p, freqloss)
            eff_bothrate = eff_duprate + eff_lossrate
            event_time = stats.exponentialvariate(eff_bothrate)

            # advance times
            time_delta = min(event_time, remaining_time)
            s_walk_time += time_delta
            g_walk_time += time_delta

            # sample new frequency
            p = coal.sample_freq_CDF(p, popsize, time_delta)

            # determine event
            if event_time < remaining_time:
                # dup/loss occurs
                if event_is_dup(eff_duprate, eff_bothrate):
                    # dup, stop growing
                    event = "dup"
                    break
                # loss, continue growing
                event = "loss"
            elif remaining_s_dist < remaining_steptime:
                # we are at a speciation, stop growing
                event = "spec"
                break

            # process step
            if event == "loss":
                # LOSS EVENT
                p = max(p - freqloss, 0.0)
                remaining_steptime -= time_delta
                eventlog.append((g_walk_time, 'loss', p, snode.name))
            else:
                # NEXT TIME STEP
                remaining_steptime = steptime
                eventlog.append((g_walk_time, 'frequency', p, snode.name))


        # process event
        if event == "extinct":
            # EXTINCTION EVENT (p <= 0)
            gnode.dist = g_walk_time
            gnode.data['freq'] = 0.0
            eventlog.append((g_walk_time, 'extinction', 0.0, snode.name))


        elif event == "spec":
            # SPECIATION EVENT
            gnode.dist = g_walk_time
            gnode.data['freq'] = p

            # log a 'gene' at a species leaf, otherwise a 'speciation' and
            #  continue down every child species branch
            if snode.is_leaf():
                eventlog.append((g_walk_time, 'gene', p, snode.name))
            else:
                eventlog.append((g_walk_time, 'speciation', p, snode.name))
                for schild in snode.children:
                    sim_walk(gtree, schild, gnode, p)


        elif event == "dup":
            # DUPLICATION EVENT
            gnode.dist = g_walk_time
            gnode.data['freq'] = p
            eventlog.append((g_walk_time, 'duplication', p, snode.name))

            # recurse on mother
            sim_walk(gtree, snode, gnode, p,
                     s_walk_time=s_walk_time,
                     remaining_steptime=remaining_steptime)

            # recurse on daughter
            sim_walk(gtree, snode, gnode, freqdup,
                     s_walk_time=s_walk_time,
                     remaining_steptime=remaining_steptime,
                     daughter=True)

        else:
            raise Exception("unknown event '%s'" % event)


    # create new gene tree and simulate its evolution
    gtree = treelib.Tree()
    gtree.make_root()
    gtree.root.dist = 0.0
    gtree.root.data['freq'] = freq
    gtree.root.data['log'] = [(0.0, 'speciation', freq, stree.root.name)]

    # simulate locus tree: one walk per child of the species root (the
    #  root itself is logged above as a speciation at time 0)
    for schild in stree.root.children:
        sim_walk(gtree, schild, gtree.root, freq)


    # remove dead branches and single children
    extant_leaves = [leaf.name for leaf in gtree.leaves()
                     if leaf.data['freq'] > 0.0]

    if keep_extinct:
        full_gtree = gtree.copy()
        # copy each node's data values so the pruning below cannot alter
        #  the kept tree's logs
        for node in full_gtree:
            node2 = gtree.nodes[node.name]
            for key, val in node2.data.items():
                node.data[key] = copy.copy(val)

    # return value unused, as in the original; presumably the tree is
    #  pruned in place -- TODO confirm against treelib
    treelib.subtree_by_leaf_names(gtree, extant_leaves, keep_single=True)
    remove_single_children(gtree)

    # determine extra information (recon, events, daughters)
    extras = generate_extras(stree, gtree)

    if keep_extinct:
        extras["full_locus_tree"] = full_gtree

    return gtree, extras