def sim_DLILS_gene_tree(stree, popsize, freq, dr, lr,
                        freqdup, freqloss, forcetime):
    """
    Runs a relaxed fixation assumption simulation on a species tree.

    Some simplifying assumptions are made for this version of the simulator:
      1) All branches of the species tree have the same population size
      2) All branches of the species tree have the same duplication rate
      3) All branches of the species tree have the same loss rate
      4) All branches of the species tree have the same duplication effect
      5) All branches of the species tree have the same loss effect
      6) All branches of the species tree have the same time between forced
           frequency changes
      7) There is a single allele at the root of the species tree.

    A duplication/loss effect is the change in frequency for either event.
    Appropriate default values for these effects may need to be determined.

    Future iterations should remove these assumptions by incorporating
    dictionaries to allow values for each branch.

    parameters:
    stree is the initial species tree; it may be mutated by the simulator
    popsize is the population size (assmpt. 1)
    freq is the allele frequency (assmpt. 7)
    dr is the duplication rate (in events/myr/indiv(?); assmpt. 2)
    lr is the loss rate (in events/myr/indiv(?); assmpt. 3)
    freqdup is the duplication effect (assmpt. 4)
    freqloss is the loss effect (assmpt. 5)
    forcetime is the maximum time between frequency changes (assmpt. 6)

    Returns a tuple (gtree, extras).  gtree is the gene (locus) tree after
    extinct lineages have been pruned; extras is a dictionary of extra
    information (a reconciliation, an events labelling and the daughter
    set).  When dr + lr <= 0 no simulation is run: gtree is a copy of stree
    and extras holds its trivial reconciliation with no daughters.
    """

    ## sanity checks before running the simulator; may be removed or relaxed
    treelib.assert_tree(stree)
    assert popsize > 0
    assert 0.0 <= freq and freq <= 1.0
    assert dr >= 0.0
    assert lr >= 0.0
    assert 0.0 <= freqdup and freqdup <= 1.0
    assert 0.0 <= freqloss and freqloss <= 1.0
    # NOTE(review): forcetime == 0.0 passes this check, but on a branch with
    #  positive remaining length it yields frequency updates that never
    #  advance the walk time -- confirm whether 0.0 should be rejected
    assert forcetime >= 0.0

    # special case: no duplications or losses => stree is the final tree.
    # Return the same (tree, extras) shape as the simulated case so callers
    #  can always unpack the result.
    if dr + lr <= 0.0:
        locus_tree = stree.copy()
        recon = phylo.reconcile(locus_tree, stree, lambda x: x)
        events = phylo.label_events(locus_tree, recon)
        return locus_tree, {"recon": recon,
                            "events": events,
                            "daughters": set()}

    # note: the use of < instead of <= is intentional
    #  if lr==0, duprate/fullrate==1, and random() returns from [0.0,1.0)
    def event_is_dup(duprate, fullrate):
        return random.random() < duprate / fullrate

    def sim_walk(gtree, snode, gnode, p,
                 s_walk_time=0.0, g_walk_time=0.0,
                 time_until_force=forcetime, eventlog=None):
        ### Walks along species branch snode, growing gtree below gnode.
        ###  p is the current allele frequency
        ###  s_walk_time is the time already walked on the species branch
        ###  g_walk_time is the time already walked on the gene branch
        ###  time_until_force is the time left until a forced frequency update
        ###  eventlog is a log of events along the gtree branch; each entry
        ###   has the form
        ###     (time_on_branch, event_type, frequency, species_node),
        ###   where 0.0 <= time_on_branch <= branch_node.dist
        ###   event_type is one of {'extinction', 'frequency', 'speciation',
        ###     'duplication', 'loss', 'root', 'gene', 'daughter'}, where
        ###     'root' is a unique event not added during the sim_walk process
        ###   frequency is the branch frequency at the event time
        ###   species_node is the name of the node of the species tree branch
        ###     in which the event occurs

        # a fresh list per call; a literal [] default would be one shared
        #  object that is appended to and stored on tree nodes
        if eventlog is None:
            eventlog = []

        if p <= 0.0:
            ## EXTINCTION EVENT
            # gnode is 'parent' of extinct node
            new_gnode = treelib.TreeNode(gtree.new_name())
            new_gnode.dist = g_walk_time
            new_gnode.data['freq'] = 0.0
            gtree.add_child(gnode, new_gnode)
            # log the extinction and hand the branch log to the new node
            eventlog.append((g_walk_time, 'extinction', 0.0, snode.name))
            new_gnode.data['log'] = eventlog

        else:
            # put everything else in this block to avoid using returns
            p = min(p, 1.0)  # sanity check
            eff_dr = dr * p  # * popsize #??
            eff_lr = lr * p  # * popsize #??
            eff_bothr = eff_dr + eff_lr
            event_time = stats.exponentialvariate(eff_bothr)
            remaining_s_dist = snode.dist - s_walk_time

            if event_time >= min(time_until_force, remaining_s_dist):
                # do not process D/L event;
                #  determine whether at force or speciation
                if time_until_force < remaining_s_dist:
                    ## FREQUENCY UPDATE EVENT
                    # sample a new frequency (no longer scaled to years)
                    newp = coal.sample_freq_CDF(p, popsize, forcetime)
                    # TODO: if we decide not to reset time_until_force at
                    #  speciation events, the newp generation will need to be
                    #  altered in some form (probably using a new variable)

                    # update walk times
                    new_s_walk_time = s_walk_time + time_until_force
                    new_g_walk_time = g_walk_time + time_until_force
                    # add frequency event to event log
                    eventlog.append((new_g_walk_time, 'frequency',
                                     newp, snode.name))
                    # continue the walk with a reset forcetime
                    sim_walk(gtree, snode, gnode, newp,
                             s_walk_time=new_s_walk_time,
                             g_walk_time=new_g_walk_time,
                             eventlog=eventlog)

                else:
                    ## SPECIATION EVENT
                    # separate into root and non-root speciations; the root
                    #  is recognised by 'root' being the last entry of the
                    #  parent's log
                    if gnode.data['log'][-1][1] != 'root':
                        # sample a new frequency (no longer scaled to years)
                        newp = coal.sample_freq_CDF(p, popsize,
                                                    remaining_s_dist)
                        # create new_gnode for this event
                        new_gnode = treelib.TreeNode(gtree.new_name())
                        new_g_walk_time = g_walk_time + remaining_s_dist
                        new_gnode.dist = new_g_walk_time
                        new_gnode.data['freq'] = newp
                        gtree.add_child(gnode, new_gnode)

                        # log the event and set the new node's log
                        if snode.is_leaf():
                            # end of walk on species branch
                            eventlog.append((new_g_walk_time, 'gene',
                                             newp, snode.name))
                            new_gnode.data['log'] = eventlog
                        else:
                            eventlog.append((new_g_walk_time, 'speciation',
                                             newp, snode.name))
                            new_gnode.data['log'] = eventlog
                            # TODO: if we decide not to reset
                            #  time_until_force at speciation events, this
                            #  sim_walk call will need updating
                            for schild in snode.children:
                                sim_walk(gtree, schild, new_gnode, newp,
                                         eventlog=[])

                    else:
                        # gnode is the root: no new node is created and the
                        #  speciation is appended to the root's own log
                        gnode.data['log'].append(
                            (0.0, 'speciation', p, snode.name))
                        for schild in snode.children:
                            sim_walk(gtree, schild, gnode, p, eventlog=[])

            else:
                # process D/L event
                # no WF updates for these events (modelling decision)
                new_s_walk_time = s_walk_time + event_time
                new_g_walk_time = g_walk_time + event_time
                new_time_until_force = time_until_force - event_time

                if event_is_dup(eff_dr, eff_bothr):
                    ## DUPLICATION EVENT
                    # create a node new_gnode for the duplication event
                    new_gnode = treelib.TreeNode(gtree.new_name())
                    new_gnode.dist = new_g_walk_time
                    new_gnode.data['freq'] = p
                    gtree.add_child(gnode, new_gnode)
                    # log the event and set the new node's log
                    eventlog.append((new_g_walk_time, 'duplication',
                                     p, snode.name))
                    new_gnode.data['log'] = eventlog

                    # recurse on remainder of original branch
                    sim_walk(gtree, snode, new_gnode, p,
                             s_walk_time=new_s_walk_time,
                             time_until_force=new_time_until_force,
                             eventlog=[])
                    # recurse on dup tree with correct starting frequency;
                    #  the initial 'daughter' entry allows daughter detection
                    sim_walk(gtree, snode, new_gnode, freqdup,
                             s_walk_time=new_s_walk_time,
                             time_until_force=new_time_until_force,
                             eventlog=[(0.0, 'daughter', freqdup,
                                        snode.name)])

                else:
                    ## LOSS EVENT
                    newp = max(p - freqloss, 0.0)  # sanity check
                    # add loss event to event log
                    eventlog.append((new_g_walk_time, 'loss',
                                     newp, snode.name))
                    sim_walk(gtree, snode, gnode, newp,
                             s_walk_time=new_s_walk_time,
                             g_walk_time=new_g_walk_time,
                             time_until_force=new_time_until_force,
                             eventlog=eventlog)

    # main code

    # create new gene tree and simulate its evolution
    gtree = treelib.Tree()
    gtree.make_root()
    gtree.root.dist = 0.0
    gtree.root.data['freq'] = freq
    gtree.root.data['log'] = [(0.0, 'root', freq, stree.root.name)]
    sim_walk(gtree, stree.root, gtree.root, freq)  # should mutate gtree

    # remove dead branches (leaves whose frequency reached zero) ...
    extant_leaves = [leaf.name for leaf in gtree.leaves()
                     if leaf.data['freq'] > 0.0]
    gtree = treelib.subtree_by_leaf_names(gtree, extant_leaves,
                                          keep_single=True)
    # ... and then single children; done separately so that events are
    #  logged correctly
    remove_single_children(gtree)

    extras = generate_extras(stree, gtree)

    return gtree, extras
def sim_tree(stree, popsize, freq, dr, lr, freqdup, freqloss, forcetime):
    """
    Simulate a gene tree on a species tree under relaxed fixation.

    This version of the simulator applies one value of each parameter to
    every branch of the species tree:
      1) population size
      2) duplication rate
      3) loss rate
      4) duplication effect
      5) loss effect
      6) time between forced frequency changes
    and 7) starts from a single allele at the root of the species tree.

    A duplication/loss effect is the change in frequency for either event.
    Suitable default effects are still to be determined; later versions
    should accept per-branch dictionaries instead.

    parameters:
    stree      initial species tree; it may be mutated by the simulator
    popsize    population size (assmpt. 1)
    freq       allele frequency at the root (assmpt. 7)
    dr         duplication rate (in events/myr/indiv(?); assmpt. 2)
    lr         loss rate (in events/myr/indiv(?); assmpt. 3)
    freqdup    duplication effect (assmpt. 4)
    freqloss   loss effect (assmpt. 5)
    forcetime  maximum time between frequency changes (assmpt. 6)

    Returns the simulated gene tree restricted to leaves with positive
    frequency, or a copy of stree when dr + lr <= 0.
    """

    # input validation; may be removed or relaxed
    treelib.assert_tree(stree)
    assert popsize > 0
    assert 0.0 <= freq <= 1.0
    assert dr >= 0.0
    assert lr >= 0.0
    assert 0.0 <= freqdup <= 1.0
    assert 0.0 <= freqloss <= 1.0
    assert forcetime >= 0.0

    # without duplications or losses the species tree is the final tree
    if dr + lr <= 0.0:
        return stree.copy()

    # the gene tree under construction; the helpers below grow it in place
    locus = treelib.Tree()
    locus.make_root()
    locus.root.dist = 0.0
    locus.root.data['freq'] = freq

    def attach(parent, dist, allele_freq):
        # hang a new node with the given dist and frequency under parent
        child = treelib.TreeNode(locus.new_name())
        child.dist = dist
        child.data['freq'] = allele_freq
        locus.add_child(parent, child)
        return child

    def walk(snode, gnode, p, s_elapsed=0.0, g_elapsed=0.0,
             until_force=forcetime):
        # extinct lineage: record a zero-frequency node (pruned at the end)
        if p <= 0.0:
            attach(gnode, g_elapsed, 0.0)
            return

        p = min(p, 1.0)  # sanity check
        dup_rate = dr * p
        loss_rate = lr * p
        total_rate = dup_rate + loss_rate
        wait = stats.exponentialvariate(total_rate)
        s_left = snode.dist - s_elapsed

        if wait >= min(until_force, s_left):
            # no D/L event before the next forced update or speciation
            if until_force < s_left:
                # forced frequency update; forcetime is scaled from myr to
                #  years, and the countdown restarts at forcetime
                drifted = coal.sample_freq_CDF(p, popsize, forcetime * 1e6)
                walk(snode, gnode, drifted,
                     s_elapsed=s_elapsed + until_force,
                     g_elapsed=g_elapsed + until_force)
                return

            # speciation: the remaining branch time is scaled to years and
            #  the sampled frequency is stored on the new node
            drifted = coal.sample_freq_CDF(p, popsize, s_left * 1e6)
            spec_node = attach(gnode, g_elapsed + s_left, drifted)
            for schild in snode.children:
                walk(schild, spec_node, drifted)
            return

        # a duplication or loss occurs
        #  (no WF update for these events; modelling decision)
        s_now = s_elapsed + wait
        g_now = g_elapsed + wait
        force_left = until_force - wait

        # strict < is intentional: random() is drawn from [0.0, 1.0), so
        #  lr == 0 (ratio 1) always gives a duplication
        if random.random() < dup_rate / total_rate:
            dup_node = attach(gnode, g_now, p)
            # remainder of the original branch
            walk(snode, dup_node, p,
                 s_elapsed=s_now, until_force=force_left)
            # duplicate lineage, starting at the duplication effect
            walk(snode, dup_node, freqdup,
                 s_elapsed=s_now, until_force=force_left)
        else:
            walk(snode, gnode, max(p - freqloss, 0.0),
                 s_elapsed=s_now, g_elapsed=g_now, until_force=force_left)

    walk(stree.root, locus.root, freq)

    # drop dead branches (and single children, inside the treelib call).
    # simplifyRoot=False keeps the root's dist at 0.0, which lets the root
    #  have a single child; without it the root would be collapsed and get
    #  dist > 0.
    survivors = [leaf.name for leaf in locus.leaves()
                 if leaf.data['freq'] > 0.0]
    return treelib.subtree_by_leaf_names(locus, survivors,
                                         simplifyRoot=False)
def sample_locus_tree_hem(stree, popsize, duprate, lossrate,
                          freq=1.0, freqdup=.05, freqloss=.05,
                          steptime=1e6, keep_extinct=False):
    """
    Sample a locus tree with birth-death and hemiplasy

    Runs a relaxed fixation assumption simulation on a species tree.

    Some simplifying assumptions are made for this version of the simulator:
      1) All branches of the species tree have the same population size
      2) All branches of the species tree have the same duplication rate
      3) All branches of the species tree have the same loss rate
      4) All branches of the species tree have the same duplication effect
      5) All branches of the species tree have the same loss effect
      6) All branches of the species tree have the same time between forced
           frequency changes
      7) There is a single allele at the root of the species tree.

    A duplication/loss effect is the change in frequency for either event.
    Appropriate default values for these effects may need to be determined.

    Future iterations should remove these assumptions by incorporating
    dictionaries to allow values for each branch.

    parameters:
    stree is the initial species tree; it may be mutated by the simulator.
        The walk starts on the first two children of its root.
    popsize is the population size (assmpt. 1)
    duprate is the duplication rate (in events/myr/indiv(?); assmpt. 2)
    lossrate is the loss rate (in events/myr/indiv(?); assmpt. 3)
    freq is the allele frequency (assmpt. 7)
    freqdup is the duplication effect (assmpt. 4)
    freqloss is the loss effect (assmpt. 5)
    steptime is the maximum time between frequency changes (assmpt. 6)
    keep_extinct, when true, stores a copy of the unpruned locus tree
        (extinct lineages included) in extras["full_locus_tree"]

    freqdup and freqloss are used as divisors when computing the effective
    event rates, so a value of 0.0 passes the range checks but raises
    ZeroDivisionError once a branch is simulated.

    Returns the locus tree, as well as extra information including a
    reconciliation dictionary and an events dictionary.
    """

    ## sanity checks before running the simulator; may be removed or relaxed
    treelib.assert_tree(stree)
    assert popsize > 0
    assert 0.0 <= freq and freq <= 1.0
    assert duprate >= 0.0
    assert lossrate >= 0.0
    assert 0.0 <= freqdup and freqdup <= 1.0
    assert 0.0 <= freqloss and freqloss <= 1.0
    assert steptime > 0.0

    # special case: no duplications or losses
    if duprate == 0.0 and lossrate == 0.0:
        locus_tree = stree.copy()
        recon = phylo.reconcile(locus_tree, stree, lambda x: x)
        events = phylo.label_events(locus_tree, recon)

        return locus_tree, {"recon": recon,
                            "events": events,
                            "daughters": set()}

    # note: the use of < instead of <= is intentional
    #  random() returns from [0.0, 1.0): with lossrate == 0 the ratio is 1
    #  and a duplication is always chosen, while with duprate == 0 the
    #  ratio is 0 and a duplication is never chosen
    def event_is_dup(duprate, fullrate):
        return random.random() < duprate / fullrate

    def sim_walk(gtree, snode, gparent, p,
                 s_walk_time=0.0, remaining_steptime=steptime,
                 daughter=False):
        """
        Grow one locus-tree branch under gparent along species branch snode.

        The new node's data["log"] is a log of events along the branch.
        Each entry has the form
          (time_on_branch, event_type, frequency, species_node),
        where
          0.0 <= time_on_branch <= branch_node.dist
          event_type is one of {'extinction', 'frequency', 'speciation',
            'duplication', 'loss', 'gene', 'daughter'}
          frequency is the branch frequency at the event time
          species_node is the name of the node of the species tree branch
            in which the event occurs
        """

        # create new node
        gnode = treelib.TreeNode(gtree.new_name())
        gtree.add_child(gparent, gnode)
        gnode.data = {"freq": p, "log": []}
        eventlog = gnode.data["log"]
        g_walk_time = 0.0
        if daughter:
            eventlog.append((0.0, 'daughter', freqdup, snode.name))

        # grow this branch, determine next event
        while True:
            # reset every iteration: a 'loss' left over from an earlier
            #  iteration must not turn a plain time step into another loss
            event = None

            if p <= 0.0:
                event = "extinct"
                break

            # determine remaining time
            remaining_s_dist = snode.dist - s_walk_time
            remaining_time = min(remaining_steptime, remaining_s_dist)

            # sample next dup/loss event
            eff_duprate = duprate * p / freqdup
            eff_lossrate = lossrate * p / freqloss
            eff_bothrate = eff_duprate + eff_lossrate
            event_time = stats.exponentialvariate(eff_bothrate)

            # advance times
            time_delta = min(event_time, remaining_time)
            s_walk_time += time_delta
            g_walk_time += time_delta

            # sample new frequency
            p = coal.sample_freq_CDF(p, popsize, time_delta)

            # determine event
            if event_time < remaining_time:
                # dup/loss occurs
                if event_is_dup(eff_duprate, eff_bothrate):
                    # dup, stop growing
                    event = "dup"
                    break
                else:
                    # loss, continue growing
                    event = "loss"
            else:
                if remaining_s_dist < remaining_steptime:
                    # we are at a speciation, stop growing
                    event = "spec"
                    break

            # process step
            if event == "loss":
                # LOSS EVENT
                p = max(p - freqloss, 0.0)
                remaining_steptime -= time_delta
                eventlog.append((g_walk_time, 'loss', p, snode.name))
            else:
                # NEXT TIME STEP
                remaining_steptime = steptime
                eventlog.append((g_walk_time, 'frequency', p, snode.name))

        # process event
        if event == "extinct":
            # EXTINCTION EVENT (p <= 0)
            gnode.dist = g_walk_time
            gnode.data['freq'] = 0.0
            eventlog.append((g_walk_time, 'extinction', 0.0, snode.name))

        elif event == "spec":
            # SPECIATION EVENT
            gnode.dist = g_walk_time
            gnode.data['freq'] = p

            # log the event; a leaf species branch ends in a 'gene'
            if snode.is_leaf():
                eventlog.append((g_walk_time, 'gene', p, snode.name))
            else:
                eventlog.append((g_walk_time, 'speciation', p, snode.name))
                for schild in snode.children:
                    sim_walk(gtree, schild, gnode, p)

        elif event == "dup":
            # DUPLICATION EVENT
            gnode.dist = g_walk_time
            gnode.data['freq'] = p
            eventlog.append((g_walk_time, 'duplication', p, snode.name))

            # recurse on mother
            sim_walk(gtree, snode, gnode, p,
                     s_walk_time=s_walk_time,
                     remaining_steptime=remaining_steptime)

            # recurse on daughter
            sim_walk(gtree, snode, gnode, freqdup,
                     s_walk_time=s_walk_time,
                     remaining_steptime=remaining_steptime,
                     daughter=True)

        else:
            raise Exception("unknown event '%s'" % event)

    # create new gene tree and simulate its evolution
    gtree = treelib.Tree()
    gtree.make_root()
    gtree.root.dist = 0.0
    gtree.root.data['freq'] = freq
    gtree.root.data['log'] = [(0.0, 'speciation', freq, stree.root.name)]

    # simulate locus tree
    sim_walk(gtree, stree.root.children[0], gtree.root, freq)
    sim_walk(gtree, stree.root.children[1], gtree.root, freq)

    # remove dead branches and single children
    extant_leaves = [leaf.name for leaf in gtree.leaves()
                     if leaf.data['freq'] > 0.0]

    if keep_extinct:
        full_gtree = gtree.copy()

        # do deep copy of data
        for node in full_gtree:
            node2 = gtree.nodes[node.name]
            for key, val in node2.data.items():
                node.data[key] = copy.copy(val)

    # keep the returned tree, as the sibling simulators in this file do
    gtree = treelib.subtree_by_leaf_names(gtree, extant_leaves,
                                          keep_single=True)
    remove_single_children(gtree)

    # determine extra information (recon, events, daughters)
    extras = generate_extras(stree, gtree)

    if keep_extinct:
        extras["full_locus_tree"] = full_gtree

    return gtree, extras