def pass_seq_to_node(self, seq, edge):
    """Record the sequences that arrive at the child end of ``edge``.

    ``edge`` is indexed as ``(father_name, child_name)``.  The 'loc' entries
    of the two nodes' configurations are compared:

    * same set of paralogs: ``seq`` is stored for the child unchanged;
    * one father paralog replaced by two child paralogs (duplication): each
      new paralog receives a deep copy of the father paralog's sequence and
      the father paralog's entry is removed;
    * one father paralog missing from the child (loss): its entry is removed.

    ``seq`` is modified in place and stored by reference in
    ``self.node_to_seq[child_name]``; nothing is returned.  Any other
    difference between the two configurations trips an assertion.
    """
    parent_name = edge[0]
    child_name = edge[1]
    # The edge has to agree with the tree topology.
    assert self.tree.find_parent_clade(child_name).name == parent_name

    parent_conf = divide_configuration(self.tree.node_to_conf[parent_name])
    child_conf = divide_configuration(self.tree.node_to_conf[child_name])
    parent_loc = set(parent_conf['loc'])
    child_loc = set(child_conf['loc'])

    if parent_loc != child_loc:
        only_in_parent = parent_loc - child_loc
        only_in_child = child_loc - parent_loc
        # Only one gene may duplicate or be lost per branch for now; tandem
        # duplication would need this restriction lifted.
        assert len(only_in_parent) == 1
        affected_orlg = list(only_in_parent)[0]
        n_new = len(only_in_child)

        if n_new == 2:
            # Duplication: the two new paralogs must be the recorded
            # descendants of the paralog that disappeared.
            assert (set(self.tree.dup_events[affected_orlg]) ==
                    set(only_in_child))
            for new_orlg in only_in_child:
                seq[new_orlg] = deepcopy(seq[affected_orlg])
            seq.pop(affected_orlg)
        elif n_new == 0:
            # Gene loss: drop the sequence of the lost paralog.
            seq.pop(affected_orlg)
        else:
            assert False  # should not come to this case

    self.node_to_seq[child_name] = seq
def sim_root(self):
    """Draw the starting sequence of every paralog present at the root.

    The result is stored in ``self.node_to_seq[root_name]`` as a dict keyed
    by the entries of the root configuration's 'loc' list.

    * ``'HKY'``: ``draw_from_distribution`` is called with the four
      ``Pi_A/C/G/T`` parameters, ``self.nsites`` and the alphabet 'ACGT';
      its return value is stored as-is.
    * ``'MG94'``: each non-stop codon is weighted by the product of the
      ``Pi_*`` parameters of its bases, the weights are normalised to sum
      to one, ``self.nsites // 3`` codons are drawn, and the joined codons
      are stored as a list of single characters.

    Any other model name leaves the root entry as an empty dict.
    """
    root_name = self.tree.phylo_tree.root.name
    root_orlg = divide_configuration(self.tree.node_to_conf[root_name])

    root_seqs = dict()
    self.node_to_seq[root_name] = root_seqs

    model_name = self.PMModel.name
    if model_name == 'HKY':
        distn = [self.PMModel.parameters['Pi_' + nt] for nt in 'ACGT']
        for orlg in root_orlg['loc']:
            root_seqs[orlg] = draw_from_distribution(
                distn, self.nsites, 'ACGT')
    elif model_name == 'MG94':
        codons = self.PMModel.codon_nonstop
        distn = [
            reduce(mul,
                   [self.PMModel.parameters['Pi_' + b] for b in codon], 1)
            for codon in codons
        ]
        # Stop codons are excluded, so the products must be renormalised.
        distn = np.array(distn) / sum(distn)
        # Floor division keeps the codon count an integer regardless of
        # whether true division is in effect.
        n_codons = self.nsites // 3
        for orlg in root_orlg['loc']:
            codon_seq = draw_from_distribution(distn, n_codons, codons)
            # Flatten the codons into one nucleotide per list element.
            root_seqs[orlg] = list(''.join(codon_seq))
def sim_one_branch(self, edge, display):
    """Simulate point mutations and IGC events along one branch.

    Starting from a deep copy of the sequences stored for ``edge[0]``,
    events are generated one at a time until the accumulated waiting time
    exceeds the branch length ``self.tree.edge_to_blen[edge]``.  Each event
    is a point mutation or an IGC event, chosen in proportion to the total
    point-mutation rate and the total IGC initiation rate.  Every event is
    written through ``self.append_to_log_file`` and the final sequences are
    handed to ``self.pass_seq_to_node``.

    Parameters:
        edge: key of ``self.tree.edge_to_blen``; ``edge[0]`` is the already
            simulated node, ``edge[1]`` the node to be simulated.  Its
            elements are joined with '_' to label log entries.
        display: when truthy, the cumulative time is printed after every
            sampled waiting time; the flag is also forwarded to the event
            helpers.

    Raises:
        AssertionError: if the branch is not ready to be simulated, or if
            the event draw yields something other than 0 or 1.
    """
    # The father must already be simulated and the child must not be.
    assert edge[0] in self.node_to_seq and edge[1] not in self.node_to_seq

    blen = self.tree.edge_to_blen[edge]
    conf = self.tree.node_to_conf[edge[0]]
    # The sequences get passed on to the next node, so work on a fresh copy.
    current_seq = deepcopy(self.node_to_seq[edge[0]])
    ordered_orlg = sorted(divide_configuration(conf)['loc'])
    n_orlg = len(ordered_orlg)

    if n_orlg == 1:
        # A single paralog has no IGC partner.  The matrices are never used
        # because an IGC event is drawn with probability zero.
        branch_igc_init_q = None
        branch_igc_tract_q = None
        total_igc_init_q = None
        total_igc_init_rate = 0.0
    else:
        # Restrict the IGC matrices to the paralogs alive on this branch.
        branch_igc_init_q = np.zeros((n_orlg, n_orlg), dtype=float)
        branch_igc_tract_q = np.zeros((n_orlg, n_orlg), dtype=float)
        total_igc_init_q = np.zeros((n_orlg, n_orlg), dtype=float)
        # NOTE(review): nsites // 3 is used whatever the point-mutation
        # model is -- confirm this is intended for nucleotide models.
        n_codon_sites = self.nsites // 3
        for i, orlg_i in enumerate(ordered_orlg):
            for j, orlg_j in enumerate(ordered_orlg):
                branch_igc_init_q[i, j] = self.IGCModel.Q_init[orlg_i, orlg_j]
                branch_igc_tract_q[i, j] = self.IGCModel.Q_tract[orlg_i,
                                                                 orlg_j]
                # Pairs with a zero tract parameter keep a zero total rate
                # (this also avoids dividing by zero).
                if i != j and branch_igc_tract_q[i, j] != 0:
                    # Presumably the factor counts tracts starting inside
                    # the sequence plus those reaching in from upstream --
                    # TODO confirm against the model description.
                    total_igc_init_q[i, j] = branch_igc_init_q[i, j] * (
                        n_codon_sites - 1 + 1.0 / branch_igc_tract_q[i, j])
        total_igc_init_rate = total_igc_init_q.sum()

    edge_label = '_'.join(edge)
    cummulate_time = 0.0
    while cummulate_time < blen:
        # The point-mutation rate depends on the current sequence, so it is
        # recomputed after every event.  The IGC rate is modelled as context
        # independent and stays fixed along the branch.
        seq_rate_dict, total_pm_rate = self.get_mutation_rate(current_seq)
        total_rate = total_pm_rate + total_igc_init_rate
        if total_rate == 0:
            # Nothing can happen any more; the waiting time is infinite.
            break
        cummulate_time += np.random.exponential(1.0 / total_rate)
        if display:
            print(cummulate_time)
        if cummulate_time > blen:
            break

        # Decide whether it is a point mutation (0) or an IGC event (1).
        event = draw_from_distribution(
            np.array([total_pm_rate, total_igc_init_rate]) / total_rate,
            1, range(2))
        if event == 0:
            current_seq, mutation_info = self.get_one_point_mutation(
                current_seq, seq_rate_dict, display)
            to_write_info = [edge_label, str(cummulate_time)] + mutation_info
            self.append_to_log_file(to_write_info, 'PM')
        elif event == 1:
            current_seq, igc_info = self.get_one_IGC_event(
                current_seq, branch_igc_init_q, branch_igc_tract_q,
                total_igc_init_q, ordered_orlg, display)
            to_write_info = [edge_label, str(cummulate_time)] + igc_info
            self.append_to_log_file(to_write_info, 'IGC')
        else:
            # Explicit raise so the failure is not stripped under -O.
            raise AssertionError(
                'draw_from_distribution returned an unexpected event: %r'
                % (event,))

    # Pass the sequences to the child node; gene duplication and loss
    # events are handled there.
    self.pass_seq_to_node(current_seq, edge)