def get_one_IGC_event(self, seq, sub_IGC_init_Q, sub_IGC_tract_Q,
                      sub_total_IGC_init_Q, ordered_orlg, display):
    """Sample a single IGC event and apply it to ``seq`` via ``self.IGC_copy``.

    The ortholog-group pair is drawn with probability proportional to the
    entries of ``sub_total_IGC_init_Q``; a start position (a multiple of 3)
    and a geometric tract length (also a multiple of 3) are then drawn, and
    the tract is truncated at the last site of the sequence.

    Parameters
    ----------
    seq : passed through to ``self.IGC_copy`` unchanged.
    sub_IGC_init_Q : not used by this method; kept for interface
        compatibility.
    sub_IGC_tract_Q : square numpy array of tract parameters, indexed like
        ``ordered_orlg``.  NOTE: its diagonal is zeroed IN PLACE.
    sub_total_IGC_init_Q : square numpy array of sampling weights for each
        (from, to) pair, indexed like ``ordered_orlg``.
    ordered_orlg : sequence mapping matrix row/column index -> ortholog group.
    display : passed through to ``self.IGC_copy``.

    Returns
    -------
    (seq, IGC_info) exactly as returned by ``self.IGC_copy``.
    """
    # fill diagonal entries with 0, just in case..
    np.fill_diagonal(sub_IGC_tract_Q, 0.0)

    # Sample an IGC event orlg pair.  Weights are normalised here so that
    # this call agrees with every other draw_from_distribution call site,
    # all of which pass a proper probability vector.
    flat_weights = sub_total_IGC_init_Q.ravel()
    IGC_pos = draw_from_distribution(flat_weights / flat_weights.sum(), 1,
                                     range(flat_weights.size))

    # ravel() is row-major, so flat index -> (row, col) must use the number
    # of COLUMNS.  (ndarray.ndim is the number of axes, i.e. always 2 for a
    # matrix, and only coincides with the column count for two paralogs.)
    n_cols = sub_total_IGC_init_Q.shape[1]
    orlg_from_num, orlg_to_num = divmod(IGC_pos, n_cols)
    orlg_from = ordered_orlg[orlg_from_num]
    orlg_to = ordered_orlg[orlg_to_num]

    # Now sample a starting pos and tract length.
    tract_p = sub_IGC_tract_Q[orlg_from_num, orlg_to_num]
    n_codons = self.nsites // 3  # explicit floor division
    # The first position carries weight 1/tract_p, all others weight 1.
    init_prob_array = np.array([1.0 / tract_p] + [1.0] * (n_codons - 1))
    start_pos = draw_from_distribution(
        init_prob_array / init_prob_array.sum(), 1,
        range(len(init_prob_array))) * 3
    # tract_length is a positive integer (geometric draw, times 3)
    tract_length = np.random.geometric(tract_p, 1)[0] * 3

    # Truncate the tract at the last site of the sequence.
    stop_pos = min(start_pos + tract_length - 1, self.nsites - 1)

    seq, IGC_info = self.IGC_copy(start_pos, stop_pos, orlg_from, orlg_to,
                                  seq, display, tract_length)
    return seq, IGC_info
def get_one_point_mutation(self, seq, seq_rate_dict, display):
    """Draw one point mutation and apply it to ``seq``.

    modified from IGCSimulation.IGCSimulator

    A position is drawn across all paralogs with probability proportional
    to the rates in ``seq_rate_dict``; the new state is then drawn from the
    row of ``self.PMModel.Q_mut`` that belongs to the current state.

    Parameters
    ----------
    seq : dict, paralog -> list of single characters; modified and returned.
    seq_rate_dict : dict, paralog -> rate sequence.  All rate sequences must
        have the same length.
    display : when true, the mutation record is printed.

    Returns
    -------
    (seq, mutation_info) where ``mutation_info`` is
    ``[str(paralog), str(position), old_state, new_state]``.
    """
    # only allow all sequences with same length
    assert len(set(len(seq_rate_dict[key]) for key in seq_rate_dict)) == 1

    paralogs = sorted(seq_rate_dict.keys())
    rates_per_paralog = len(seq_rate_dict[paralogs[0]])

    # Concatenate all rates (in sorted paralog order) into one distribution.
    flat_rates = [rate for key in paralogs for rate in seq_rate_dict[key]]
    rate_total = sum(flat_rates)
    flat_probs = np.array(flat_rates) / rate_total

    # Now sample a point mutation position and split the flat index into
    # (which paralog, position within that paralog).
    mut_pos = draw_from_distribution(flat_probs, 1, range(len(flat_probs)))
    mut_paralog_num, seq_pos = divmod(mut_pos, rates_per_paralog)
    mut_paralog = paralogs[mut_paralog_num]

    # Now add in data_type difference: codon/nucleotide
    target = seq[mut_paralog]
    if self.PMModel.data_type == 'cd':
        states = self.PMModel.codon_nonstop
        translated_seq = []
        for offset in range(0, len(target), 3):
            translated_seq.append(''.join(target[offset:offset + 3]))
    else:
        states = 'ACGT'
        translated_seq = target

    # Now perform point mutation at the position
    old_state = translated_seq[seq_pos]
    row = np.array(self.PMModel.Q_mut[states.index(old_state), :])
    new_index = draw_from_distribution(row / sum(row), 1, range(len(row)))
    new_state = states[new_index]
    translated_seq[seq_pos] = new_state
    seq[mut_paralog] = list(''.join(translated_seq))

    # TODO: implement log
    # mutation_orlg, mut_pos, old_state, new_state
    mutation_info = [str(mut_paralog), str(seq_pos), old_state, new_state]
    if display:
        print(' '.join(mutation_info))
    return seq, mutation_info
def sim_root(self):
    """Draw the root sequences and store them in ``self.node_to_seq``.

    One sequence is drawn for every entry of ``root_orlg['loc']``, where
    ``root_orlg`` is ``divide_configuration`` applied to the root
    configuration.

    * ``'HKY'``: ``self.nsites`` states are drawn from ``'ACGT'`` using the
      ``Pi_A``/``Pi_C``/``Pi_G``/``Pi_T`` parameters.
    * ``'MG94'``: ``self.nsites / 3`` codons are drawn from
      ``self.PMModel.codon_nonstop`` with weights equal to the normalised
      product of the ``Pi_*`` parameters of their bases; the codons are then
      flattened into a list of single characters.

    For any other model name the root entry is left as an empty dict.
    """
    root_name = self.tree.phylo_tree.root.name
    root_orlg = divide_configuration(self.tree.node_to_conf[root_name])

    root_seqs = dict()
    self.node_to_seq[root_name] = root_seqs

    model = self.PMModel
    if model.name == 'HKY':
        distn = [model.parameters['Pi_' + nt] for nt in 'ACGT']
        for orlg in root_orlg['loc']:
            root_seqs[orlg] = draw_from_distribution(
                distn, self.nsites, 'ACGT')
    elif model.name == 'MG94':
        codons = model.codon_nonstop
        weights = []
        for codon in codons:
            # product of the base frequencies of this codon
            weight = 1
            for base in codon:
                weight = weight * model.parameters['Pi_' + base]
            weights.append(weight)
        distn = np.array(weights) / sum(weights)
        for orlg in root_orlg['loc']:
            drawn = draw_from_distribution(distn, self.nsites / 3, codons)
            root_seqs[orlg] = list(''.join(drawn))
def sim_one_branch(self, edge, display):
    """Simulate point mutations and IGC events along one branch.

    Starting from a deep copy of the sequences stored for ``edge[0]``,
    events are generated with exponentially distributed waiting times until
    the accumulated time exceeds the branch length
    ``self.tree.edge_to_blen[edge]``.  Each event is either a point mutation
    or an IGC event, chosen in proportion to the total rate of each kind,
    and is written to the log through ``self.append_to_log_file``.  The
    resulting sequences are handed to ``self.pass_seq_to_node``.

    Parameters
    ----------
    edge : pair of node names ``(parent, child)``; the parent must already
        have sequences in ``self.node_to_seq`` and the child must not.
    display : when true, the running time is printed and the flag is
        forwarded to the event samplers.
    """
    # First, make sure this branch is not simulated
    assert (edge[0] in self.node_to_seq and not edge[1] in self.node_to_seq)

    blen = self.tree.edge_to_blen[edge]
    branch_orlg = divide_configuration(self.tree.node_to_conf[edge[0]])
    # it's passed on to the next node, need a new allocation of memory
    current_seq = deepcopy(self.node_to_seq[edge[0]])

    # Get sub IGC init / tract matrices restricted to the ortholog groups
    # located on this branch.
    ordered_orlg = sorted(branch_orlg['loc'])
    n_orlg = len(ordered_orlg)

    # With a single ortholog group no IGC event is possible, so the
    # matrices are never needed (IGC events then have probability 0).
    branch_IGC_init_Q = branch_IGC_tract_Q = Total_IGC_init_Q = None
    if n_orlg == 1:
        Total_IGC_init_rate = 0.0
    else:
        shape = (n_orlg, n_orlg)
        # dtype=float is float64; the abstract np.floating is deprecated
        # as a dtype specifier.
        branch_IGC_init_Q = np.zeros(shape, dtype=float)
        branch_IGC_tract_Q = np.zeros(shape, dtype=float)
        Total_IGC_init_Q = np.zeros(shape, dtype=float)
        n_codons = self.nsites // 3  # explicit floor division
        for i, orlg_i in enumerate(ordered_orlg):
            for j, orlg_j in enumerate(ordered_orlg):
                branch_IGC_init_Q[i, j] = self.IGCModel.Q_init[orlg_i, orlg_j]
                branch_IGC_tract_Q[i, j] = self.IGCModel.Q_tract[orlg_i,
                                                                 orlg_j]
                if i != j and branch_IGC_tract_Q[i, j] != 0:
                    # n_codons - 1 interior start positions with weight 1
                    # plus the first position with weight 1 / tract.
                    Total_IGC_init_Q[i, j] = branch_IGC_init_Q[i, j] * (
                        n_codons - 1 + 1.0 / branch_IGC_tract_Q[i, j])
        Total_IGC_init_rate = Total_IGC_init_Q.sum()

    cummulate_time = 0.0
    while cummulate_time < blen:
        # Now sample exponential distributed waiting time for next event
        # point mutation or IGC event
        # need to update total point mutation rate with every new event
        # no need to update total IGC rate on the same branch since it's
        # modeled as context independent
        seq_rate_dict, Total_PM_rate = self.get_mutation_rate(current_seq)
        Total_rate = Total_PM_rate + Total_IGC_init_rate
        if Total_rate == 0.0:
            # No event can ever occur; avoid dividing by zero below.
            break

        cummulate_time += np.random.exponential(1.0 / Total_rate)
        if display:
            print(cummulate_time)
        if cummulate_time > blen:
            break

        # Now decide whether it's a point mutation or IGC event
        event = draw_from_distribution(
            np.array([Total_PM_rate, Total_IGC_init_rate]) / Total_rate,
            1, range(2))
        if event == 0:
            # It's a point mutation event
            current_seq, mutation_info = self.get_one_point_mutation(
                current_seq, seq_rate_dict, display)
            to_write_info = ['_'.join(edge),
                             str(cummulate_time)] + mutation_info
            self.append_to_log_file(to_write_info, 'PM')
        elif event == 1:
            # It's an IGC event
            current_seq, IGC_info = self.get_one_IGC_event(
                current_seq, branch_IGC_init_Q, branch_IGC_tract_Q,
                Total_IGC_init_Q, ordered_orlg, display)
            to_write_info = ['_'.join(edge), str(cummulate_time)] + IGC_info
            self.append_to_log_file(to_write_info, 'IGC')
        else:
            # draw from distribution failure; raised explicitly so the
            # check survives running under -O.
            raise AssertionError(
                'unexpected event drawn: %r' % (event,))

    # Now need to pass the seq to new node
    # need to consider gene duplication loss events here
    self.pass_seq_to_node(current_seq, edge)