Ejemplo n.º 1
0
    def get_one_IGC_event(self, seq, sub_IGC_init_Q, sub_IGC_tract_Q,
                          sub_total_IGC_init_Q, ordered_orlg, display):
        """Sample one IGC event and apply it to ``seq`` via ``self.IGC_copy``.

        Args:
            seq: current sequences, passed through to ``self.IGC_copy``.
            sub_IGC_init_Q: accepted for interface compatibility; not read
                by this method.
            sub_IGC_tract_Q: square array of per-pair tract parameters,
                indexed like ``ordered_orlg``.  Its diagonal is zeroed
                IN PLACE.
            sub_total_IGC_init_Q: square array of per-pair weights used to
                choose the (from, to) pair; the position in the flattened
                array encodes ``row * n_columns + column``.
            ordered_orlg: sequence mapping a matrix row/column index to an
                ortholog-group identifier.
            display: forwarded to ``self.IGC_copy``.

        Returns:
            ``(seq, IGC_info)`` exactly as returned by ``self.IGC_copy``.
        """
        # Fill diagonal entries with 0, just in case.  Note that this
        # mutates the array owned by the caller.
        np.fill_diagonal(sub_IGC_tract_Q, 0.0)

        # Sample an IGC event orlg pair.  The weights are normalized here,
        # as every other draw_from_distribution call in this class does
        # (assumes the helper expects probabilities summing to 1 -- TODO
        # confirm against its definition).
        pair_weights = sub_total_IGC_init_Q.ravel()
        IGC_pos = draw_from_distribution(pair_weights / pair_weights.sum(), 1,
                                         range(pair_weights.size))
        # Decode the flat index with the number of COLUMNS.  The previous
        # code divided by ``.ndim`` (always 2 for a matrix), which only
        # worked by coincidence for exactly two paralogs.
        n_columns = sub_total_IGC_init_Q.shape[1]
        orlg_from_num, orlg_to_num = divmod(int(IGC_pos), n_columns)
        orlg_from = ordered_orlg[orlg_from_num]
        orlg_to = ordered_orlg[orlg_to_num]

        # Now sample a starting pos and tract length (both in units of
        # three sites, then scaled by 3).
        tract_p = sub_IGC_tract_Q[orlg_from_num, orlg_to_num]
        # Floor division keeps the count an integer under both Python 2 and
        # Python 3 division semantics.
        n_triplets = self.nsites // 3
        # First position carries weight 1/tract_p, all others weight 1
        # (presumably accounting for tracts that begin upstream of the
        # sequence -- TODO confirm with the model description).
        init_prob_array = np.array([1.0 / tract_p] + [1.0] * (n_triplets - 1))
        start_pos = draw_from_distribution(
            init_prob_array / init_prob_array.sum(), 1,
            range(len(init_prob_array))) * 3
        tract_length = np.random.geometric(tract_p, 1)[0] * 3
        # tract_length is a positive integer; clip the tract at the last site.
        stop_pos = min(start_pos + tract_length - 1, self.nsites - 1)

        seq, IGC_info = self.IGC_copy(start_pos, stop_pos, orlg_from, orlg_to,
                                      seq, display, tract_length)

        return seq, IGC_info
Ejemplo n.º 2
0
    def get_one_point_mutation(
            self, seq, seq_rate_dict,
            display):  # modified from IGCSimulation.IGCSimulator
        """Sample one point mutation and apply it to ``seq``.

        Args:
            seq: dict mapping an ortholog group to its sequence, stored as
                a list of single characters.  The mutated paralog's entry
                is replaced.
            seq_rate_dict: dict mapping an ortholog group to a list of
                per-position rates; all lists must have the same length.
            display: when true, print the mutation record to stdout.

        Returns:
            ``(seq, mutation_info)`` where ``mutation_info`` is
            ``[str(paralog), str(position), old_state, new_state]``.

        Raises:
            AssertionError: if the rate lists differ in length.
        """
        # only allow all sequences with same length
        assert (len(set([len(seq_rate_dict[orlg])
                         for orlg in seq_rate_dict])) == 1)
        orlg_group = sorted(seq_rate_dict.keys())
        rates_per_paralog = len(seq_rate_dict[orlg_group[0]])

        # Now concatenate all rates to one giant list to draw from distribution
        # Use a dumb but safe way rather than comprehension
        concatenated_rate = list()
        for orlg in orlg_group:
            concatenated_rate.extend(seq_rate_dict[orlg])
        concatenated_rate = np.array(concatenated_rate) / sum(
            concatenated_rate)

        # Now sample a point mutation position in the concatenated list and
        # split it into (paralog index, position within that paralog).
        # divmod avoids relying on ``/`` being integer division.
        mut_pos = draw_from_distribution(concatenated_rate, 1,
                                         range(len(concatenated_rate)))
        mut_paralog_num, seq_pos = divmod(int(mut_pos), rates_per_paralog)
        mut_paralog = orlg_group[mut_paralog_num]

        # Now add in data_type difference: codon/nucleotide
        if self.PMModel.data_type == 'cd':
            states = self.PMModel.codon_nonstop
            # Regroup the per-character sequence into 3-letter strings so
            # that seq_pos indexes one whole state.
            translated_seq = [
                ''.join(seq[mut_paralog][i:i + 3])
                for i in range(0, len(seq[mut_paralog]), 3)
            ]
        else:
            states = 'ACGT'
            translated_seq = seq[mut_paralog]

        # Now perform point mutation at the position: the row of Q_mut for
        # the old state, normalized, is the distribution of the new state.
        old_state = translated_seq[seq_pos]
        prob = np.array(self.PMModel.Q_mut[states.index(old_state), :])
        new_state = states[draw_from_distribution(prob / sum(prob), 1,
                                                  range(len(prob)))]
        translated_seq[seq_pos] = new_state
        # Store the sequence back as a flat list of single characters.
        seq[mut_paralog] = list(''.join(translated_seq))

        # TODO: implement log
        # mutation_orlg, mut_pos, old_state, new_state
        mutation_info = [str(mut_paralog), str(seq_pos), old_state, new_state]
        if display:
            # Single-argument call form prints identically on Python 2 and 3.
            print(' '.join(mutation_info))

        return seq, mutation_info
Ejemplo n.º 3
0
    def sim_root(self):
        """Draw the root-node sequences and store them in ``self.node_to_seq``.

        One sequence is drawn for every ortholog group listed under the
        ``'loc'`` key of the root configuration:

        * ``'HKY'``: ``self.nsites`` nucleotides drawn with the ``Pi_A``,
          ``Pi_C``, ``Pi_G``, ``Pi_T`` parameters as the distribution.
        * ``'MG94'``: ``self.nsites // 3`` codons drawn from
          ``self.PMModel.codon_nonstop``, each weighted by the product of
          the ``Pi_*`` parameters of its letters (renormalized), then
          flattened to a list of single characters.

        For any other model name the root entry is left as an empty dict.
        """
        root_name = self.tree.phylo_tree.root.name
        root_conf = self.tree.node_to_conf[root_name]
        root_orlg = divide_configuration(root_conf)
        self.node_to_seq[root_name] = dict()
        root_seqs = self.node_to_seq[root_name]
        pi = self.PMModel.parameters

        if self.PMModel.name == 'HKY':
            distn = [pi['Pi_' + nt] for nt in 'ACGT']
            for orlg in root_orlg['loc']:
                root_seqs[orlg] = draw_from_distribution(
                    distn, self.nsites, 'ACGT')
        elif self.PMModel.name == 'MG94':
            codons = self.PMModel.codon_nonstop
            # np.prod replaces the Python-2-only builtin ``reduce``.
            distn = np.array([
                np.prod([pi['Pi_' + b] for b in codon]) for codon in codons
            ])
            distn = distn / distn.sum()
            # Floor division keeps the draw count an integer regardless of
            # whether ``/`` is true division.
            n_codons = self.nsites // 3
            for orlg in root_orlg['loc']:
                seq = draw_from_distribution(distn, n_codons, codons)
                root_seqs[orlg] = list(''.join(seq))
Ejemplo n.º 4
0
    def sim_one_branch(self, edge, display):
        """Simulate point mutations and IGC events along one tree branch.

        Starting from a deep copy of the sequences stored for ``edge[0]``,
        events are generated until the accumulated time exceeds the branch
        length.  Waiting times are exponential with the total rate (point
        mutation plus IGC initiation), and each event is a point mutation
        or an IGC event with probability proportional to its rate.  Every
        event is written with ``self.append_to_log_file``, and the final
        sequences are handed to ``self.pass_seq_to_node``.

        Args:
            edge: pair of node names ``(parent, child)``; the parent must
                already have sequences and the child must not.
            display: when true, print each sampled cumulative time, and
                forward the flag to the event samplers.

        Raises:
            AssertionError: if the branch precondition is violated or the
                event draw returns something other than 0 or 1.
        """
        # First, make sure this branch is not simulated
        assert (edge[0] in self.node_to_seq
                and not edge[1] in self.node_to_seq)
        blen = self.tree.edge_to_blen[edge]
        starting_seq = self.node_to_seq[edge[0]]
        conf = self.tree.node_to_conf[edge[0]]
        branch_orlg = divide_configuration(conf)

        # it's passed on to the next node, need a new allocation of memory
        current_seq = deepcopy(starting_seq)

        # Get sub IGC init / tract matrices restricted to the ortholog
        # groups present on this branch.
        ordered_orlg = sorted(branch_orlg['loc'])
        n_orlg = len(ordered_orlg)

        # With fewer than two paralogs there is no IGC: the rate stays 0
        # and the matrices are never consulted.
        Total_IGC_init_rate = 0.0
        branch_IGC_init_Q = None
        branch_IGC_tract_Q = None
        Total_IGC_init_Q = None
        if n_orlg > 1:
            # ``float`` is the concrete equivalent of the deprecated
            # abstract ``np.floating`` dtype.
            branch_IGC_init_Q = np.zeros((n_orlg, n_orlg), dtype=float)
            branch_IGC_tract_Q = np.zeros((n_orlg, n_orlg), dtype=float)
            Total_IGC_init_Q = np.zeros((n_orlg, n_orlg), dtype=float)
            n_triplets = self.nsites // 3
            for i, orlg_i in enumerate(ordered_orlg):
                for j, orlg_j in enumerate(ordered_orlg):
                    init_rate = self.IGCModel.Q_init[orlg_i, orlg_j]
                    tract_rate = self.IGCModel.Q_tract[orlg_i, orlg_j]
                    branch_IGC_init_Q[i, j] = init_rate
                    branch_IGC_tract_Q[i, j] = tract_rate
                    # Off-diagonal pairs with a non-zero tract parameter
                    # contribute init_rate * (n_triplets - 1 + 1/tract),
                    # matching the start-position weights used when an
                    # IGC event is sampled.
                    if i != j and branch_IGC_tract_Q[i, j] != 0:
                        Total_IGC_init_Q[i, j] = branch_IGC_init_Q[i, j] * (
                            n_triplets - 1 + 1.0 / branch_IGC_tract_Q[i, j])

            Total_IGC_init_rate = Total_IGC_init_Q.sum()

        cummulate_time = 0.0

        while cummulate_time < blen:
            # Now sample exponential distributed waiting time for next event
            # point mutation or IGC event
            # need to update total point mutation rate with every new event
            # no need to update total IGC rate on the same branch since it's
            # modeled as context independent
            seq_rate_dict, Total_PM_rate = self.get_mutation_rate(current_seq)
            Total_rate = Total_PM_rate + Total_IGC_init_rate

            if Total_rate <= 0:
                # Nothing can happen any more: the waiting time is infinite,
                # so stop instead of dividing by zero.
                break

            cummulate_time += np.random.exponential(1.0 / Total_rate)
            if display:
                print(cummulate_time)

            if cummulate_time > blen:
                break

            # Now decide whether it's a point mutation or IGC event
            event = draw_from_distribution(
                np.array([Total_PM_rate, Total_IGC_init_rate]) / Total_rate,
                1, range(2))

            if event == 0:
                # It's a point mutation event
                current_seq, mutation_info = self.get_one_point_mutation(
                    current_seq, seq_rate_dict, display)
                to_write_info = ['_'.join(edge),
                                 str(cummulate_time)] + mutation_info
                self.append_to_log_file(to_write_info, 'PM')
            elif event == 1:
                # It's an IGC event
                current_seq, IGC_info = self.get_one_IGC_event(
                    current_seq, branch_IGC_init_Q, branch_IGC_tract_Q,
                    Total_IGC_init_Q, ordered_orlg, display)
                to_write_info = ['_'.join(edge),
                                 str(cummulate_time)] + IGC_info
                self.append_to_log_file(to_write_info, 'IGC')
            else:
                # draw from distribution failure; raised explicitly so the
                # check survives ``python -O``.
                raise AssertionError(
                    'draw_from_distribution returned an unexpected event')

        # Now need to pass the seq to new node
        # need to consider gene duplication loss events here
        self.pass_seq_to_node(current_seq, edge)