def get_seq_at_step(self, step_idx, flanked=False):
    """
    Replay the first `step_idx` entries of the mutation order on top of the
    observed start sequence.

    @param step_idx: how many mutations from `self.mutation_order` to apply
    @param flanked: when True, the left and right flanks are attached to the result

    @return the nucleotide sequence after the `step_idx`-th mutation
    """
    observed = self.obs_seq_mutation
    seq_so_far = observed.start_seq
    for step in range(step_idx):
        pos = self.mutation_order[step]
        # A mutated position takes the nucleotide it has in the end sequence
        target_nucleotide = observed.end_seq[pos]
        seq_so_far = mutate_string(seq_so_far, pos, target_nucleotide)

    if not flanked:
        return seq_so_far
    return observed.left_flank + seq_so_far + observed.right_flank
def create_for_mutation_steps(
        self,
        seq_mut_order,
        left_update_region,
        right_update_region,
):
    """
    Walk through the complete mutation order and compute the feature values at every mutation step.
    Only returns the deltas at each mutation step

    @param seq_mut_order: ImputedSequenceMutations
    @param left_update_region: forwarded unchanged to `update_mutation_step`
    @param right_update_region: forwarded unchanged to `update_mutation_step`

    @return list of FeatureMutationStep (corresponding to after first mutation to before last mutation)
    """
    obs = seq_mut_order.obs_seq_mutation
    region_kwargs = dict(
        left_update_region=left_update_region,
        right_update_region=right_update_region,
    )

    step_deltas = []
    flanked_seq = obs.start_seq_with_flanks
    prev_pos = None
    neighbors_prev = dict()
    mutated_so_far = set()
    for step_idx, pos in enumerate(seq_mut_order.mutation_order):
        neighbors_curr, neighbors_future = self.update_mutation_step(
            step_idx,
            pos,
            prev_pos,
            seq_mut_order,
            flanked_seq,
            mutated_so_far,
            **region_kwargs
        )
        # Feature index of the mutating position, read from the sequence
        # before this step's mutation is applied
        pos_feat_idx = self._get_mutating_pos_feat_idx(pos, flanked_seq, obs)
        step_deltas.append(
            MultiFeatureMutationStep(
                pos_feat_idx,
                pos,
                neighbors_feat_old=neighbors_prev,
                neighbors_feat_new=neighbors_curr,
            ))

        # Apply mutation (positions are unflanked, so shift by the left flank)
        flanked_pos = pos + obs.left_flank_len
        flanked_seq = mutate_string(
            flanked_seq,
            flanked_pos,
            obs.end_seq_with_flanks[flanked_pos],
        )
        mutated_so_far.add(pos)

        # What was "future" for this step is "old" for the next one
        neighbors_prev = neighbors_future
        prev_pos = pos

    num_steps = len(step_deltas)
    if num_steps != obs.num_mutations:
        raise AssertionError("%d vs %d" % (num_steps, obs.num_mutations))
    return step_deltas
def create_remaining_mutation_steps(
        self,
        seq_mut_order,
        update_step_start,
        left_update_region,
        right_update_region,
):
    """
    Compute the feature values for the tail of the mutation order, beginning at the
    `update_step_start`-th step.
    Only returns the deltas at each mutation step

    @param seq_mut_order: ImputedSequenceMutations
    @param update_step_start: which mutation step to start calculating features for
    @param left_update_region: forwarded unchanged to `update_mutation_step`
    @param right_update_region: forwarded unchanged to `update_mutation_step`

    @return list of FeatureMutationStep (corresponding to after `update_step_start`-th mutation to before last mutation)
    """
    region_kwargs = dict(
        left_update_region=left_update_region,
        right_update_region=right_update_region,
    )

    step_deltas = []
    prev_pos = None
    neighbors_prev = dict()

    # Sequence (with flanks) and mutated positions as they stand right before
    # the first step handled here
    seq_now = seq_mut_order.get_seq_at_step(update_step_start, flanked=True)
    mutated_so_far = set(seq_mut_order.mutation_order[:update_step_start])

    obs = seq_mut_order.obs_seq_mutation
    for step_idx in range(update_step_start, obs.num_mutations):
        pos = seq_mut_order.mutation_order[step_idx]
        neighbors_curr, neighbors_future = self.update_mutation_step(
            step_idx,
            pos,
            prev_pos,
            seq_mut_order,
            seq_now,
            mutated_so_far,
            **region_kwargs
        )
        pos_feat_idx = self._get_mutating_pos_feat_idx(pos, seq_now, obs)
        step_deltas.append(
            MultiFeatureMutationStep(
                pos_feat_idx,
                pos,
                neighbors_feat_old=neighbors_prev,
                neighbors_feat_new=neighbors_curr,
            ))

        # Apply mutation (positions are unflanked, so shift by the left flank)
        flanked_pos = pos + obs.left_flank_len
        seq_now = mutate_string(
            seq_now,
            flanked_pos,
            obs.end_seq_with_flanks[flanked_pos],
        )
        mutated_so_far.add(pos)

        # What was "future" for this step is "old" for the next one
        neighbors_prev = neighbors_future
        prev_pos = pos

    return step_deltas
def get_shuffled_mutation_steps_delta(
        self,
        seq_mut_order,
        update_step,
        flanked_seq,
        already_mutated_pos,
        left_update_region,
        right_update_region,
):
    """
    Compute the features for the two consecutive mutation steps `update_step` and
    `update_step` + 1 of the given mutation order.

    @param seq_mut_order: object carrying the mutation order (`mutation_order`) and the
                        observed mutation (`obs_seq_mutation`), e.g. ImputedSequenceMutations
    @param update_step: the index of the mutation step being shuffled with the (`update_step` + 1)-th step
    @param flanked_seq: must be a FLANKED sequence
    @param already_mutated_pos: set of positions that already mutated - dont calculate feature vals for these
                        (passed as-is to both `update_mutation_step` calls; this method does not add to it itself)
    @param left_update_region: forwarded unchanged to `update_mutation_step`
    @param right_update_region: forwarded unchanged to `update_mutation_step`

    @return a tuple with the feature index at this mutation step and the feature mutation step of the next mutation step
    """
    obs_seq_mutation = seq_mut_order.obs_seq_mutation
    first_mutation_pos = seq_mut_order.mutation_order[update_step]
    second_mutation_pos = seq_mut_order.mutation_order[update_step + 1]

    # First step: only the "future" dict is needed; it becomes the "old"
    # neighbor features of the second step.
    _, feat_dict_future = self.update_mutation_step(
        update_step,
        first_mutation_pos,
        None,
        seq_mut_order,
        flanked_seq,
        already_mutated_pos,
        left_update_region=left_update_region,
        right_update_region=right_update_region,
    )
    first_mut_pos_feat_idx = self._get_mutating_pos_feat_idx(
        first_mutation_pos, flanked_seq, obs_seq_mutation)

    # Apply mutation (positions are unflanked, so shift by the left flank)
    curr_mutation_pos = first_mutation_pos + obs_seq_mutation.left_flank_len
    flanked_seq = mutate_string(
        flanked_seq,
        curr_mutation_pos,
        obs_seq_mutation.end_seq_with_flanks[curr_mutation_pos],
    )

    # Second step: evaluated on the sequence after the first mutation; no
    # future dict is needed because nothing past this step is returned.
    feat_dict_curr, _ = self.update_mutation_step(
        update_step + 1,
        second_mutation_pos,
        first_mutation_pos,
        seq_mut_order,
        flanked_seq,
        already_mutated_pos,
        calc_future_dict=False,
        left_update_region=left_update_region,
        right_update_region=right_update_region,
    )
    second_mut_pos_feat_idx = self._get_mutating_pos_feat_idx(
        second_mutation_pos, flanked_seq, obs_seq_mutation)

    return first_mut_pos_feat_idx, MultiFeatureMutationStep(
        second_mut_pos_feat_idx,
        second_mutation_pos,
        neighbors_feat_old=feat_dict_future,
        neighbors_feat_new=feat_dict_curr,
    )
def simulate(self,
             start_seq,
             left_flank=None,
             right_flank=None,
             censoring_time=None,
             percent_mutated=None,
             with_replacement=False,
             obs_seq_mutation=None):
    """
    Simulate mutations on a sequence until a stopping condition is met.

    @param start_seq: string for the original sequence; includes flanks unless they are provided by left_flank/right_flank
    @param left_flank: the left flank
    @param right_flank: the right flank
    @param censoring_time: how long to mutate the sequence for
    @param percent_mutated: percent of sequence to mutated; compared as
                        `percent_mutated * len(start_seq)` (after flanks are stripped),
                        i.e. it is used as a fraction of the unflanked length
    @param with_replacement: True = a position can mutate multiple times, False = a position can mutate at most once
    @param obs_seq_mutation: passed through to `feature_generator.create_for_sequence`

    @return FullSequenceMutations, ending sequence and entire history of mutations
    @raises ValueError: if `with_replacement` is set but neither `censoring_time` nor
                        `percent_mutated` is given, since the simulation could then never stop
    """
    # With replacement the set of mutable positions never shrinks, so one of
    # the two explicit stopping rules is required for the loop to terminate.
    if with_replacement and censoring_time is None and percent_mutated is None:
        raise ValueError(
            "simulate with with_replacement=True needs censoring_time or "
            "percent_mutated; otherwise the simulation never terminates")

    mutations = []

    if left_flank is None and right_flank is None:
        # Flanks are embedded in start_seq: peel them off using the motif
        # flank lengths of the feature generator.
        left_len = self.feature_generator.max_left_motif_flank_len
        right_start = len(start_seq) - self.feature_generator.max_right_motif_flank_len
        left_flank = start_seq[:left_len]
        right_flank = start_seq[right_start:]
        start_seq = start_seq[left_len:right_start]

    pos_to_mutate = set(range(len(start_seq)))
    intermediate_seq = start_seq
    last_mutate_time = 0
    while pos_to_mutate:
        # TODO: For speedup, we don't need to recalculate all the features.
        if with_replacement:
            feature_vec_dict = self.feature_generator.create_for_sequence(
                intermediate_seq,
                left_flank,
                right_flank,
                obs_seq_mutation=obs_seq_mutation)
        else:
            # Only positions that have not mutated yet need feature vectors
            feature_vec_dict = self.feature_generator.create_for_sequence(
                intermediate_seq,
                left_flank,
                right_flank,
                do_feat_vec_pos=pos_to_mutate,
                obs_seq_mutation=obs_seq_mutation)

        mutate_time_delta, mutate_pos, nucleotide_target = self._sample_mutation(
            feature_vec_dict, intermediate_seq, pos_to_mutate)
        mutate_time = last_mutate_time + mutate_time_delta

        # The sampled mutation is discarded when a stopping rule fires
        if censoring_time is not None and censoring_time < mutate_time:
            break
        elif percent_mutated is not None and len(
                mutations) >= percent_mutated * len(start_seq):
            break

        last_mutate_time = mutate_time
        if not with_replacement:
            pos_to_mutate.remove(mutate_pos)

        mutations.append(
            MutationEvent(
                mutate_time,
                mutate_pos,
                nucleotide_target,
            ))
        intermediate_seq = mutate_string(intermediate_seq, mutate_pos,
                                         nucleotide_target)

    return FullSequenceMutations(
        start_seq,
        intermediate_seq,
        left_flank,
        right_flank,
        mutations,
    )