def translate_with_fs(self, frameshifts=None):
    """Translate the coding sequence with one specific set of frameshifts applied.

    The frameshift variants are spliced into the sequence that starts at
    ``self.cds[0]``, the result is translated into a ``Protein``, and every
    non-frameshift variant in ``self.variantsets`` is translated at its
    shifted position and attached to that protein.

    :param frameshifts: either a dict mapping ``(start, stop)`` to a single
        frameshift variant, or an iterable of ``((start, stop), variant)``
        pairs. These are single variants, NOT VariantSets: one particular
        frameshift combination is translated here, possible combinations are
        not enumerated. Positions index into ``self.sequence[self.cds[0]:]``.
        Each variant must support ``len()`` and expose ``.sequence``.
        ``None`` means "no frameshifts".
    :return: the translated ``Protein``. Its ``variantsets`` dict is keyed by
        ``(first_codon_index, end_codon_index)`` and its ``'frameshifts'``
        metadata holds ``(aa_position, variant)`` pairs for the frameshifts
        that fall within the protein after trimming at the stop codon.
    :raises AssertionError: if the freshly created protein already carries
        non-empty ``'frameshifts'`` metadata.
    """
    if frameshifts is None:
        frameshifts = []
    elif isinstance(frameshifts, dict):
        # sorting a dict directly would yield only its keys and lose the variants
        frameshifts = sorted(frameshifts.items())
    else:
        frameshifts = sorted(frameshifts)  # should be already sorted, but...

    # the number of bases gained or lost by each frameshift.
    # Positive: gain, negative: lost
    fs_shifts = [(fpos, fpos[0] - fpos[1] + len(fsvar)) for fpos, fsvar in frameshifts]

    def reposition(orig_pos):
        """Return ``orig_pos`` moved by every frameshift that ends at or before its start."""
        start, stop = orig_pos
        new_start, new_stop = start, stop
        for (fs_start, fs_stop), fs_shift in fs_shifts:
            if fs_start <= start < fs_stop or fs_start < stop <= fs_stop:
                warnings.warn('Watch out, variant inside frameshift! We\'re not ready to handle '
                              'that yet. %s, (%d-%d)' % (self.id, fs_start, fs_stop))
            if start >= fs_stop:  # frameshift happened before variant, so variant shifts
                new_start += fs_shift
                new_stop += fs_shift
        return new_start, new_stop

    # Rebuild the nucleotide sequence with the frameshift variants spliced in.
    fs_positions = []
    new_seq = Seq('', generic_nucleotide)
    original_seq = self.sequence[self.cds[0]:]
    next_start = 0
    for (fs_start, fs_stop), fs_var in frameshifts:
        new_seq += original_seq[next_start:fs_start]
        # register first AA position that current FS affects
        # (floor division: this must stay an integer index)
        fs_positions.append(len(new_seq) // 3)
        new_seq += fs_var.sequence
        next_start = fs_stop
    new_seq += original_seq[next_start:]  # whatever follows the last frameshift

    protein = Protein(new_seq.translate(), self)

    # now with the new sequence created it's time to translate non-FS variants.
    # Since the frameshifts moved their relative positions around, we have to
    # use their updated locations. The pairs are walked directly (no
    # intermediate dict keyed by the new position) so that two variant sets
    # landing on the same shifted position are both translated.
    new_variantsets = {}
    for vpos, vset in self.variantsets.items():
        start, stop = reposition(vpos)
        cstart = start - (start % 3)  # codon start
        cstop = (stop + 2) // 3 * 3  # codon stop (rounded up to a codon boundary)
        new_vset = VariantSet(vset.genomic_pos, set())
        # TODO: this may introduce superfluous AA-s, that is 'Q'->'QP' when a
        # ''->'P' would be enough. Need to look into it. -- 99% SOLVED.
        for v in vset:
            if v.variant_type in ('FSI', 'FSD'):
                continue  # frameshifts were already applied to new_seq above
            aa_seq = (new_seq[cstart:start] + v.sequence + new_seq[stop:cstop]).translate()
            translated_variant = Variant(
                v.genomic_pos, v.variant_type, aa_seq, 'AA', v.sample_id)
            # TODO: should we carry over metadata? I think we really should!
            # for now, let's just keep a simple reference to the original variant
            translated_variant.log_metadata('origin', v)
            new_vset.add_variant(translated_variant)
        # TODO: maybe origin should be a first-class attribute not metadata?
        new_vset.log_metadata('origin', vset)
        # frameshift VariantSets would create empty new_vsets, disregard them
        if len(new_vset) > 0:
            new_variantsets[(cstart // 3, cstop // 3)] = new_vset

    protein.variantsets = new_variantsets
    protein._trim_after_stop()

    # now let's see which frameshifts were actually kept. As induced stop codons
    # may have terminated the translated sequence, there's a chance that later
    # frameshifts are irrelevant.
    # <= instead of < as the stop codon (Biopython '*') is trimmed away and if a
    # FS induces that as its first affected AA position it DID play a role in
    # what the sequence has become although '*' is not part of the protein
    # sequence itself.
    kept_positions = [pos for pos in fs_positions if pos <= len(protein)]
    # fs_positions never decreases, so the kept ones are a prefix and line up
    # with the first len(kept_positions) frameshifts; zip stops at the shorter.
    used_frameshifts = [(pos, fs) for pos, (_, fs) in zip(kept_positions, frameshifts)]

    assert protein.get_metadata('frameshifts') == [], ("Someone has tweaked with the 'frameshift'"
        " field of protein metadata before. May have come from inherited transcript metadata."
        " Use a different field name in your custom functions.")
    protein.log_metadata('frameshifts', used_frameshifts)

    return protein