Ejemplo n.º 1
0
    def translate_with_fs(self, frameshifts=None):
        """Translate the coding sequence with one specific combination of frameshifts.

        The sequence ``self.sequence[self.cds[0]:]`` is rebuilt by replacing the
        ``[start, stop)`` slice of every frameshift with that frameshift's
        ``sequence``, then translated and wrapped in a ``Protein``. Afterwards the
        non-frameshift variants found in ``self.variantsets`` are translated
        codon-wise at their shifted positions and attached to the protein.

        :param frameshifts: the frameshifts to apply, either as a dict
            ``{(start, stop): Variant}`` or as an iterable of
            ``((start, stop), Variant)`` pairs. This is NOT a VariantSet: a single
            frameshift combination is translated here, possible combinations are
            not enumerated. ``None`` means "no frameshifts".
        :return: the ``Protein`` built from the modified sequence. Its
            ``variantsets`` attribute is a dict keyed by ``(codon_start / 3,
            codon_stop / 3)`` tuples, and its 'frameshifts' metadata receives a
            list of ``(first_affected_aa_position, Variant)`` pairs for the
            frameshifts located at or before the end of the trimmed protein.
        :raises AssertionError: if the protein already carries non-empty
            'frameshifts' metadata.
        """
        if frameshifts is None:
            frameshifts = []
        else:
            if isinstance(frameshifts, dict):
                # Iterating a dict yields only its keys; the loops below need
                # (position, variant) pairs.
                frameshifts = frameshifts.items()
            # Should be already sorted, but... Order by position only, so that
            # Variant objects never have to be compared with each other.
            frameshifts = sorted(frameshifts, key=lambda item: item[0])

        # the number of bases gained or lost by each frameshift. Positive: gain, negative: lost
        fs_shifts = [(fs_pos, fs_pos[0] - fs_pos[1] + len(fs_var))
                     for fs_pos, fs_var in frameshifts]

        def reposition(orig_pos):
            """Return ``orig_pos`` moved by every frameshift that ends at or before its start."""
            start, stop = orig_pos
            new_start, new_stop = start, stop
            for (fs_start, fs_stop), fs_shift in fs_shifts:
                # NOTE(review): a variant that completely spans a frameshift
                # (start < fs_start and stop > fs_stop) is not reported here.
                if fs_start <= start < fs_stop or fs_start < stop <= fs_stop:
                    warnings.warn('Watch out, variant inside frameshift! We\'re not ready to handle '
                        'that yet. %s, (%d-%d)' % (self.id, fs_start, fs_stop))
                if start >= fs_stop:  # frameshift happened before variant, so variant shifts
                    new_start += fs_shift
                    new_stop += fs_shift
            return new_start, new_stop

        fs_positions = []
        new_seq = Seq('', generic_nucleotide)
        original_seq = self.sequence[self.cds[0]:]
        next_start = 0
        for (fs_start, fs_stop), fs_var in frameshifts:
            new_seq += original_seq[next_start:fs_start]
            # register first AA position that current FS affects
            fs_positions.append(len(new_seq) // 3)
            new_seq += fs_var.sequence
            next_start = fs_stop
        # whatever follows the last frameshift (the whole sequence if there was none)
        new_seq += original_seq[next_start:]

        protein = Protein(new_seq.translate(), self)

        # now with the new sequence created it's time to translate non-FS variants. Since the
        # frameshifts moved their relative positions around, we have to use their updated locations.
        new_variantsets = {}
        for vpos, vset in self.variantsets.items():
            start, stop = reposition(vpos)
            cstart = start - (start % 3)  # codon start: round down to a multiple of 3
            cstop = (stop + 2) // 3 * 3  # codon stop: round up to a multiple of 3
            new_vset = VariantSet(vset.genomic_pos, set())

            # TODO: this may introduce superfluous AA-s, that is 'Q'->'QP' when a
            # ''->'P' would be enough. Need to look into it. -- 99% SOLVED.
            for v in vset:
                if v.variant_type in ('FSI', 'FSD'):
                    continue  # frameshifts were already applied to new_seq above
                # translate the whole codon window with the variant's bases substituted in
                aa_seq = (new_seq[cstart:start] + v.sequence + new_seq[stop:cstop]).translate()
                translated_variant = Variant(v.genomic_pos, v.variant_type, aa_seq, 'AA', v.sample_id)
                # TODO: should we carry over metadata? I think we really should!
                # for now, let's just keep a simple reference to the original variant
                translated_variant.log_metadata('origin', v)
                new_vset.add_variant(translated_variant)

            if len(new_vset) > 0:  # frameshift VariantSets would create empty new_vsets, disregard them
                # Logged once per set rather than once per contained variant.
                # TODO: maybe origin should be a first-class attribute not metadata?
                new_vset.log_metadata('origin', vset)
                new_variantsets[(cstart // 3, cstop // 3)] = new_vset

        protein.variantsets = new_variantsets
        protein._trim_after_stop()

        # now let's see which frameshifts were actually kept. As induced stop codons may have terminated
        # the translated sequence, there's a chance that later frameshifts are irrelevant.

        # <= instead of < as the stop codon (Biopython '*') is trimmed away and if a FS induces that
        # as its first affected AA position it DID play a role in what the sequence has become
        # although '*' is not part of the protein sequence itself.
        protein_length = len(protein)
        fs_positions = [pos for pos in fs_positions if pos <= protein_length]
        # zip() stops at the shorter input, i.e. after the frameshifts that were kept
        used_frameshifts = [(pos, fs_var) for pos, (_, fs_var) in zip(fs_positions, frameshifts)]

        assert protein.get_metadata('frameshifts') == [], ("Someone has tweaked with the 'frameshift'"
            " field of protein metadata before. May have come from inherited transcript metadata."
            " Use a different field name in your custom functions.")
        protein.log_metadata('frameshifts', used_frameshifts)
        return protein