Example #1
0
    def _parse_variant_feature(feature):
        """Turn one variant feature record into a flat dictionary.

        ``feature`` is read positionally: items 1 to 4 are taken as the start
        position, stop position, free-text description and variant id.  The
        description is then searched with regular expressions for an
        amino-acid change (``XXX -> YYY``), PubMed ids, a dbSNP id and the
        parenthesised text.

        Returns the dictionary.  The keys ``old_aa``, ``new_aa``,
        ``prot_change``, ``rsid`` and ``review`` are only set when the
        corresponding pattern occurs in the description.
        """
        pos, pos_stop, desc, variant_id = (
            feature[1], feature[2], feature[3], feature[4])

        variant = {
            'pos': pos,
            'pos_stop': pos_stop,
            'desc': desc,
            'variant_id': variant_id,
        }
        variant['variant_url'] = (
            'http://web.expasy.org/variant_pages/{}.html'.format(variant_id))

        # Amino-acid substitution written as "<old> -> <new>".
        aa_change = re.search(
            r'(?P<old_aa>[A-Z]+) -> (?P<new_aa>[A-Z]+)', desc)
        if aa_change:
            old_aa, new_aa = aa_change.groups()
            variant['old_aa'] = old_aa
            variant['new_aa'] = new_aa
            variant['prot_change'] = 'p.{}{}{}'.format(
                seq3(old_aa), pos, seq3(new_aa))

        variant['pmids'] = re.findall(r'PubMed:(\d+)', desc)

        dbsnp = re.search(r'dbSNP:(rs\d+)', desc)
        if dbsnp:
            assert len(dbsnp.groups()) == 1
            variant['rsid'] = dbsnp.group(1)

        # The review is the parenthesised text with the dbSNP entry removed.
        bracketed = re.search(r'\((.+)\)', desc)
        if bracketed:
            variant['review'] = re.sub(
                r'; dbSNP:rs\d+', '', bracketed.group(1))

        return variant
Example #2
0
 def test_seq1_seq3(self):
     """Check that seq1 and seq3 invert each other on a mixed-case sequence."""
     three_letter = (
         "MetAlaTyrtrpcysthrLYSLEUILEGlYPrOGlNaSnaLapRoTyRLySSeRHisTrpLysThr")
     one_letter = "MAYWCTKLIGPQNAPYKSHWKT"
     # Case is normalised before comparing three-letter output.
     three_letter_upper = three_letter.upper()

     # Direct conversions in both directions.
     self.assertEqual(seq1(three_letter), one_letter)
     self.assertEqual(seq3(one_letter).upper(), three_letter_upper)
     # Round trips starting from either representation.
     self.assertEqual(seq1(seq3(one_letter)), one_letter)
     self.assertEqual(seq3(seq1(three_letter)).upper(), three_letter_upper)
    def residues(self):
        """Return the residues of the structure, padded at the N-terminus.

        Each entry is a ``(resn, resi, structured)`` tuple and the list is
        sorted by residue number.  Entries read from the PyMOL model carry
        ``True``.  When the model starts after residue 1 and the FASTA
        sequence is numbered from 1, the residues stated in the sequence
        before the first modelled one are prepended with ``False``.

        Raises:
            NotImplementedError: when the numbering of the first modelled
                residue cannot be reconciled with the FASTA sequence
                (including a first residue number below 1).
        """
        atoms = self.pymol.cmd.get_model(self.name).atom
        # NOTE(review): the model is not restricted to ``self.chain`` while
        # the FASTA string below is -- confirm this is intended.
        r = sorted({(at.resn, int(at.resi), True) for at in atoms},
                   key=lambda x: x[1])
        fasta = self.pymol.cmd.get_fastastr(
            f'{self.name} and chain {self.chain} and not hetatm')
        # Drop the FASTA header line and glue the sequence lines together.
        s = ''.join(fasta.split('\n')[1:])

        first_stated_resn = seq3(s[0]).upper()
        first_structured_resn, first_structured_resi = r[0][0], r[0][1]

        if first_structured_resi == 1:
            if first_stated_resn != first_structured_resn:
                # case 2. negative numbers??!
                raise NotImplementedError(
                    f'First residues are {r[0:3]} but the sequence is {s}')
            # case 1. all is good.
        elif first_structured_resi > 1:
            if first_stated_resn == first_structured_resn:
                # case 3. The first residue that exists is not number 1.
                pass
            elif seq3(s[first_structured_resi -
                        1]).upper() == first_structured_resn:
                # case 4. the first stated residue is 1, but does not exist.
                # Residue number k sits at s[k - 1], so the missing residues
                # 1 .. first_structured_resi - 1 are s[0] .. s[resi - 2].
                missing = [(seq3(s[i]).upper(), i + 1, False)
                           for i in range(first_structured_resi - 1)]
                r = missing + r
            else:
                raise NotImplementedError(
                    f'First residues are {r[0:3]} but the sequence is {s}')
        else:
            raise NotImplementedError(
                f'First residues are {r[0:3]} but the sequence is {s}')

        return r
Example #4
0
 def test_seq1_seq3(self):
     """Verify seq1/seq3 conversions and round trips, ignoring letter case."""
     mixed_case_3 = (
         "MetAlaTyrtrpcysthrLYSLEUILEGlYPrOGlNaSnaLapRoTyRLySSeRHisTrpLysThr")
     expected_1 = "MAYWCTKLIGPQNAPYKSHWKT"

     # three-letter -> one-letter
     converted_1 = seq1(mixed_case_3)
     self.assertEqual(converted_1, expected_1)

     # one-letter -> three-letter (compared case-insensitively)
     converted_3 = seq3(expected_1)
     self.assertEqual(converted_3.upper(), mixed_case_3.upper())

     # round trips
     self.assertEqual(seq1(seq3(expected_1)), expected_1)
     self.assertEqual(seq3(seq1(mixed_case_3)).upper(), mixed_case_3.upper())
Example #5
0
def out_of_frame_description(s1, s2):
    """
    Give the description of an out of frame difference between two
    proteins. Trailing ``*`` characters are stripped from both proteins
    before they are compared. Also give the position at which the proteins
    start to differ and the end positions (to be compatible with the
    in_frame_description function).

        >>> out_of_frame_description('MTAPQQMT*', 'MTAQQMT*')
        ('p.(Pro4Glnfs*5)', 3, 9, 8)
        >>> out_of_frame_description('MTAPQQMT*', 'MTAQMT*')
        ('p.(Pro4Glnfs*4)', 3, 9, 7)
        >>> out_of_frame_description('MTAPQQT*', 'MTAQQMT*')
        ('p.(Pro4Glnfs*5)', 3, 8, 8)
        >>> out_of_frame_description('MTAPQQT*', 'MTAQQMT')
        ('p.(Pro4Glnfs*?)', 3, 8, 7)

    @arg s1: The original protein.
    @type s1: unicode
    @arg s2: The mutated protein.
    @type s2: unicode

    @return: A tuple of:
        - unicode ; Protein description of the change.
        - int     ; First position of the change.
        - int     ; Last position of the first protein.
        - int     ; Last position of the second protein.
    @rtype: tuple(unicode, int, int, int)

    @todo: More intelligently handle longest_common_prefix().
    """
    original = s1.rstrip("*")
    mutated = s2.rstrip("*")
    original_len, mutated_len = len(original), len(mutated)
    lcp = len(longest_common_prefix(original, mutated))
    # Without a stop codon in the mutated protein the distance to the new
    # stop is unknown and reported as "?".
    mutated_has_stop = "*" in s2

    if lcp == mutated_len:  # NonSense mutation.
        if lcp == original_len:  # Is this correct?
            return ("p.(=)", 0, 0, 0)
        description = "p.(%s%i*)" % (seq3(s1[lcp]), lcp + 1)
        return (description, lcp, len(s1), lcp)

    if lcp == original_len:
        # Extension past the original stop.
        # http://www.hgvs.org/mutnomen/FAQ.html#nostop
        stop = str(abs(original_len - mutated_len)) if mutated_has_stop else "?"
        description = "p.(*%i%sext*%s)" % (
            original_len + 1, seq3(s2[original_len]), stop)
        return (description, original_len, len(s1), len(s2))

    # Frame shift.
    # http://www.hgvs.org/mutnomen/FAQ.html#nostop
    stop = str(mutated_len - lcp + 1) if mutated_has_stop else "?"
    description = "p.(%s%i%sfs*%s)" % (
        seq3(s1[lcp]), lcp + 1, seq3(s2[lcp]), stop)
    return (description, lcp, len(s1), len(s2))
Example #6
0
def out_of_frame_description(s1, s2):
    """
    Describe an out of frame difference between two proteins. Both
    proteins are compared with their trailing ``*`` removed. The position
    at which the proteins start to differ and the end positions are
    returned as well (to be compatible with the in_frame_description
    function).

        >>> out_of_frame_description('MTAPQQMT*', 'MTAQQMT*')
        ('p.(Pro4Glnfs*5)', 3, 9, 8)
        >>> out_of_frame_description('MTAPQQMT*', 'MTAQMT*')
        ('p.(Pro4Glnfs*4)', 3, 9, 7)
        >>> out_of_frame_description('MTAPQQT*', 'MTAQQMT*')
        ('p.(Pro4Glnfs*5)', 3, 8, 8)
        >>> out_of_frame_description('MTAPQQT*', 'MTAQQMT')
        ('p.(Pro4Glnfs*?)', 3, 8, 7)

    @arg s1: The original protein.
    @type s1: unicode
    @arg s2: The mutated protein.
    @type s2: unicode

    @return: A tuple of:
        - unicode ; Protein description of the change.
        - int     ; First position of the change.
        - int     ; Last position of the first protein.
        - int     ; Last position of the second protein.
    @rtype: tuple(unicode, int, int, int)

    @todo: More intelligently handle longest_common_prefix().
    """
    s1_seq, s2_seq = s1.rstrip('*'), s2.rstrip('*')
    lcp = len(longest_common_prefix(s1_seq, s2_seq))
    n1, n2 = len(s1_seq), len(s2_seq)

    if lcp == n2 and lcp == n1:
        # Is this correct?
        result = ('p.(=)', 0, 0, 0)
    elif lcp == n2:
        # NonSense mutation.
        result = ('p.(%s%i*)' % (seq3(s1[lcp]), lcp + 1), lcp, len(s1), lcp)
    elif lcp == n1:
        # http://www.hgvs.org/mutnomen/FAQ.html#nostop
        if '*' in s2:
            stop = unicode(abs(n1 - n2))
        else:
            stop = '?'
        change = 'p.(*%i%sext*%s)' % (n1 + 1, seq3(s2[n1]), stop)
        result = (change, n1, len(s1), len(s2))
    else:
        # http://www.hgvs.org/mutnomen/FAQ.html#nostop
        if '*' in s2:
            stop = unicode(n2 - lcp + 1)
        else:
            stop = '?'
        change = 'p.(%s%i%sfs*%s)' % (
            seq3(s1[lcp]), lcp + 1, seq3(s2[lcp]), stop)
        result = (change, lcp, len(s1), len(s2))

    return result
Example #7
0
    def process_aa_position(self, bdb, gene, r, ra, record, genepos):
        """Validate an amino-acid change and attach the matching Effect to ``ra``.

        The reference residue ``r.AAref`` is checked against the translation
        of ``record.seq`` at ``r.AApos``.  When ``genepos`` is truthy the
        alternative residue is checked too, using a copy of the sequence
        into which ``r.ALT`` has been spliced.  An ``Effect`` matching the
        change is looked up; if none exists a new one is built and saved.

        Returns the ``Effect`` assigned to ``ra.effect``, or ``None`` after
        writing a message to ``self.stderr`` when one of the checks fails.
        """
        ref_ann = str(record.seq.translate())[r.AApos - 1]
        if r.AAref != ref_ann:
            self.stderr.write((
                "Reference AA does not match the current annotation '%s' != '%s' "
                % (ref_ann, r.AAref)) + json.dumps(r.to_dict()))
            return None
        if genepos:
            # NOTE(review): the slice offsets assume a particular convention
            # for genepos / r.ALT (anchor base included?) -- confirm.
            mut = record.seq[:genepos - 1] + Seq(
                r.ALT) + record.seq[genepos - 2 + len(r.ALT):]
            alt_ann = str(mut.translate())[r.AApos - 1]
            if r.AAalt == "fs" and (len(r.ALT) - 1) % 3 != 0:
                self.stderr.write(("Invalid FS" + json.dumps(r.to_dict())))
                return None

            if (r.AAalt != "fs") and (alt_ann != r.AAalt):
                # Report the alternative residues that actually disagree
                # (the message previously repeated the reference pair).
                self.stderr.write((
                    "Alternative AA does not match the nucleotide mutation effect '%s' != '%s' "
                    % (alt_ann, r.AAalt)) + json.dumps(r.to_dict()))
                return None

        # NOTE(review): the lookup filters on the raw r.AAalt while new rows
        # store "*" for "fs"/"STOP", so those effects look like they can
        # never be found again by this query -- confirm.
        effect_query = Effect.objects.filter(transcript=r.LocusTag,
                                             gene=gene,
                                             ref_organism=bdb,
                                             aa_pos=r.AApos,
                                             aa_ref=r.AAref,
                                             aa_alt=r.AAalt)
        # hgvs_p="%s%i%s" % (r.AAref, r.AApos, r.AAalt))
        if not effect_query.exists():
            if r.AAalt == "fs":
                variant_type = 'frameshift_variant'
            elif r.AAalt == "STOP":
                variant_type = 'stop_gained'
            elif r.AAref != r.AAalt:
                variant_type = "missense_variant"
            else:
                variant_type = 'synonymous_variant'

            new_effect = Effect(transcript=r.LocusTag,
                                ref_organism=bdb,
                                variant_type=variant_type,
                                gene=gene)
            new_effect.aa_pos = r.AApos
            new_effect.aa_ref = r.AAref
            # Frame shifts and stops are both stored as "*".
            new_effect.aa_alt = "*" if r.AAalt in ["fs", "STOP"] else r.AAalt
            # "*" is kept as-is; any other residue is written with its
            # three-letter code.
            new_effect.hgvs_p = (
                (seq3(new_effect.aa_ref) if new_effect.aa_ref != "*" else "*")
                + str(new_effect.aa_pos) +
                (seq3(new_effect.aa_alt) if new_effect.aa_alt != "*" else "*"))
            new_effect.save()

        else:
            new_effect = effect_query.get()

        ra.effect = new_effect
        ra.save()
        return new_effect
Example #8
0
def new_cc(sequences, coords):
    """Build a PDB model with one chain per sequence from backbone coordinates.

    Every residue receives five atoms (N, CA, C, O, CB) whose positions are
    taken from consecutive rows of ``coords``.  Chains are lettered from
    ``'A'`` in the order of ``sequences``, residues are numbered from 1, and
    atom serial numbers run continuously across chains.  The number of
    residues per chain is limited by the length of the first sequence and by
    the number of coordinate rows available per chain.

    Returns the populated ``PDB.Model.Model``.
    """
    # (atom name, padded full name, element) for the five atoms of a residue.
    backbone = (
        ('N', ' N  ', 'N'),
        ('CA', ' CA ', 'C'),
        ('C', ' C  ', 'C'),
        ('O', ' O  ', 'O'),
        ('CB', ' CB ', 'C'),
    )
    helix_count = len(sequences)
    residues_per_chain = int(
        min(len(sequences[0]) * 5, coords.shape[0] / helix_count) / 5)
    model = PDB.Model.Model(0)

    coord_row = 0  # row of ``coords`` used for the next atom
    for chain_i, sequence in enumerate(sequences):
        chain = PDB.Chain.Chain(chr(ord('A') + chain_i))
        for res_i in range(1, residues_per_chain + 1):
            residue = PDB.Residue.Residue(
                (' ', res_i, ' '), seq3(sequence[res_i - 1]).upper(), '    ')
            for name, padded_name, element in backbone:
                # bfactor 30, occupancy 1, blank altloc; serial is 1-based.
                residue.add(
                    PDB.Atom.Atom(name, coords[coord_row], 30, 1, ' ',
                                  padded_name, coord_row + 1, element))
                coord_row += 1
            chain.add(residue)
        model.add(chain)
    return model
Example #9
0
def mutate_codon(codon_in, codon_use_table):
    """Pick a different synonymous codon, weighted by host codon usage.

    Args:
        codon_in (Bio.Seq.Seq): A single codon.
        codon_use_table (dict{str, list[list, list]}): Maps each upper-case
            three-letter amino acid code to two lists: the synonymous
            codons for that amino acid and the frequency with which each
            of them is used.

    Returns:
        Bio.Seq.Seq: ``codon_in`` itself when the amino acid has a single
        codon, otherwise a codon drawn from the table that differs from
        ``codon_in``.
    """
    one_letter = CodonTable.standard_dna_table.forward_table[str(codon_in)]
    AA = seq3(one_letter).upper()

    candidates, weights = codon_use_table[AA]
    if len(candidates) == 1:
        # Nothing to swap to.
        return codon_in

    # Redraw until the sampled codon differs from the input.
    while True:
        codon_out = random.choices(candidates, weights)[0]
        if not (codon_in == codon_out):
            break

    message = "Mutating {} codon from {} to {}".format(AA, codon_in, codon_out)
    logger.detail(message)

    return codon_out
Example #10
0
def get_protein():
    ''' Translate the DNA sequence entered by the user.
    Parameters
    __________
    None; the global variable sequence_text is read.

    Return
    ______
    Nothing. A showinfo box displays the three-letter amino acid sequence
    with residues separated by dashes, or a showerror box is shown when the
    input is empty or its length is not a multiple of 3.'''

    dna_text = str(sequence_text.get())
    if len(dna_text) == 0 or len(dna_text) % 3 != 0:
        messagebox.showerror(
            "!",
            "Sequence needs to be a multiple of 3 or sequence should not be 0."
        )
        return

    protein = Seq(dna_text, IUPAC.unambiguous_dna).transcribe().translate()
    three_letter_aa = str(seq3(protein))

    # Cut the three-letter string into groups of three characters.
    groups = []
    for offset in range(0, len(three_letter_aa), 3):
        groups.append(three_letter_aa[offset:offset + 3])

    messagebox.showinfo(
        "!", "Final Converted Protein is: " + "-".join(groups))
Example #11
0
def back_translate(self):
    """Return a DNA sequence encoding this protein as a new Seq object.

    Each residue is converted to its upper-case three-letter code and
    replaced by the first codon listed for it in
    ``CodonUsage.SynonymousCodons``.  Residues are upper-cased first, so
    lower-case protein sequences are accepted as well.

    Trying to back-translate a DNA or RNA sequence raises an
    exception:

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import IUPAC
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUG", IUPAC.unambiguous_rna)
    >>> messenger_rna.back_translate()
    Traceback (most recent call last):
    ...
    ValueError: Nucleic acids cannot be back translated!
    """
    base = Bio.Alphabet._get_base_alphabet(self.alphabet)
    if not isinstance(base, Bio.Alphabet.ProteinAlphabet):
        raise ValueError("Nucleic acids cannot be back translated!")

    # right now this just uses the first listed codon for each AA
    # TODO: select codons with a weighted average using random.choice
    return Seq(
        "".join(
            CodonUsage.SynonymousCodons[seq3(AA).upper()][0]
            # seq3 only recognises upper-case one-letter codes.
            for AA in str(self).upper()
        ),
        IUPAC.unambiguous_dna,
    )
Example #12
0
def back_translate(self):
    """Build a new Seq holding a DNA sequence that encodes this protein.

    For every amino acid the first codon in its synonymous codons list is
    chosen; codon optimization is required after back translation.  The
    protein is upper-cased before the lookup.

    Trying to back-translate a DNA or RNA sequence raises an
    exception:

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import IUPAC
    >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUG", IUPAC.unambiguous_rna)
    >>> messenger_rna.back_translate()
    Traceback (most recent call last):
    ...
    ValueError: Nucleic acids cannot be back translated!
    """
    base_alphabet = Bio.Alphabet._get_base_alphabet(self.alphabet)
    is_protein = isinstance(base_alphabet, Bio.Alphabet.ProteinAlphabet)
    if not is_protein:
        raise ValueError("Nucleic acids cannot be back translated!")

    codons = []
    for AA in str(self).upper():
        three_letter = seq3(AA).upper()
        # always use the first codon in the synonymous codons list
        codons.append(CodonUsage.SynonymousCodons[three_letter][0])

    return Seq("".join(codons), IUPAC.unambiguous_dna)
Example #13
0
def show(seq, start=0, stop=None, width=None, peprep=3):
    """
        Display a nucleotide or peptide sequence together with a graduated ruler.

        arguments:
            seq: Seq or SeqRecord to display
            start and stop: bounds of the display; both must be integers (int)
            width: width of the display; must be a multiple of 10
            peprep: 1 or 3: amino acid representation. By default peptide
                sequences are displayed with 3 letters per amino acid

        Nucleotide sequences are passed to shown(), peptide sequences to
        showp(); nothing is returned for any other input.
        """

    alphabet = testalpha(seq)
    is_nucleotide = (alphabet == generic_nucleotide
                     or alphabet == IUPAC.unambiguous_dna
                     or alphabet == IUPAC.unambiguous_rna)
    if is_nucleotide:
        return shown(seq, start=start, stop=stop, width=width)

    #elif testalpha(seq) == IUPAC.protein or testalpha(seq) == IUPAC.extended_protein:

    letters_per_residue = langage(seq)
    if letters_per_residue == 1 or letters_per_residue == 3:
        if peprep == 3 and letters_per_residue == 1:
            # Expand one-letter codes to three-letter codes before display.
            seq = seq3(toSeq(seq))

        return showp(seq, start=start, stop=stop, width=width)
Example #14
0
    def write_pdb(self, pdb_path):
        """
            Write a pdb file by threading the query sequence on the template
            backbone (N, CA and C) coordinates.

            Positions where the query or the template residue is "-" are
            skipped. The file starts with a REMARK line and ends with an
            "END" line.

            Args:
                pdb_path (str): Path of the pdb file to create.
        """
        atom_line = (
            "{:6s}{:5d} {:^4s} {:>3s}{:>2s}{:4d}{:>12.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f}{:>12s}\n"
        )
        # (atom name, attribute on the template residue, element symbol)
        backbone = (("N", "n_atom", "N"),
                    ("CA", "ca_atom", "C"),
                    ("C", "c_atom", "C"))

        with open(pdb_path, "w") as file:
            # Extra informations on the template used to generate the pdb file
            remark = (
                "REMARK Threading of query sequence on the {:s} template #{:d}.\n"
                .format(self.template.name, self.num))
            file.write(remark)

            serial = 1
            residue_numbers = range(self.query.first, self.query.last + 1)
            for ind, count_res in enumerate(residue_numbers):
                res_t = self.template.residues[ind]
                res_q = self.query.residues[ind]
                if res_q.name == "-" or res_t.name == "-":
                    continue

                resname = seq3(res_q.name).upper()
                for atom_name, attribute, element in backbone:
                    xyz = getattr(res_t, attribute).coords
                    file.write(
                        atom_line.format("ATOM", serial, atom_name, resname,
                                         "A", count_res, xyz[0], xyz[1],
                                         xyz[2], 1.00, 0, element))
                    serial += 1

            # Last line of the created pdb file
            file.write("END\n")
Example #15
0
def prepare_output(AAs, count_enriched_AAs):
    """Return the count of each amino acid in ``AAs``, in the same order.

    Each one-letter entry of ``AAs`` is converted to its upper-case
    three-letter code and looked up in ``count_enriched_AAs``; amino acids
    that are absent from it contribute 0.
    """
    three_letter_codes = [seq3(AA).upper() for AA in AAs]
    return [
        count_enriched_AAs[code] if code in count_enriched_AAs else 0
        for code in three_letter_codes
    ]
Example #16
0
def PrintMutationPair(ccd_pair):
    """Write every possible mutation of every codon to the file ``mut_tmp``.

    ``ccd_pair`` is an iterable of pairs whose first item is an identifier
    and whose second item is a nucleotide sequence.  The sequence is cut
    into consecutive triplets; for each triplet that does not code for
    ``*``, one tab-separated line is written per mutated amino acid returned
    by ``GenerateAllPossibleMutation``:
    identifier, codon index, triplet, original residue, mutated residue
    (residues as upper-case three-letter codes).  The codon index starts at
    1 and is not advanced for ``*`` codons.
    """
    fileout = "mut_tmp"
    # The context manager closes the file even if a codon fails to process.
    with open(fileout, "w") as fileobj:
        for pair in ccd_pair:
            uid, seq = pair[0], pair[1]
            idx = 1
            # range() works on both Python 2 and Python 3.
            for start in range(0, len(seq), 3):
                nu3 = seq[start:start + 3]
                orig_aa, mutate_aa = GenerateAllPossibleMutation(nu3)
                if orig_aa == "*":
                    continue
                orig_aa3 = seq3(orig_aa).upper()
                for each_mutate in mutate_aa:
                    content = [uid, idx, nu3, orig_aa3,
                               seq3(each_mutate).upper()]
                    fileobj.write("\t".join(map(str, content)) + "\n")
                idx = idx + 1
Example #17
0
def out_of_frame_description(s1, s2):
    """
    Describe an out of frame difference between two proteins, together
    with the position at which the proteins start to differ and the end
    positions (to be compatible with the in_frame_description function).

        >>> out_of_frame_description('MTAPQQMT', 'MTAQQMT')
        ('p.(Pro4Glnfs*5)', 3, 8, 7)
        >>> out_of_frame_description('MTAPQQMT', 'MTAQMT')
        ('p.(Pro4Glnfs*4)', 3, 8, 6)
        >>> out_of_frame_description('MTAPQQT', 'MTAQQMT')
        ('p.(Pro4Glnfs*5)', 3, 7, 7)

    @arg s1: The original protein.
    @type s1: unicode
    @arg s2: The mutated protein.
    @type s2: unicode

    @return: A tuple of:
        - unicode ; Protein description of the change.
        - int     ; First position of the change.
        - int     ; Last position of the first protein.
        - int     ; Last position of the second protein.
    @rtype: tuple(unicode, int, int, int)

    @todo: More intelligently handle longest_common_prefix().
    """
    len1, len2 = len(s1), len(s2)
    lcp = len(longest_common_prefix(s1, s2))

    if lcp == len2:  # NonSense mutation.
        if lcp == len1:  # Is this correct?
            return ('p.(=)', 0, 0, 0)
        truncation = 'p.(%s%i*)' % (seq3(s1[lcp]), lcp + 1)
        return (truncation, lcp, len1, lcp)

    if lcp == len1:
        # The mutated protein extends past the end of the original one.
        extension = 'p.(*%i%sext*%i)' % (
            len1 + 1, seq3(s2[len1]), abs(len1 - len2))
        return (extension, len1, len1, len2)

    # The proteins diverge before either of them ends.
    frame_shift = 'p.(%s%i%sfs*%i)' % (
        seq3(s1[lcp]), lcp + 1, seq3(s2[lcp]), len2 - lcp + 1)
    return (frame_shift, lcp, len1, len2)
Example #18
0
def out_of_frame_description(s1, s2):
    """
    Give the description of an out of frame difference between two
    proteins, plus the position at which the proteins start to differ and
    the end positions (to be compatible with the in_frame_description
    function).

        >>> out_of_frame_description('MTAPQQMT', 'MTAQQMT')
        ('p.(Pro4Glnfs*5)', 3, 8, 7)
        >>> out_of_frame_description('MTAPQQMT', 'MTAQMT')
        ('p.(Pro4Glnfs*4)', 3, 8, 6)
        >>> out_of_frame_description('MTAPQQT', 'MTAQQMT')
        ('p.(Pro4Glnfs*5)', 3, 7, 7)

    @arg s1: The original protein.
    @type s1: unicode
    @arg s2: The mutated protein.
    @type s2: unicode

    @return: A tuple of:
        - unicode ; Protein description of the change.
        - int     ; First position of the change.
        - int     ; Last position of the first protein.
        - int     ; Last position of the second protein.
    @rtype: tuple(unicode, int, int, int)

    @todo: More intelligently handle longest_common_prefix().
    """
    lcp = len(longest_common_prefix(s1, s2))
    reaches_end_of_s1 = lcp == len(s1)
    reaches_end_of_s2 = lcp == len(s2)

    if not reaches_end_of_s1 and not reaches_end_of_s2:
        # Frame shift: both proteins continue after the common prefix.
        text = 'p.(%s%i%sfs*%i)' % (
            seq3(s1[lcp]), lcp + 1, seq3(s2[lcp]), len(s2) - lcp + 1)
        outcome = (text, lcp, len(s1), len(s2))
    elif not reaches_end_of_s2:
        # Extension: the original protein is a prefix of the mutated one.
        text = 'p.(*%i%sext*%i)' % (
            len(s1) + 1, seq3(s2[len(s1)]), abs(len(s1) - len(s2)))
        outcome = (text, len(s1), len(s1), len(s2))
    elif not reaches_end_of_s1:
        # NonSense mutation.
        text = 'p.(%s%i*)' % (seq3(s1[lcp]), lcp + 1)
        outcome = (text, lcp, len(s1), lcp)
    else:
        # Is this correct?
        outcome = ('p.(=)', 0, 0, 0)

    return outcome
Example #19
0
 def labeling(self, label):
     """Return ``label``, or build a default label when it is ``None``.

     With a single amino acid the default is its three-letter code followed
     by the 1-based position.  With several, ``self.amino_acids`` is sorted
     in place and the label lists the distinct stereochemistry features
     (the text before ``(`` for entries containing ``Similar``), the amino
     acids joined as ``A, B or C`` and the 1-based position.
     """
     # Identity test: only a missing label triggers the default, even for
     # objects with a custom ``__eq__``.
     if label is not None:
         return label

     if len(self.amino_acids) == 1:
         return seq3(self.amino_acids[0]) + str(self.position + 1)

     self.amino_acids = sorted(self.amino_acids)
     sets = sorted({
         x.split('(')[0] if 'Similar' in x else x
         for x in self.sthereochemistry
     })
     feature = ', '.join(sets) if len(sets) > 1 else sets[0]
     return '%s (%s or %s) %d' % (
         feature, ', '.join(map(seq3, self.amino_acids[:-1])),
         seq3(self.amino_acids[-1]), self.position + 1)
Example #20
0
    def _generate_template(self):
        """Load the peptide template and store its renamed backbone in ``self._tpl``.

        The template is ``self.custom_template`` when given, otherwise the
        ``<length>mer.pdb`` file under ``_TEMPLATES_DIR`` for the length of
        ``self.pep_seq``.  Residues are renamed after ``self.pep_seq``
        (upper-case three-letter codes), only the atoms named C, O, CA, N
        and OXT are kept, and the chain id is set to ``B``.
        """
        bbnames = ['C', 'O', 'CA', 'N', 'OXT']
        lgt = len(self.pep_seq)
        if self.custom_template is None:
            # Build the file name first: ``/`` binds tighter than ``+``, so
            # without the parentheses the suffix would be added to the
            # result of the path join instead of to the file name.
            tpl = _TEMPLATES_DIR / (str(lgt) + 'mer.pdb')
        else:
            tpl = self.custom_template
        tpl = prody.parsePDB(tpl)

        for r, newname in zip(tpl.iterResidues(), self.pep_seq):
            r.setResname(seq3(newname).upper())
        tpl = tpl.select('name ' + ' '.join(bbnames)).copy()
        tpl.setChids('B')
        self._tpl = tpl
Example #21
0
    def __proteinDescription(self):
        """
        Build the HGVS description of the raw variant stored in this class.

        Note that this function relies on the absence of values to make the
        correct description. Also see the comment in the class definition.

        @returns: The HGVS description of the raw variant stored in this class.
        @rtype: unicode
        """
        if self.type == "unknown":
            return "?"
        if not self.start:
            return "="

        # Leading residue: the deleted residues when there are any,
        # otherwise '*' for an extension or the start amino acid.
        if self.deleted:
            pieces = ["%s" % seq3(self.deleted)]
        elif self.type == "ext":
            pieces = ['*']
        else:
            pieces = ["%s" % seq3(self.startAA)]

        pieces.append("%i" % self.start)
        if self.end:
            pieces.append("_%s%i" % (seq3(self.endAA), self.end))
        # These types are not spelled out at this point of the description.
        if self.type not in ["subst", "stop", "ext", "fs"]:
            pieces.append(self.type)
        if self.inserted:
            pieces.append("%s" % seq3(self.inserted))
        descr = "".join(pieces)

        if self.type == "stop":
            return descr + '*'
        if self.term:
            return descr + "%s*%i" % (self.type, self.term)
        return descr
Example #22
0
def to_string_aminoacids3(item, group_indices='all', check=True):
    """Convert a one-letter amino acid string to three-letter codes.

    When ``check`` is true, ``item`` and ``group_indices`` are first passed
    through ``digest_item`` / ``digest_group_indices``.  The conversion
    itself is done by Biopython's ``seq3`` on the whole of ``item``.

    Raises:
        LibraryNotFoundError: if Biopython cannot be imported.
    """

    if check:

        digest_item(item, 'string:aminoacids1')
        group_indices = digest_group_indices(group_indices)

    try:
        from Bio.SeqUtils import seq3
    except ImportError:
        # Only a failed import means the library is missing; other
        # exceptions (e.g. KeyboardInterrupt) must not be masked.
        raise LibraryNotFoundError('biopython')

    tmp_item = seq3(item)

    return tmp_item
Example #23
0
def mutate_codon(codon_in, codon_use_table):
    """Replace a codon by a different synonymous codon.

    Args:
        codon_in: A single codon; ``str(codon_in)`` must be a key of the
            standard DNA codon table.
        codon_use_table: Maps each upper-case three-letter amino acid code
            to a pair ``(synonymous codons, usage frequencies)``.

    Returns:
        ``codon_in`` when the amino acid has only one codon, otherwise a
        codon sampled with the given frequencies that differs from
        ``codon_in``.
    """
    AA = seq3(
        CodonTable.standard_dna_table.forward_table[str(codon_in)]).upper()

    synonymous_codons, codon_use_freq = codon_use_table[AA]
    if len(synonymous_codons) == 1:
        return codon_in

    # pick new codon: resample until it differs from the input
    codon_out = codon_in
    while codon_in == codon_out:
        codon_out = random.choices(synonymous_codons, codon_use_freq).pop()

    # Log the whole input codon (not just its second nucleotide).
    logger.detail("mutating [{0}] codon from {1} to {2}".format(
        AA, codon_in, codon_out))

    return codon_out
Example #24
0
def resample_codons(dna_sequence, codon_use_by_aa):
    """Re-encode a DNA sequence with randomly sampled synonymous codons.

    Args:
        dna_sequence: Sequence object providing ``translate()``; every
            residue of the translation is re-encoded.
        codon_use_by_aa: Maps each upper-case three-letter amino acid code
            to the arguments passed to ``random.choices`` (as used here:
            the candidate codons followed by their weights).

    Returns:
        Seq: The resampled sequence with the ``IUPAC.unambiguous_dna``
        alphabet.
    """
    sampled_codons = []
    for AA in dna_sequence.translate():
        choice_args = codon_use_by_aa[seq3(AA).upper()]
        sampled_codons.append(random.choices(*choice_args).pop())

    resampled_dna = "".join(sampled_codons)
    return Seq(resampled_dna, IUPAC.unambiguous_dna)
def get_protein():
    """Translate the DNA held in ``sequence_text`` and display the protein.

    The protein is shown in an info box as three-letter codes separated by
    dashes.  An error box is shown instead when the input is empty or its
    length is not a multiple of 3.
    """
    entered = str(sequence_text.get())
    length_ok = len(entered) != 0 and len(entered) % 3 == 0
    if not length_ok:
        messagebox.showerror(
            "!",
            "Sequence needs to be a multiple of 3 or sequence should not be 0."
        )
        return

    dna = Seq(entered, IUPAC.unambiguous_dna)
    protein = dna.transcribe().translate()
    residues = str(seq3(protein))
    # One chunk of three characters per residue.
    chunks = [residues[i:i + 3] for i in range(0, len(residues), 3)]
    messagebox.showinfo(
        "!", "Final Converted Protein is: " + "-".join(chunks))
Example #26
0
def get_percentage_aa(value):
    """Build one bar trace of amino acid percentages per FASTA record.

    The FASTA file is selected with ``choose_fasta(value)``.  For every
    record the values returned by ``get_amino_acids_percent()`` are
    multiplied by 100 and plotted against the three-letter amino acid
    names.  The trace is named after the second ``:``-separated field of
    the part of the record id that precedes the first ``|``.

    Returns the list of ``go.Bar`` traces.
    """
    traces = []
    with open(choose_fasta(value), "r") as file_fasta:
        for record in SeqIO.parse(file_fasta, "fasta"):
            chain_fields = record.id.split("|")[0].split(":")
            analysis = pp.ProteinAnalysis(str(record.seq))
            composition = analysis.get_amino_acids_percent()

            aa_names = [seq3(letter) for letter in composition]
            aa_percent = [share * 100 for share in composition.values()]

            bar = go.Bar(x=aa_names, y=aa_percent,
                         name="Chain " + chain_fields[1])
            traces.append(bar)
    return traces
Example #27
0
def mutate_codon(codon_in, codon_use_table):
    """Swap a codon for another synonymous codon, sampled by usage frequency.

    Args:
        codon_in (Bio.Seq.Seq): A single codon.
        codon_use_table: Maps each upper-case three-letter amino acid code
            to its synonymous codons and their usage frequencies.
    Returns:
        Bio.Seq.Seq: ``codon_in`` when no alternative codon exists,
        otherwise a sampled codon different from ``codon_in``.
    """
    forward_table = CodonTable.standard_dna_table.forward_table
    amino_acid = seq3(forward_table[str(codon_in)]).upper()
    alternatives, frequencies = codon_use_table[amino_acid]

    if len(alternatives) == 1:
        return codon_in

    # keep sampling until the drawn codon is not the input codon
    sampled = codon_in
    while codon_in == sampled:
        sampled = random.choices(alternatives, frequencies).pop()
    return sampled
Example #28
0
def resample_codons(dna_sequence, codon_use_table):
    """Create a new DNA sequence by swapping in sampled synonymous codons.

    The sequence is translated and, for each residue, one codon is drawn
    with ``random.choices`` from the table entry for that amino acid.

    Args:
        dna_sequence (Bio.Seq.Seq): A read-only representation of
            the DNA sequence.
        codon_use_table (dict{str, list[list, list]}): A dictionary with
            each amino acid three-letter code as keys, and a list of two
            lists as values. The first list is the synonymous codons that
            encode the amino acid, the second is the frequency with which
            each synonymous codon is used.

    Returns:
        Bio.Seq.Seq: A read-only representation of the new DNA sequence.
    """

    def sample_codon(AA):
        # The table entry is unpacked straight into random.choices.
        return random.choices(*codon_use_table[seq3(AA).upper()]).pop()

    protein = dna_sequence.translate()
    resampled_dna = "".join([sample_codon(AA) for AA in protein])

    return Seq(resampled_dna, IUPAC.unambiguous_dna)
Example #29
0
    list_of_records = accepted

    if verbose: print len(list_of_records), 'in the middle'
    # accepted = [list_of_records.pop()]
    accepted = []
    while len(list_of_records) > 0:
        r = list_of_records.pop(0)
        duplicate = False
        for q in accepted:
            if r[0:6] == q[0:6]:
                duplicate = True
                break
        if not duplicate:
            accepted.append(r)

    list_of_records = sorted(accepted, key=lambda x: float(x[3]), reverse=True)
    if verbose: print len(list_of_records), 'after filter'
    with open(seqid + ".gff", 'a') as gff:
        for rec in list_of_records:
            strand = rec[5]
            start = rec[1]
            end = rec[2]
            aux = 'product=tRNA-' + seq3(
                rec[0][0]) + ';anticodon=(' + rec[4].lower(
                ) + ');anticodon_position=' + str(
                    rec[6]) + ';label=' + rec[0] + ';structure=' + rec[7]
            linia_output = seqid + '\t' + 'infernal\ttRNA' + '\t' + start + '\t' + end + '\t' + str(
                rec[3]) + '\t' + strand + '\t.\t' + aux + '\n'
            if int(start) <= sequence_len:
                gff.write(linia_output)
Example #30
0
# Method 2: translate the DNA sequence straight to protein.
# (`dna_seq` and `mrna` are created earlier in the script, outside this
# excerpt.)
aa = dna_seq.translate()
print(aa)

# Translate the mRNA, writing stop codons as "@" instead of the default symbol.
print(mrna.translate(stop_symbol="@"))

# Back-transcribe the mRNA to DNA; the result should equal dna_seq.
print(mrna.back_transcribe())

# Chain both steps: transcribe to mRNA, then translate to protein.
print(dna_seq.transcribe().translate())

# Three-letter version of the protein (seq3 takes the one-letter string).
three_letter_aa = seq3(str(aa))
print(three_letter_aa)

# Convert back to one-letter codes; should match the original `aa`.
one_letter_aa = seq1(str(three_letter_aa))
print(one_letter_aa)

# List the names available in the CodonTable module.
print(dir(CodonTable))

# Standard codon table for DNA, looked up by name.
print(CodonTable.unambiguous_dna_by_name["Standard"])

# Standard codon table for RNA, looked up by name.
print(CodonTable.unambiguous_rna_by_name["Standard"])
Example #31
0
def load_cdna_and_polyA(paths,organism,pep_dict,exon_info,failed):
    """Locate each peptide inside its cDNA and record polyA information.

    For every cDNA record whose (gene, transcript id) pair is a key of
    ``pep_dict``, each associated peptide is located in the cDNA by one of
    three strategies, tried in order:

    1. "easy": the frame-0 translation equals the peptide;
    2. "cutting": the peptide occurs inside one of the three forward-frame
       translations;
    3. "alignment": the peptide is aligned with blastp against each frame's
       translation and the highest-scoring HSP is used.

    The located nucleotide stretch is passed to ``findPolyA`` and
    ``grab_AAA_information``; a ``schema.Cdna`` row is added to
    ``db.session`` for every peptide (with start/stop left at 0 when nothing
    was found). The session is not committed here.

    Args:
        paths: mapping whose ``"cdna"`` entry is the path of the cDNA FASTA
            file.
        organism: organism name; used for the output file name
            (``./Data/<organism>_polyA.data``), for the temporary FASTA file
            names and for the database rows.
        pep_dict: mapping from ``(gene, transcript_id)`` tuples to iterables
            of peptide records exposing ``.id`` and ``.seq``.
        exon_info: mapping from transcript id to the exon data handed to
            ``grab_AAA_information``.
        failed: writable file-like object; receives one comma-separated line
            for every peptide that could not be located.

    Raises:
        KeyError: if a matched record's description has no ``gene:`` field
            or its transcript id is missing from ``exon_info``.
    """
    # The output handle must stay open for the whole run because
    # grab_AAA_information() writes to it; the ``with`` guarantees it is
    # closed afterwards (it used to be leaked).
    with open("./Data/"+organism+"_polyA.data",'w') as organism_out:
        with open(paths["cdna"],'r') as cdna_handle:
            cdna_records = list(SeqIO.parse(cdna_handle,"fasta"))
        cdna_size = len(cdna_records)
        next_step = 0
        for cdna_counter, cd in enumerate(cdna_records, 1):
            # Progress report roughly every 10 %. Single-argument print()
            # behaves the same on Python 2 and Python 3.
            if (float(cdna_counter*100)/cdna_size) >= next_step:
                next_step += 10
                print(str(int(float(cdna_counter*100)/cdna_size))+"%")
            # Description fields look like "key:value"; the first token is
            # skipped and tokens without a colon are ignored.
            description = dict(z.split(":",1) for z in cd.description.split()[1:] if ":" in z)
            cdna_transcript_id = cd.id
            cdna_gene = description["gene"]

            gt = (cdna_gene,cdna_transcript_id)
            if gt not in pep_dict:
                continue

            for p in pep_dict[gt]:
                protein_id = p.id
                gene_id = str(gt[0])
                pep_sequence = str(seq3(p.seq))
                cdna_sequence = str(cd.seq)

                cdna_start = 0
                cdna_stop = 0

                # Translate the three forward frames. Each frame is padded
                # with N so its length is a multiple of three (a frame that
                # already is gets one extra NNN codon, as before).
                cdna_translated_list = []
                for x in range(3):
                    frame = cdna_sequence[x:]
                    padded = frame+"N"*(3-len(frame)%3)
                    cdna_translated_list.append(seq3(str(Seq(padded).translate())))
                cut_found = [v for v, translated in enumerate(cdna_translated_list)
                             if pep_sequence in translated]

                #easy
                if pep_sequence == cdna_translated_list[0]:
                    cdna_start = 0
                    cdna_stop = len(cdna_sequence)
                    proper_seq = cdna_sequence
                    AAA_list = findPolyA(proper_seq)
                    grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id])

                #cutting
                elif cut_found:
                    for c in cut_found:
                        # The three-letter text has three characters per
                        # residue, so an offset in it equals the nucleotide
                        # offset within that frame; adding the frame shift
                        # gives the cDNA position.
                        idx = c+cdna_translated_list[c].find(pep_sequence)
                        cdna_start = idx
                        cdna_stop = idx+len(pep_sequence)
                        proper_seq = cdna_sequence[cdna_start:cdna_stop]
                        AAA_list = findPolyA(proper_seq)
                        grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id])

                #alignment
                else:
                    prot_seq = SeqRecord(Seq(seq1(pep_sequence)),id = "prot_seq")
                    with open(organism+"prot.fasta",'w') as prot_handle:
                        SeqIO.write(prot_seq, prot_handle, "fasta")
                    best = []
                    for i in range(3):
                        # Trim the frame to whole codons before translating.
                        frame_end = len(cdna_sequence)-((len(cdna_sequence)-i)%3)
                        cdna_seq = SeqRecord(Seq(cdna_sequence[i:frame_end]).translate(stop_symbol="W"),id="cdna_seq")
                        with open(organism+"cdna.fasta",'w') as cdna_out:
                            SeqIO.write(cdna_seq, cdna_out, "fasta")
                        output = NcbiblastpCommandline(query=organism+"prot.fasta", subject=organism+"cdna.fasta", outfmt=5)()[0]
                        blast_result_records = list(NCBIXML.parse(StringIO(output)))
                        for bl_res in blast_result_records:
                            for z in bl_res.alignments:
                                for h in z.hsps:
                                    best.append((h.query,h.sbjct,i,h.sbjct_start, h.query_start,h.score))
                    if best:
                        # Highest score wins; on ties the last collected HSP
                        # is used (stable sort, take the final element).
                        best_hit = sorted(best,key=lambda x:x[-1])[-1]
                        # Convert the 1-based protein coordinate of the hit
                        # back to a 0-based nucleotide offset in the cDNA.
                        cdna_start = best_hit[2]+(int(best_hit[3])-1)*3
                        cdna_stop = best_hit[2]+((int(best_hit[3])-1)+len(best_hit[1]))*3
                        proper_seq = cdna_sequence[cdna_start:cdna_stop]
                        AAA_list = findPolyA(proper_seq)
                        grab_AAA_information(AAA_list,organism_out, cdna_transcript_id,cdna_start,proper_seq,exon_info[cdna_transcript_id])
                    else:
                        failed.write(",".join([protein_id,cdna_transcript_id,gene_id,pep_sequence,cdna_sequence])+"\n")

                    os.remove(organism+"cdna.fasta")
                    os.remove(organism+"prot.fasta")

                # Named cdna_row so it no longer rebinds the list of records
                # being iterated.
                cdna_row = schema.Cdna(transcript_id = cdna_transcript_id, gene_id = cdna_gene, nucleotide_sequence=str(cd.seq),organism_name =organism, cdna_start = cdna_start, cdna_stop =cdna_stop)
                db.session.add(cdna_row)
    def get_aa_comp(v, alphabet, seq):
        """Build an amino-acid composition summary for a slice of ``seq``.

        Args:
            v: two-element (start, stop) range selecting ``seq[start:stop]``;
                ``None`` or fewer than two elements means nothing is selected.
            alphabet: ``'dna'`` or ``'rna'`` to translate the slice first
                (after dropping a trailing partial codon); any other value
                treats the slice as protein already.
            seq: the sequence to slice.

        Returns:
            ``''`` when nothing is selected; an empty ``html.Table`` when the
            slice bounds raise ``TypeError``; an error message string when
            translation fails; otherwise an ``html.Table`` of residue counts
            (most frequent first), preceded by a line showing the translated
            protein when the input was DNA/RNA and the table is not empty.
        """
        # Guard: no usable selection.
        if v is None or len(v) < 2:
            return ''

        try:
            fragment = seq[v[0]:v[1]]
        except TypeError:
            return html.Table([])

        translated_input = alphabet in ('dna', 'rna')

        if alphabet == 'dna' or alphabet == 'rna':
            # remove partial codons
            leftover = len(fragment) % 3
            if leftover != 0:
                fragment = fragment[:-leftover]

            if alphabet == 'dna':
                nucleic = Seq(fragment, generic_dna)
                failure_message = "Sequence does not represent DNA."
            else:
                nucleic = Seq(fragment, generic_rna)
                failure_message = "Sequence does not represent RNA."

            try:
                protein = str(nucleic.translate())
            except TranslationError:
                return failure_message
        else:
            # default - file represents a protein
            protein = fragment

        # one entry per distinct residue, most common first
        tallies = sorted(
            [
                {'aa': seq3(residue), 'count': protein.count(residue)}
                for residue in list(set(protein))
            ],
            key=lambda entry: entry['count'],
            reverse=True
        )

        rows = [
            html.Tr([html.Td(entry['aa']), html.Td(str(entry['count']))])
            for entry in tallies
        ]

        # include explanation for translation if necessary
        if translated_input and len(rows) > 0:
            return [
                '(Protein translated from {}: {})'.format(
                    alphabet.upper(),
                    protein
                ),
                html.Table(rows)
            ]
        return html.Table(rows)
Example #33
0
def get_all_codons(vcf_rec,fasta_dict,three_letter_abbr=True):
    """
    Takes a vcf and ref_fasta_dictionary, and returns a list of Record objects with
    codons and aminoacid translations using the standard alphabet.
    CRITICALLY, this functions assumes that all sequences are inframe.
    It will work well for CDS and handcurated genes.

    Each record is annotated in place with:
    CODON_POS (1, 2 or 3), CODON_START / CODON_STOP (0-based slice bounds of
    the codon in the reference), CODON_REF, CODON_ALT (one codon per ALT
    allele), AMINO_REF and AMINO_ALT (one translation per ALT allele).

    :param vcf_rec: vcf iterator obj from PyVCF4.1
    :param fasta_dict: Dictionary of Bio.Seq objs ident
    :param three_letter_abbr: if true (default) amino acids are reported as
        three-letter codes via seq3; otherwise the one-letter translations
        are kept, for both the reference and the ALT codons.
    :return: list of modified Record objects that can used write out a file of SNPs

    """
    ret_list = []

    for rec in vcf_rec:
        # POS is 1-based. Position of the SNP inside its codon:
        #   POS % 3 == 1 -> [1]23, POS % 3 == 2 -> 1[2]3, POS % 3 == 0 -> 12[3]
        codon_pos = (rec.POS - 1) % 3 + 1
        # 0-based, end-exclusive slice covering the whole codon.
        start = rec.POS - codon_pos
        stop = start + 3

        rec.CODON_POS = codon_pos
        rec.CODON_START = start
        rec.CODON_STOP = stop
        rec.CODON_REF = fasta_dict[rec.CHROM].seq[start:stop]
        rec.CODON_ALT = []
        rec.AMINO_REF = rec.CODON_REF.translate()
        if three_letter_abbr:
            rec.AMINO_REF = seq3(rec.AMINO_REF)
        rec.AMINO_ALT = []
        for a in rec.ALT:
            # ALT alleles come in a list, so the alt codons follow suit.
            # codon_pos - 1 converts the 1-based codon position to a list
            # index so the reference base can be replaced by the ALT allele.
            alt_bases = [str(nuc) for nuc in rec.CODON_REF]
            alt_bases[codon_pos - 1] = str(a)
            alt_codon = Seq(''.join(alt_bases))

            rec.CODON_ALT.append(alt_codon)
            alt_amino = alt_codon.translate()
            # Honour the flag for ALT alleles too: previously both branches
            # converted to three-letter codes.
            if three_letter_abbr:
                alt_amino = seq3(alt_amino)
            rec.AMINO_ALT.append(alt_amino)
        ret_list.append(rec)
    return ret_list
Example #34
0
def in_frame_description(s1, s2):
    """
    Give a description of an inframe difference of two proteins. Also give
    the position at which the proteins start to differ and the positions at
    which they are the same again.

        >>> in_frame_description('MTAPQQMT*', 'MTAQQMT*')
        ('p.(Pro4del)', 3, 4, 3)
        >>> in_frame_description('MTAPQQMT*', 'MTAQMT*')
        ('p.(Pro4_Gln5del)', 3, 5, 3)
        >>> in_frame_description('MTAPQQT*', 'MTAQQMT*')
        ('p.(Pro4_Gln6delinsGlnGlnMet)', 3, 6, 6)
        >>> in_frame_description('MTAPQQMT*', 'MTAPQQMTMQ*')
        ('p.(*9Metext*2)', 8, 9, 11)
        >>> in_frame_description('MTAPQQMT*', 'MTAPQQMTMQ')
        ('p.(*9Metext*?)', 8, 9, 10)

    @arg s1: The original protein.
    @type s1: unicode
    @arg s2: The mutated protein.
    @type s2: unicode

    @return: A tuple of:
        - unicode ; Protein description of the change.
        - int     ; First position of the change.
        - int     ; Last position of the change in the first protein.
        - int     ; Last position of the change in the second protein.
    @rtype: tuple(unicode, int, int, int)

    @todo: More intelligently handle longest_common_prefix().
    @todo: Refactor this code (too many return statements).
    """
    s2_stop = '*' in s2
    s1 = s1.rstrip('*')
    s2 = s2.rstrip('*')

    if s1 == s2:
        # Nothing happened.
        return ('p.(=)', 0, 0, 0)

    lcp = len(longest_common_prefix(s1, s2))
    lcs = len(longest_common_suffix(s1[lcp:], s2[lcp:]))
    s1_end = len(s1) - lcs
    s2_end = len(s2) - lcs

    # Insertion / Duplication / Extention.
    if not s1_end - lcp:
        if len(s1) == lcp:
            # http://www.hgvs.org/mutnomen/FAQ.html#nostop
            stop = unicode(abs(len(s1) - len(s2))) if s2_stop else '?'

            return ('p.(*%i%sext*%s)' % \
                    (len(s1) + 1, seq3(s2[len(s1)]), stop),
                    len(s1), len(s1) + 1, len(s2) + (1 if s2_stop else 0))

        ins_length = s2_end - lcp

        if lcp - ins_length >= 0 and s1[lcp - ins_length:lcp] == s2[lcp:s2_end]:
            if ins_length == 1:
                return ('p.(%s%idup)' % \
                        (seq3(s1[lcp - ins_length]), lcp - ins_length + 1),
                        lcp, lcp, lcp + 1)
            return ('p.(%s%i_%s%idup)' % \
                    (seq3(s1[lcp - ins_length]),
                     lcp - ins_length + 1, seq3(s1[lcp - 1]), lcp),
                    lcp, lcp, lcp + ins_length)
        #if
        return ('p.(%s%i_%s%iins%s)' % \
                (seq3(s1[lcp - 1]), lcp, seq3(s1[lcp]),
                 lcp + 1, seq3(s2[lcp:s2_end])),
                lcp, lcp, s2_end)
    #if

    # Deletion / Inframe stop.
    if not s2_end - lcp:
        if len(s2) == lcp:
            return ('p.(%s%i*)' % (seq3(s1[len(s2)]), len(s2) + 1),
                    lcp, len(s1) + 1, len(s2) + 1)

        if lcp + 1 == s1_end:
            return ('p.(%s%idel)' % (seq3(s1[lcp]), lcp + 1),
                    lcp, lcp + 1, lcp)
        return ('p.(%s%i_%s%idel)' % \
                (seq3(s1[lcp]), lcp + 1, seq3(s1[s1_end - 1]), s1_end),
                lcp, s1_end, lcp)
    #if

    # Substitution.
    if s1_end == s2_end and s1_end == lcp + 1:
        return ('p.(%s%i%s)' % (seq3(s1[lcp]), lcp + 1, seq3(s2[lcp])),
                lcp, lcp + 1, lcp + 1)

    # InDel.
    if lcp + 1 == s1_end:
        return ('p.(%s%idelins%s)' % \
                (seq3(s1[lcp]), lcp + 1, seq3(s2[lcp:s2_end])),
                lcp, lcp + 1, s2_end)
    return ('p.(%s%i_%s%idelins%s)' % \
            (seq3(s1[lcp]), lcp + 1, seq3(s1[s1_end - 1]), s1_end,
             seq3(s2[lcp:s2_end])),
            lcp, s1_end, s2_end)
Example #35
0
def in_frame_description(s1, s2):
    """
    Give a description of an inframe difference of two proteins. Also give
    the position at which the proteins start to differ and the positions at
    which they are the same again.

    Trailing '*' characters are stripped from both proteins before they are
    compared; whether s2 contained a '*' is remembered and only influences
    the extension case. Except for identical proteins, the first returned
    position equals the length of the prefix shared by both proteins.

        >>> in_frame_description('MTAPQQMT*', 'MTAQQMT*')
        ('p.(Pro4del)', 3, 4, 3)
        >>> in_frame_description('MTAPQQMT*', 'MTAQMT*')
        ('p.(Pro4_Gln5del)', 3, 5, 3)
        >>> in_frame_description('MTAPQQT*', 'MTAQQMT*')
        ('p.(Pro4_Gln6delinsGlnGlnMet)', 3, 6, 6)
        >>> in_frame_description('MTAPQQMT*', 'MTAPQQMTMQ*')
        ('p.(*9Metext*2)', 8, 9, 11)
        >>> in_frame_description('MTAPQQMT*', 'MTAPQQMTMQ')
        ('p.(*9Metext*?)', 8, 9, 10)

    @arg s1: The original protein.
    @type s1: unicode
    @arg s2: The mutated protein.
    @type s2: unicode

    @return: A tuple of:
        - unicode ; Protein description of the change.
        - int     ; First position of the change.
        - int     ; Last position of the change in the first protein.
        - int     ; Last position of the change in the second protein.
    @rtype: tuple(unicode, int, int, int)

    @todo: More intelligently handle longest_common_prefix().
    @todo: Refactor this code (too many return statements).
    """
    # Record the presence of a stop marker in the mutated protein before the
    # trailing '*' characters are removed for the comparison.
    s2_stop = '*' in s2
    s1 = s1.rstrip('*')
    s2 = s2.rstrip('*')

    if s1 == s2:
        # Nothing happened.
        return ('p.(=)', 0, 0, 0)

    # lcp: number of leading residues shared by both proteins.
    # lcs: number of trailing residues shared by what is left after that
    # prefix, so the prefix and suffix can never overlap.
    lcp = len(longest_common_prefix(s1, s2))
    lcs = len(longest_common_suffix(s1[lcp:], s2[lcp:]))
    # End offsets (exclusive) of the differing region in each protein.
    s1_end = len(s1) - lcs
    s2_end = len(s2) - lcs

    # Insertion / Duplication / Extension.
    # The differing region of s1 is empty: residues were only added.
    if not s1_end - lcp:
        if len(s1) == lcp:
            # All of s1 is a prefix of s2: the protein continues past the
            # original stop position.
            # http://www.hgvs.org/mutnomen/FAQ.html#nostop
            # Length difference when s2 has a stop marker, otherwise unknown.
            stop = unicode(abs(len(s1) - len(s2))) if s2_stop else '?'

            return ('p.(*%i%sext*%s)' % \
                    (len(s1) + 1, seq3(s2[len(s1)]), stop),
                    len(s1), len(s1) + 1, len(s2) + (1 if s2_stop else 0))

        ins_length = s2_end - lcp

        # Duplication: the inserted residues repeat the residues that
        # directly precede the insertion point.
        if lcp - ins_length >= 0 and s1[lcp -
                                        ins_length:lcp] == s2[lcp:s2_end]:
            if ins_length == 1:
                return ('p.(%s%idup)' % \
                        (seq3(s1[lcp - ins_length]), lcp - ins_length + 1),
                        lcp, lcp, lcp + 1)
            return ('p.(%s%i_%s%idup)' % \
                    (seq3(s1[lcp - ins_length]),
                     lcp - ins_length + 1, seq3(s1[lcp - 1]), lcp),
                    lcp, lcp, lcp + ins_length)
        #if
        # Plain insertion, described between the two flanking residues of s1.
        return ('p.(%s%i_%s%iins%s)' % \
                (seq3(s1[lcp - 1]), lcp, seq3(s1[lcp]),
                 lcp + 1, seq3(s2[lcp:s2_end])),
                lcp, lcp, s2_end)
    #if

    # Deletion / Inframe stop.
    # The differing region of s2 is empty: residues were only removed.
    if not s2_end - lcp:
        if len(s2) == lcp:
            # All of s2 is a prefix of s1: the first missing residue is
            # reported as changed into a stop.
            return ('p.(%s%i*)' % (seq3(s1[len(s2)]), len(s2) + 1), lcp,
                    len(s1) + 1, len(s2) + 1)

        # Single-residue deletion versus a deleted range.
        if lcp + 1 == s1_end:
            return ('p.(%s%idel)' % (seq3(s1[lcp]), lcp + 1), lcp, lcp + 1,
                    lcp)
        return ('p.(%s%i_%s%idel)' % \
                (seq3(s1[lcp]), lcp + 1, seq3(s1[s1_end - 1]), s1_end),
                lcp, s1_end, lcp)
    #if

    # Substitution.
    # Exactly one residue differs, at the same offset in both proteins.
    if s1_end == s2_end and s1_end == lcp + 1:
        return ('p.(%s%i%s)' % (seq3(s1[lcp]), lcp + 1, seq3(s2[lcp])), lcp,
                lcp + 1, lcp + 1)

    # InDel.
    # One residue of s1 replaced by several residues ...
    if lcp + 1 == s1_end:
        return ('p.(%s%idelins%s)' % \
                (seq3(s1[lcp]), lcp + 1, seq3(s2[lcp:s2_end])),
                lcp, lcp + 1, s2_end)
    # ... or a range of s1 replaced by the differing region of s2.
    return ('p.(%s%i_%s%idelins%s)' % \
            (seq3(s1[lcp]), lcp + 1, seq3(s1[s1_end - 1]), s1_end,
             seq3(s2[lcp:s2_end])),
            lcp, s1_end, s2_end)
def get_3letter_aa(one_letter_aa):
    """Convert one-letter amino acid codes to three-letter codes.

    Thin wrapper around ``seq3`` with two overrides: the stop symbol ``*``
    is rendered as ``***`` and codes ``seq3`` does not know are rendered as
    ``---``.
    """
    stop_override = {"*": "***"}
    unknown_residue = '---'
    return seq3(one_letter_aa,
                custom_map=stop_override,
                undef_code=unknown_residue)
    # Persist the `family` dictionary as JSON. (This statement belongs to a
    # block that opens before this excerpt.)
    file.write(
        json.dumps(family))  # Writes dictionary to file for safe keeping.
file.close()

# Write one line per family member with its residues in three-letter code.
# The header names three residue/position pairs; presumably every entry of
# `family` holds three residues -- verify against the code that fills it.
with open(
        '%s_family_residues.txt' % (args.family), 'w'
) as g:  # Catalytic residue information, stored for later use.
    g.write('ID Res1 Pos1 Res2 Pos2 Res3 Pos3\n')
    for i in family:
        # Members that are already listed in `template` are skipped.
        if i in template:
            pass
        else:
            g.write(i)
            for j in family[i]:
                # Each residue string starts with a one-letter amino acid
                # code; the rest of the string is written as the position.
                residue = j
                three = seq3(
                    residue[0:1])  # Converts one- to three-letter code.
                position = str(residue[1:])
                three_res = str("%s %s" % (three, position))
                # Upper-cased on output, e.g. "Ser" is written as "SER".
                g.write(" %s" % (three_res.upper()))
            g.write('\n')
# Redundant: the with-statement above has already closed g.
g.close()
os.chdir('../templates/')

#########################PART ii: Catalytic constraints generator#############################

# Fresh, empty accumulators for part ii. Note that `template` is rebound
# here, replacing the collection consulted in the loop above.
template = []
residue1 = []
residue2 = []
pairing = []
CA1CA2 = []
CB1CB2 = []