def create_logo(self, seqs=None):
        """Create a sequence logo for the input sequences.

        Args:
            seqs: iterable of (header, sequence) pairs. ``None`` is treated
                like an empty collection.

        Returns:
            Whatever the ``wbl`` formatter selected by ``self.output_format``
            returns ('png', 'png_print' or 'jpeg'; any other value falls
            back to the EPS formatter).

        Raises:
            ValueError: if no sequences are supplied.
        """
        if not seqs:
            # Previously this surfaced as an opaque tuple-unpacking
            # ValueError; keep the exception type but say what is wrong.
            raise ValueError("No sequences provided to create a logo from.")

        # Separate the headers from the sequences themselves.
        _headers, instances = [list(column) for column in zip(*seqs)]

        # Compare by value: identity (`is`) only works for interned strings
        # and silently falls through to DNA for strings built at runtime.
        sequence_type = self.options.sequence_type
        if sequence_type == 'rna':
            alphabet = Alphabet('ACGU')
        elif sequence_type == 'protein':
            alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
        else:
            alphabet = Alphabet('AGCT')

        motif_corebio = SeqList(alist=instances, alphabet=alphabet)
        data = wbl.LogoData().from_seqs(motif_corebio)

        logo_format = wbl.LogoFormat(data, self.options)

        if self.output_format == 'png':
            return wbl.png_formatter(data, logo_format)
        elif self.output_format == 'png_print':
            return wbl.png_print_formatter(data, logo_format)
        elif self.output_format == 'jpeg':
            return wbl.jpeg_formatter(data, logo_format)
        else:
            return wbl.eps_formatter(data, logo_format)
Esempio n. 2
0
 def __init__(self, alphabet, array=None, typeof=None, name=None,
              description=None, scale=None):
     """Set up a two-dimensional array indexed by ``alphabet`` on both axes.

     The base AlphabeticArray initialiser receives the alphabet twice (once
     per dimension) together with ``array`` and ``typeof``; the remaining
     arguments are stored unchanged as attributes.
     """
     axes = (alphabet, alphabet)
     AlphabeticArray.__init__(self, axes, array, typeof)
     self.alphabet = Alphabet(alphabet)
     self.name, self.description, self.scale = name, description, scale
Esempio n. 3
0
    def complement(self):
        """Complement nucleic acid sequence.

        Works in place: the letter axis is temporarily relabelled with the
        complement of each letter, the data is reindexed back into the
        original letter order, and the resulting array replaces
        ``self.array``.
        """
        from corebio.seq import Seq, Alphabet

        original = self.alphabet
        complemented_letters = Seq(original, original).complement()

        # Relabel the alphabetic axis so each column answers to the
        # complement of its letter.
        self.alphabets = (None, Alphabet(complemented_letters))
        reordered = self.reindex(original)

        # Restore the original labelling before adopting the reordered data.
        self.alphabets = (None, original)
        self.array = reordered.array
Esempio n. 4
0
    def __init__(self, alphabets, values=None, dtype=None):
        """Wrap an array whose dimensions may be indexed by alphabet letters.

        Args:
        - alphabets -- one entry per array dimension. A string or Alphabet
                    makes the dimension alphabetic, with the alphabet's
                    length as the dimension length. An integer declares a
                    non-alphabetic dimension of that length. None declares a
                    non-alphabetic dimension whose length is taken from the
                    values argument.
        - values -- the data to index, converted with ``na.asarray``. If
                 None, a zero-filled array of the declared shape is created.
                 If this argument is not a numpy array then the alphabet
                 list must be explicit (cannot contain None.)
        - dtype -- An optional numpy type code.

        Raises:
            ValueError: if ``values`` does not have one dimension per entry
                of ``alphabets``, or a dimension with a declared length has
                a different length in ``values``.
        """

        # Stand-in for the alphabet of a dimension declared as None, so that
        # indexing such a dimension with a string gives a meaningful error.
        class NullAlphabet(object):
            def ord(self, key):
                raise IndexError('This dimension does not have an alphabet')

            def ords(self, key):
                raise IndexError('This dimension does not have an alphabet')

        dims = []
        indexers = []
        for spec in alphabets:
            if isinstance(spec, str):
                spec = Alphabet(spec)

            if spec is None:
                length, indexer = None, NullAlphabet()
            elif isinstance(spec, Alphabet):
                length, indexer = len(spec), spec
            else:
                length, indexer = int(spec), None
            dims.append(length)
            indexers.append(indexer)

        shape = tuple(dims)
        if values is None:
            values = na.zeros(shape=shape, dtype=dtype)
        else:
            values = na.asarray(values, dtype=dtype)
            actual = values.shape
            if len(actual) != len(shape):
                raise ValueError("The values array is the wrong shape.")
            # Dimensions declared as None accept any length.
            if any(want is not None and want != got
                   for want, got in zip(shape, actual)):
                raise ValueError("The values array is the wrong shape.")
        self.array = values
        self.alphabets = tuple(indexers)
Esempio n. 5
0
def read_seq_data(fin,
                  input_parser=seq_io.read,
                  alphabet=None,
                  ignore_lower_case=False,
                  max_file_size=0):
    """Read sequence data from the input stream and return a seqs object.

    The environment variable WEBLOGO_MAX_FILE_SIZE overrides the
    max_file_size argument. Used to limit the load on the WebLogo webserver.

    Raises:
        IOError: if a size limit is in force and the stream holds more data
            than the limit.
        ValueError: if the parser returns None or no sequences.
    """
    size_limit = int(os.environ.get("WEBLOGO_MAX_FILE_SIZE", max_file_size))

    # With a size limit, or when reading stdin (which is non-seekable), the
    # data is buffered and fin is replaced by a StringIO over it.
    if size_limit > 0:
        payload = fin.read(size_limit)
        overflow = fin.read(2)
        if overflow != "":
            raise IOError("File exceeds maximum allowed size: %d bytes" %
                          size_limit)
        fin = StringIO(payload)
    elif fin == sys.stdin:
        fin = StringIO(fin.read())

    fin.seek(0)
    seqs = input_parser(fin)

    if seqs is None or len(seqs) == 0:
        raise ValueError("Please provide a multiple sequence alignment")

    if ignore_lower_case:
        # Case is significant: replace each sequence by its masked form so
        # that lower case letters are not counted.
        for index, sequence in enumerate(seqs):
            seqs[index] = sequence.mask()

    # Attach an alphabet: the requested one, or one chosen from the data.
    seqs.alphabet = Alphabet(alphabet) if alphabet else Alphabet.which(seqs)
    return seqs
Esempio n. 6
0
    def write_weblogo(self, filepath):
        """Render this matrix as a print-quality PNG sequence logo.

        Builds a count array from ``self.values`` (one row per entry), labels
        its columns with the letters of ``self.alphabet``, and writes the
        output of ``png_print_formatter`` to ``filepath``.

        Args:
            filepath: destination path; an existing file is overwritten.
        """
        counts = np.array(
            tuple(tuple(distribution) for distribution in self.values))

        alph = Alphabet(''.join(self.alphabet))

        logo_data = LogoData.from_counts(alph, counts)
        logo_options = LogoOptions(color_scheme=classic)
        logo_options.title = "PWM"
        logo_format = LogoFormat(logo_data, logo_options)

        # Render first so a formatter failure does not leave an empty or
        # truncated file behind.
        rendered = png_print_formatter(logo_data, logo_format)

        # PNG is binary data: write it through a binary handle, and let the
        # context manager close the file even if the write fails.
        with open(filepath, 'wb') as weblogo_file:
            weblogo_file.write(rendered)
Esempio n. 7
0
def read_seq_data(fin,
                  input_parser=seq_io.read,
                  alphabet=None,
                  ignore_lower_case=False,
                  max_file_size=0):
    """Parse sequences from an input stream and return them with an alphabet.

    A WEBLOGO_MAX_FILE_SIZE environment variable, when present, takes
    precedence over the max_file_size argument; it exists to limit the load
    on the WebLogo webserver.

    Raises:
        IOError: if a positive size limit applies and more data is available
            than the limit allows.
        ValueError: if parsing yields None or an empty collection.
    """
    byte_cap = int(os.environ.get("WEBLOGO_MAX_FILE_SIZE", max_file_size))

    if byte_cap > 0:
        # Read up to the cap, then probe for anything beyond it.
        head = fin.read(byte_cap)
        if fin.read(2) != "":
            raise IOError(
                "File exceeds maximum allowed size: %d bytes" % byte_cap)
        fin = StringIO(head)
    elif fin == sys.stdin:
        # stdin is non-seekable, so buffer it in memory first.
        fin = StringIO(fin.read())

    fin.seek(0)
    seqs = input_parser(fin)

    if seqs is None or len(seqs) == 0:
        raise ValueError("Please provide a multiple sequence alignment")

    if ignore_lower_case:
        # Case is significant. Do not count lower case letters.
        for position, item in enumerate(seqs):
            seqs[position] = item.mask()

    # Add alphabet to seqs.
    if not alphabet:
        seqs.alphabet = Alphabet.which(seqs)
    else:
        seqs.alphabet = Alphabet(alphabet)
    return seqs
Esempio n. 8
0
>>> from corebio.secstruc import *
>>> record = dssp.DsspRecord( open('test_corebio/data/1crn.dssp') )
>>> record.secondary()
' EE SSHHHHHHHHHHHTTT  HHHHHHHHS EE SSS   GGG  '
>>> fa_reduce_secstruc_to_ehl(record.secondary())
'LEELLLHHHHHHHHHHHLLLLLHHHHHHHHLLEELLLLLLLLLLLL' 

""" 

__all__ = ['dssp', 'stride','secstruc_alphabet','secstruc_ehl_alphabet', 
    'fa_reduce_secstruc_to_ehl', 'ehl_reduce_secstruc_to_ehl']

from corebio.seq import Alphabet, Seq
from corebio.transform import Transform

# ------------------- SECONDARY STRUCTURE ALPHABETS -------------------
# Every secondary structure symbol accepted as input to the reductions.
secstruc_alphabet = Alphabet("HGIEBbTSC _-L?X")
# Reduced alphabet produced by the reductions: E, H, L and X.
secstruc_ehl_alphabet = Alphabet("EHLX")

# Both reductions pair the input symbols with output symbols of the same
# length (presumably mapped position by position -- confirm in Transform).
# Here H and E are kept, '?' and 'X' become X, and everything else becomes L.
fa_reduce_secstruc_to_ehl = Transform(
    Seq("HGIEBbTSC _-L?X", secstruc_alphabet),
    Seq("HLLELLLLLLLLLXX", secstruc_ehl_alphabet),
)

# Here H, G, I become H; E, B, b become E; '?' and 'X' become X; and the
# remaining symbols become L.
ehl_reduce_secstruc_to_ehl = Transform(
    Seq("HGIEBbTSC _-L?X", secstruc_alphabet),
    Seq("HHHEEELLLLLLLXX", secstruc_ehl_alphabet),
)
                 
                 
                 
Esempio n. 9
0
    def read_transfac(cls, fin, alphabet=None):
        """Parse a TRANSFAC-format PWM from a file.

        Lines before the first 'PO'/'P0' line are skipped; parsing stops at
        the first line after it whose first item is in
        ``cls._TRANSFAC_DELIM_LINES``. Blank lines and lines starting with
        '#' are ignored.

        Args:
            fin: iterable of text lines.
            alphabet: optional alphabet, passed to ``Alphabet.infer_alphabet``
                together with the alphabet found in the file.

        Returns:
            A Motif built from the parsed matrix and the alphabet found in
            the file, reindexed to the inferred alphabet.

        Raises:
            ValueError: if fewer than two usable lines are found, the header
                is missing or malformed, rows differ in length, or a row
                does not start with the expected label.
        """
        items = []

        start = False
        for line in fin:
            # `not line.strip()` also covers a completely empty string,
            # which would otherwise fail on line[0].
            if not line.strip() or line[0] == '#':
                continue

            stuff = line.split()

            if stuff[0] == 'PO' or stuff[0] == 'P0':
                start = True

            # 'XX' delimiters may precede the first motif
            if start:
                if stuff[0] in cls._TRANSFAC_DELIM_LINES:
                    break
                items.append(stuff)

        if len(items) < 2:
            raise ValueError("Vacuous file.")

        # Is the first line a header line?
        header = items.pop(0)
        hcols = len(header)
        cols = len(items[0])
        if not (header[0] == 'PO' or header[0] == 'P0' or hcols == cols - 1
                or hcols == cols - 2):
            raise ValueError("Missing header line!")

        # Do all lines (except the first) contain the same number of items?
        for i in range(1, len(items)):
            if cols != len(items[i]):
                raise ValueError("Inconsistant length, row: {}".format(i))

        # Vertical or horizontal arrangement?
        if header[0] == 'PO' or header[0] == 'P0':
            header.pop(0)

        # The header holds positions only if every item is an integer;
        # otherwise it is taken to hold the alphabet letters.
        position_header = True
        for h in header:
            if not ischar(h):
                raise ValueError("Expected a single character per header "
                                 "item, but got \"{}\" as one item".format(h))
            if not isint(h):
                position_header = False

        alphabet_header = not position_header

        # Check row headers
        if alphabet_header:
            # Letters are in the header; each row starts with its position.
            defacto_alphabet = ''.join(header)
            for i, r in enumerate(items):
                if not isint(r[0]) and r[0][0] != 'P':
                    raise ValueError("Expected position "
                                     "as first item on line {}".format(i))
                r.pop(0)
        else:
            # Positions are in the header; each row starts with its letter.
            # NOTE: message wording kept as is, although a letter is what is
            # expected here.
            a = []
            for i, r in enumerate(items):
                if not ischar(r[0]) and r[0][0] != 'P':
                    raise ValueError("Expected position "
                                     "as first item on line {}".format(i))
                a.append(r.pop(0))
            defacto_alphabet = ''.join(a)

        # check the de facto alphabet, guessing the correct one
        inferred_alphabet = Alphabet.infer_alphabet(alphabet,
                                                    Alphabet(defacto_alphabet))

        # The last item of each row may be extra cruft. Remove
        if len(items[0]) == len(header) + 1:
            for r in items:
                r.pop()

        # items should now be a list of lists of numbers (as strings)
        matrix = na.array([[float(value) for value in row] for row in items],
                          dtype=na.float64)

        if position_header:
            # transpose() returns a new view and leaves the array untouched,
            # so the result must be kept: rows become positions, columns
            # become letters.
            matrix = matrix.transpose()

        return Motif(defacto_alphabet, matrix).reindex(inferred_alphabet)
Esempio n. 10
0
        try:
            return object.__getattr__(self, name)
        except AttributeError:
            return getattr(self.array, name)

    def __setattr__(self, name, value):
        """Set an attribute on this object, falling back to ``self.array``.

        The attribute is first set on the object itself; if that raises
        AttributeError, it is set on the wrapped array instead.
        """
        try:
            outcome = object.__setattr__(self, name, value)
        except AttributeError:
            outcome = setattr(self.array, name, value)
        return outcome


# End class AlphabeticArray

# TODO: move to seq?
# 23-letter alphabet; presumably the 20 standard amino acids followed by the
# codes B, Z and X, in the order used for substitution matrices -- confirm.
submatrix_alphabet = Alphabet("ARNDCQEGHILKMFPSTWYVBZX")


class SubMatrix(AlphabeticArray):
    """A two dimensional array indexed by an Alphabet. Used to hold substitution
    matrices and similar information.

    Various standard substitution matrices are available from the data package
    >>> from corebio import data
    >>> mat = SubMatrix.read(data.data_stream('blosum100'))

    Attr:
    - alphabet     -- An Alphabet
    - array        -- A numpy array
    - name         -- The name of this matrix (if any) as a string.
    - description  -- The description, if any.
Esempio n. 11
0
    def read_transfac(fin, alphabet=None):
        """Parse a sequence matrix from a file.

        Lines before the first 'PO'/'P0' line are skipped; parsing stops at
        the first 'XX' or '//' line after it. Blank lines and lines starting
        with '#' are ignored.

        Args:
            fin: iterable of text lines.
            alphabet: optional alphabet for the result. If not given, the
                first of the unambiguous RNA, DNA and protein alphabets that
                is compatible with the letters in the file is used, falling
                back to the letters found in the file.

        Returns:
            A Motif built from the parsed matrix and the alphabet found in
            the file, reindexed to the chosen alphabet.

        Raises:
            ValueError: if the file is empty, the header is missing or
                ambiguous, rows differ in length, a row label is not of the
                expected kind, or the alphabets are incompatible.
        """
        items = []

        start = True
        for line in fin:
            # `not line.strip()` also covers a completely empty string,
            # which would otherwise fail on line[0].
            if not line.strip() or line[0] == '#':
                continue
            stuff = line.split()
            if start and stuff[0] != 'PO' and stuff[0] != 'P0':
                continue
            if stuff[0] == 'XX' or stuff[0] == '//':
                break
            start = False
            items.append(stuff)
        if len(items) < 2:
            raise ValueError("Vacuous file.")

        # Is the first line a header line?
        header = items.pop(0)
        hcols = len(header)
        cols = len(items[0])
        if not (header[0] == 'PO' or header[0] == 'P0' or hcols == cols - 1
                or hcols == cols - 2):
            raise ValueError("Missing header line!")

        # Do all lines (except the first) contain the same number of items?
        for i in range(1, len(items)):
            if cols != len(items[i]):
                raise ValueError("Inconsistant length, row %d: " % i)

        # Vertical or horizontal arrangement?
        if header[0] == 'PO' or header[0] == 'P0':
            header.pop(0)

        position_header = True
        alphabet_header = True
        for h in header:
            if not isint(h):
                position_header = False
            if not str.isalpha(h):
                alphabet_header = False

        if not position_header and not alphabet_header:
            raise ValueError("Can't parse header: %s" % str(header))

        if position_header and alphabet_header:
            raise ValueError("Can't parse header")

        # Check row headers
        # The exception-call form with "%" formatting is used throughout:
        # the old three-expression raise treated the line number as a
        # traceback and failed with TypeError instead of reporting the line.
        if alphabet_header:
            # Letters are in the header; each row starts with its position.
            defacto_alphabet = ''.join(header)
            for i, r in enumerate(items):
                if not isint(r[0]):
                    raise ValueError(
                        "Expected position as first item on line %d" % i)
                r.pop(0)
        else:
            # Positions are in the header; each row starts with its letter.
            a = []
            for i, r in enumerate(items):
                if not ischar(r[0]):
                    raise ValueError(
                        "Expected position as first item on line %d" % i)
                a.append(r.pop(0))
            defacto_alphabet = ''.join(a)

        # Check defacto_alphabet
        defacto_alphabet = Alphabet(defacto_alphabet)

        if alphabet:
            if not defacto_alphabet.alphabetic(alphabet):
                raise ValueError("Incompatible alphabets: %s , %s (defacto)" %
                                 (alphabet, defacto_alphabet))
        else:
            alphabets = (
                unambiguous_rna_alphabet,
                unambiguous_dna_alphabet,
                unambiguous_protein_alphabet,
            )
            for a in alphabets:
                if defacto_alphabet.alphabetic(a):
                    alphabet = a
                    break
            if not alphabet:
                alphabet = defacto_alphabet

        # The last item of each row may be extra cruft. Remove
        if len(items[0]) == len(header) + 1:
            for r in items:
                r.pop()

        # items should now be a list of lists of numbers (as strings)
        rows = len(items)
        cols = len(items[0])
        matrix = na.zeros((rows, cols), dtype=na.float64)
        for r in range(rows):
            for c in range(cols):
                matrix[r, c] = float(items[r][c])

        if position_header:
            # transpose() returns a new view and leaves the array untouched,
            # so the result must be kept: rows become positions, columns
            # become letters.
            matrix = matrix.transpose()

        return Motif(defacto_alphabet, matrix).reindex(alphabet)
Esempio n. 12
0
    def read_transfac(fin, alphabet=None):
        """Parse a sequence matrix from a file.

        Lines before the first 'PO'/'P0' line are skipped; parsing stops at
        the first 'XX' or '//' line after it. Blank lines and lines starting
        with '#' are ignored.

        Args:
            fin: iterable of text lines.
            alphabet: optional alphabet for the result. If not given, the
                first of the unambiguous RNA, DNA and protein alphabets that
                is compatible with the letters in the file is used, falling
                back to the letters found in the file.

        Returns:
            A Motif built from the parsed matrix and the alphabet found in
            the file, reindexed to the chosen alphabet.

        Raises:
            ValueError: if the file is empty, the header is missing or
                ambiguous, rows differ in length, a row label is not of the
                expected kind, or the alphabets are incompatible.
        """
        items = []

        start = True
        for line in fin:
            # `not line.strip()` also covers a completely empty string,
            # which would otherwise fail on line[0].
            if not line.strip() or line[0] == '#':
                continue
            stuff = line.split()
            if start and stuff[0] != 'PO' and stuff[0] != 'P0':
                continue
            if stuff[0] == 'XX' or stuff[0] == '//':
                break
            start = False
            items.append(stuff)
        if len(items) < 2:
            raise ValueError("Vacuous file.")

        # Is the first line a header line?
        header = items.pop(0)
        hcols = len(header)
        cols = len(items[0])
        if not (header[0] == 'PO' or header[0] == 'P0' or hcols == cols - 1
                or hcols == cols - 2):
            raise ValueError("Missing header line!")

        # Do all lines (except the first) contain the same number of items?
        for i in range(1, len(items)):
            if cols != len(items[i]):
                raise ValueError("Inconsistant length, row %d: " % i)

        # Vertical or horizontal arrangement?
        if header[0] == 'PO' or header[0] == 'P0':
            header.pop(0)

        position_header = True
        alphabet_header = True
        for h in header:
            if not isint(h):
                position_header = False
            if not str.isalpha(h):
                alphabet_header = False

        if not position_header and not alphabet_header:
            raise ValueError("Can't parse header: %s" % str(header))

        if position_header and alphabet_header:
            raise ValueError("Can't parse header")

        # Check row headers
        if alphabet_header:
            # Letters are in the header; each row starts with its position.
            defacto_alphabet = ''.join(header)
            for i, r in enumerate(items):
                if not isint(r[0]) and r[0][0] != 'P':
                    raise ValueError(
                        "Expected position as first item on line %d" % i)
                r.pop(0)
        else:
            # Positions are in the header; each row starts with its letter.
            # NOTE: message wording kept as is, although a letter is what is
            # expected here.
            a = []
            for i, r in enumerate(items):
                if not ischar(r[0]) and r[0][0] != 'P':
                    raise ValueError(
                        "Expected position as first item on line %d" % i)
                a.append(r.pop(0))
            defacto_alphabet = ''.join(a)

        # Check defacto_alphabet
        defacto_alphabet = Alphabet(defacto_alphabet)

        if alphabet:
            if not defacto_alphabet.alphabetic(alphabet):
                raise ValueError("Incompatible alphabets: %s , %s (defacto)" %
                                 (alphabet, defacto_alphabet))
        else:
            alphabets = (
                unambiguous_rna_alphabet,
                unambiguous_dna_alphabet,
                unambiguous_protein_alphabet,
            )
            for a in alphabets:
                if defacto_alphabet.alphabetic(a):
                    alphabet = a
                    break
            if not alphabet:
                alphabet = defacto_alphabet

        # The last item of each row may be extra cruft. Remove
        if len(items[0]) == len(header) + 1:
            for r in items:
                r.pop()

        # items should now be a list of lists of numbers (as strings)
        matrix = na.array([[float(value) for value in row] for row in items],
                          dtype=na.float64)

        if position_header:
            # transpose() returns a new view and leaves the array untouched,
            # so the result must be kept: rows become positions, columns
            # become letters.
            matrix = matrix.transpose()

        return Motif(defacto_alphabet, matrix).reindex(alphabet)
Esempio n. 13
0
This module provides an interface to STRIDE, a program used to recognize
secondary structural elements in proteins from their atomic coordinates.

Refs:
    - http://wolf.bi.umist.ac.uk/unix/stride.html
"""

from corebio.seq import Seq, protein_alphabet, Alphabet
from corebio.db.astral import to_one_letter_code
from subprocess import *
from StringIO import StringIO
from corebio.utils import stdrepr, find_command

# Alphabet of the one-character STRIDE secondary structure codes.
# NOTE(review): the stride_alphabet_names table below also has an entry for
# lowercase "b" (Bridge), which is not a letter of this alphabet -- confirm
# whether "b" should be included here.
stride_alphabet = Alphabet("HGIEBC12345678@&T")

# Dictionary for conversion between names and alphabet
stride_alphabet_names = {
    "H": "AlphaHelix",
    "G": "310Helix",
    "I": "PiHelix",
    "E": "Strand",
    "b": "Bridge",
    "B": "Bridge",
    "C": "Coil",
    "1": "TurnI",
    "2": "TurnI'",
    "3": "TurnII",
    "4": "TurnII'",
    "5": "TurnVIa",
Esempio n. 14
0
    def read_swissRegulon(fin, alphabet=None):
        """Parse a position matrix whose header line lists the letters.

        Lines before the first 'P0'/'PO' line are skipped, as are blank
        lines, lines starting with '#' and lines consisting of '//'. Only
        the first five items of every line are used: the row label and up to
        four values.

        Args:
            fin: iterable of text lines.
            alphabet: optional alphabet for the result. If not given, the
                first of the unambiguous RNA, DNA and protein alphabets that
                is compatible with the letters in the file is used, falling
                back to the letters found in the file.

        Returns:
            A Motif built from the parsed matrix and the alphabet found in
            the header, reindexed to the chosen alphabet.

        Raises:
            ValueError: if the file is empty, the header is missing,
                ambiguous or holds positions instead of letters, rows differ
                in length, a row does not start with a position, or the
                alphabets are incompatible.
        """
        import re

        items = []
        start = True
        for line in fin:
            # `not line.strip()` also covers a completely empty string,
            # which would otherwise fail on line[0].
            if (not line.strip() or line[0] == '#'
                    or re.search('^//$', line)):
                continue
            stuff = line.split()
            if start and stuff[0] != 'P0' and stuff[0] != 'PO':
                continue
            start = False
            items.append(stuff[0:5])
        if len(items) < 2:
            raise ValueError("Vacuous file.")

        # Is the first line a header line?
        header = items.pop(0)
        hcols = len(header)
        cols = len(items[0])
        if not (header[0] == 'P0' or header[0] == 'PO' or hcols == cols - 1
                or hcols == cols - 2):
            raise ValueError("Missing header line!")

        # Do all lines (except the first) contain the same number of items?
        for i in range(1, len(items)):
            if cols != len(items[i]):
                raise ValueError("Inconsistant length, row %d: " % i)

        # Vertical or horizontal arrangement?
        if header[0] == 'PO' or header[0] == 'P0':
            header.pop(0)

        position_header = True
        alphabet_header = True
        for h in header:
            if not isint(h):
                position_header = False
            if not str.isalpha(h):
                alphabet_header = False

        if not position_header and not alphabet_header:
            raise ValueError("Can't parse header: %s" % str(header))

        if position_header and alphabet_header:
            raise ValueError("Can't parse header")

        if not alphabet_header:
            # Only the letters-in-header layout is handled here. Without
            # this check the alphabet was never assigned for a header of
            # positions, and the function died with UnboundLocalError.
            raise ValueError(
                "Expected alphabet letters in the header, got positions: %s"
                % str(header))

        # Check row headers
        defacto_alphabet = ''.join(header)
        for i, r in enumerate(items):
            if not isint(r[0]) and r[0][0] != 'P':
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            r.pop(0)

        # Check defacto_alphabet
        defacto_alphabet = Alphabet(defacto_alphabet)
        if alphabet:
            if not defacto_alphabet.alphabetic(alphabet):
                raise ValueError("Incompatible alphabets: %s , %s (defacto)"
                                 % (alphabet, defacto_alphabet))
        else:
            alphabets = (
                unambiguous_rna_alphabet,
                unambiguous_dna_alphabet,
                unambiguous_protein_alphabet,
            )
            for a in alphabets:
                if defacto_alphabet.alphabetic(a):
                    alphabet = a
                    break
            if not alphabet:
                alphabet = defacto_alphabet

        # items should now be a list of lists of numbers (as strings)
        matrix = na.array([[float(value) for value in row] for row in items],
                          dtype=na.float64)

        return Motif(defacto_alphabet, matrix).reindex(alphabet)
Esempio n. 15
0
"""Command and control of the program DSSP: Dictionary of protein secondary structure. The program DSSP defines secondary structure, geometrical features and solvent exposure of proteins, given atomic coordinates in Protein Data Bank (PDB) format.


See also :
- http://swift.cmbi.ru.nl/gv/dssp/

"""

from corebio.seq import Seq, protein_alphabet, Alphabet
from corebio.utils import stdrepr, find_command
from subprocess import * 
from StringIO import StringIO

# Alphabet of the one-character DSSP secondary structure codes; the letters
# are exactly the keys of dssp_alphabet_names below (including the space).
dssp_alphabet = Alphabet("HBEGITS ")

# Dictionary for conversion between alphabet and secondary structure names
dssp_alphabet_names  = {
    'H' : 'alpha helix',
    'B' : 'residue in isolated beta-bridge',
    'E' : 'extended strand, participates in beta ladder',
    'G' : '3-helix (3/10 helix)',
    'I' : '5 helix (pi helix)',
    'T' : 'hydrogen bonded turn',
    'S' : 'bend',
    ' ' : 'loop or irregular', 
    }

# Banner text of a DSSP result file.
# NOTE(review): presumably compared against the first line of the program's
# output to recognise it -- confirm against the parser.
_dssp_header = "==== Secondary Structure Definition by the program DSSP, updated CMBI version by ElmK / April 1,2000"
Esempio n. 16
0
    GeneticCode(
        23,
        "Thraustochytrium Mitochondrial",
        "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "--------------------------------M--M---------------M------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG",
    ),
)

reduced_protein_alphabets = {
    #
    "LiB2":
    Transform(Seq("CFYWMLIV-GPATSNHQEDRKX*-", std_protein_alphabet),
              Seq("IIIIIIII-SSSSSSSSSSSSX*-", Alphabet("ISX*-")),
              "Li et al (2003), table II, group 2"),
    #
    "LiB3":
    Transform(Seq("CFYWMLIV-GPATS-NHQEDRKX*-", std_protein_alphabet),
              Seq("IIIIIIII-SSSSS-EEEEEEEX*-", Alphabet("ISEX*-")),
              "Li et al (2003), table II, group 3"),
    #
    "LiB4":
    Transform(Seq("CFYW-MLIV-GPATS-NHQEDRKX*-", std_protein_alphabet),
              Seq("YYYY-IIII-SSSSS-EEEEEEEX*-", Alphabet("YISEX*-")),
              "Li et al (2003), table II, group 4"),
    #
    "LiB5":
    Transform(Seq("CFYW-MLIV-G-PATS-NHQEDRKX*-", std_protein_alphabet),
              Seq("YYYY-IIII-G-SSSS-EEEEEEEX*-", Alphabet("YIGSEX*-")),