Ejemplo n.º 1
0
def read_seq_data(lines, 
                input_parser=seq_io.read, 
                alphabet=None, 
                ignore_lower_case=False, 
                max_file_size=0):
    """ Read sequence data from the input stream and return a seqs object. 
    
    The environment variable WEBLOGO_MAX_FILE_SIZE overides the max_file_size argument.
    Used to limit the load on the WebLogo webserver.
    """

    seqs = input_parser(lines)

    if seqs is None or len(seqs) ==0 :
        raise ValueError("Please provide a multiple sequence alignment")
    
    if ignore_lower_case :
        # Case is significant. Do not count lower case letters.
        for i,s in enumerate(seqs) :
            seqs[i] = s.mask()

    # Add alphabet to seqs.
    if alphabet :
        seqs.alphabet = alphabet 
    else :
        seqs.alphabet = Alphabet.which(seqs)
    return seqs
Ejemplo n.º 2
0
def read_seq_data(lines,
                  input_parser=seq_io.read,
                  alphabet=None,
                  ignore_lower_case=False,
                  max_file_size=0):
    """ Read sequence data from the input stream and return a seqs object. 
    
    The environment variable WEBLOGO_MAX_FILE_SIZE overides the max_file_size argument.
    Used to limit the load on the WebLogo webserver.
    """

    seqs = input_parser(lines)

    if seqs is None or len(seqs) == 0:
        raise ValueError("Please provide a multiple sequence alignment")

    if ignore_lower_case:
        # Case is significant. Do not count lower case letters.
        for i, s in enumerate(seqs):
            seqs[i] = s.mask()

    # Add alphabet to seqs.
    if alphabet:
        seqs.alphabet = alphabet
    else:
        seqs.alphabet = Alphabet.which(seqs)
    return seqs
Ejemplo n.º 3
0
    def complement(self):
        """Complement nucleic acid sequence."""
        from corebio.seq import Seq, Alphabet
        alphabet = self.alphabet
        complement_alphabet = Alphabet(Seq(alphabet, alphabet).complement())
        self.alphabets = (None, complement_alphabet)

        m = self.reindex(alphabet)
        self.alphabets = (None, alphabet)
        self.array = m.array
Ejemplo n.º 4
0
    def __init__(self, alphabets, values=None, dtype=None):
        """
        Args:
        - alphabets -- a list of alphabets (as string or Alphabet objects) to
                    be used to convert strings into indices. The lengths of 
                    the alphabets match the shape of the indexed array. 
                    Alternatively, an integer or None in the list indicate a 
                    non-alphabetic dimension. If None the dimension length is 
                    taken from values argument.
        - values -- An array of values to be indexed. If None a new  
                 array is created. If this argument is not a numpy array
                 then the alphabet list must be explicit (cannot contain 
                 None.)
        - dtype -- An optional numpy type code.
        """

        # A dummy object to be used in place of None in the alphabets list
        # so that we get meaningful error messages if we try to index a
        # nonalphabetic dimension with a string.
        class NullAlphabet(object):
            def ord(self, key):
                raise IndexError('This dimension does not have an alphabet')

            def ords(self, key):
                raise IndexError('This dimension does not have an alphabet')

        alpha = []
        shape = []
        for a in alphabets:
            if isinstance(a, str): a = Alphabet(a)

            if a is None:
                shape.append(None)
                alpha.append(NullAlphabet())
            elif isinstance(a, Alphabet):
                shape.append(len(a))
                alpha.append(a)
            else:
                shape.append(int(a))
                alpha.append(None)

        shape = tuple(shape)
        if values is None:
            values = na.zeros(shape=shape, dtype=dtype)
        else:
            values = na.asarray(values, dtype=dtype)
            vshape = values.shape
            if len(shape) != len(vshape):
                raise ValueError("The values array is the wrong shape.")
            for s1, s2 in izip(shape, vshape):
                if s1 is not None and s1 != s2:
                    raise ValueError("The values array is the wrong shape.")
        self.array = values
        self.alphabets = tuple(alpha)
Ejemplo n.º 5
0
 def __init__(self,
              alphabet,
              array=None,
              typeof=None,
              name=None,
              description=None,
              scale=None):
     AlphabeticArray.__init__(self, (alphabet, alphabet), array, typeof)
     self.alphabet = Alphabet(alphabet)
     self.name = name
     self.description = description
     self.scale = scale
Ejemplo n.º 6
0
    def read_transfac( fin, alphabet = None) :
        """ Parse a sequence matrix from a file. 
        Returns a tuple of (alphabet, matrix)
        """
   
        items = []

        start=True
        for line in fin :
            if line.isspace() or line[0] =='#' : continue
            stuff = line.split()
            if start and stuff[0] != 'PO' and stuff[0] != 'P0': continue
            if stuff[0]=='XX' or stuff[0]=='//': break
            start = False
            items.append(stuff)
        if len(items) < 2  :
            raise ValueError, "Vacuous file."

        # Is the first line a header line?
        header = items.pop(0)
        hcols = len(header)
        rows = len(items)
        cols = len(items[0])
        if not( header[0] == 'PO' or header[0] =='P0' or hcols == cols-1 or hcols == cols-2) :
            raise ValueError, "Missing header line!"

        # Do all lines (except the first) contain the same number of items?
        cols = len(items[0])
        for i in range(1, len(items)) :
            if cols != len(items[i]) :
                raise ValueError, "Inconsistant length, row %d: " % i

        # Vertical or horizontal arrangement?
        if header[0] == 'PO' or header[0] == 'P0': header.pop(0)

        position_header = True    
        alphabet_header = True    
        for h in header :
            if not isint(h) : position_header = False
            if not str.isalpha(h) : alphabet_header = False

        if not position_header and not alphabet_header :
            raise ValueError, "Can't parse header: %s" % str(header)

        if position_header and alphabet_header :
            raise ValueError, "Can't parse header"        


        # Check row headers
        if alphabet_header :
            for i,r in enumerate(items) :
                if not isint(r[0]) : 
                    raise ValueError, "Expected position as first item on line %d", i
                r.pop(0)
                defacto_alphabet = ''.join(header)
        else :
            a = []
            for i,r in enumerate(items) :
                if not ischar(r[0]) : 
                    raise ValueError, "Expected position as first item on line %d", i
                a.append(r.pop(0))
            defacto_alphabet = ''.join(a)                

        # Check defacto_alphabet
        defacto_alphabet = Alphabet(defacto_alphabet)

        if alphabet :
            if not defacto_alphabet.alphabetic(alphabet) :
                raise ValueError, "Incompatible alphabets: %s , %s (defacto)"% (
                    alphabet, defacto_alphabet)
        else :            
            alphabets = (unambiguous_rna_alphabet,
                        unambiguous_dna_alphabet,                      
                        unambiguous_protein_alphabet,
                      )
            for a in alphabets :
                if defacto_alphabet.alphabetic(a) :
                    alphabet = a
                    break
            if not alphabet :
                alphabet = defacto_alphabet
   

        # The last item of each row may be extra cruft. Remove
        if len(items[0]) == len(header) +1 :
            for r in items :
                r.pop()

        # items should now be a list of lists of numbers (as strings) 
        rows = len(items)
        cols = len(items[0])
        matrix = na.zeros( (rows,cols) , dtype=na.float64) 
        for r in range( rows) :
            for c in range(cols):
                matrix[r,c] = float( items[r][c]) 

        if position_header :
            matrix.transpose() 

        return Motif(defacto_alphabet, matrix).reindex(alphabet)
Ejemplo n.º 7
0
    def read_transfac(fin, alphabet=None):
        """ Parse a sequence matrix from a file. 
        Returns a tuple of (alphabet, matrix)
        """

        items = []

        start = True
        for line in fin:
            if line.isspace() or line[0] == '#': continue
            stuff = line.split()
            if start and stuff[0] != 'PO' and stuff[0] != 'P0': continue
            if stuff[0] == 'XX' or stuff[0] == '//': break
            start = False
            items.append(stuff)
        if len(items) < 2:
            raise ValueError, "Vacuous file."

        # Is the first line a header line?
        header = items.pop(0)
        hcols = len(header)
        rows = len(items)
        cols = len(items[0])
        if not (header[0] == 'PO' or header[0] == 'P0' or hcols == cols - 1
                or hcols == cols - 2):
            raise ValueError, "Missing header line!"

        # Do all lines (except the first) contain the same number of items?
        cols = len(items[0])
        for i in range(1, len(items)):
            if cols != len(items[i]):
                raise ValueError, "Inconsistant length, row %d: " % i

        # Vertical or horizontal arrangement?
        if header[0] == 'PO' or header[0] == 'P0': header.pop(0)

        position_header = True
        alphabet_header = True
        for h in header:
            if not isint(h): position_header = False
            if not str.isalpha(h): alphabet_header = False

        if not position_header and not alphabet_header:
            raise ValueError, "Can't parse header: %s" % str(header)

        if position_header and alphabet_header:
            raise ValueError, "Can't parse header"

        # Check row headers
        if alphabet_header:
            for i, r in enumerate(items):
                if not isint(r[0]):
                    raise ValueError, "Expected position as first item on line %d", i
                r.pop(0)
                defacto_alphabet = ''.join(header)
        else:
            a = []
            for i, r in enumerate(items):
                if not ischar(r[0]):
                    raise ValueError, "Expected position as first item on line %d", i
                a.append(r.pop(0))
            defacto_alphabet = ''.join(a)

        # Check defacto_alphabet
        defacto_alphabet = Alphabet(defacto_alphabet)

        if alphabet:
            if not defacto_alphabet.alphabetic(alphabet):
                raise ValueError, "Incompatible alphabets: %s , %s (defacto)" % (
                    alphabet, defacto_alphabet)
        else:
            alphabets = (
                unambiguous_rna_alphabet,
                unambiguous_dna_alphabet,
                unambiguous_protein_alphabet,
            )
            for a in alphabets:
                if defacto_alphabet.alphabetic(a):
                    alphabet = a
                    break
            if not alphabet:
                alphabet = defacto_alphabet

        # The last item of each row may be extra cruft. Remove
        if len(items[0]) == len(header) + 1:
            for r in items:
                r.pop()

        # items should now be a list of lists of numbers (as strings)
        rows = len(items)
        cols = len(items[0])
        matrix = na.zeros((rows, cols), dtype=na.float64)
        for r in range(rows):
            for c in range(cols):
                matrix[r, c] = float(items[r][c])

        if position_header:
            matrix.transpose()

        return Motif(defacto_alphabet, matrix).reindex(alphabet)
Ejemplo n.º 8
0
        try:
            return object.__getattr__(self, name)
        except AttributeError:
            return getattr(self.array, name)

    def __setattr__(self, name, value):
        try:
            return object.__setattr__(self, name, value)
        except AttributeError:
            return setattr(self.array, name, value)


# End class AlphabeticArray

#TODO: move to seq?
submatrix_alphabet = Alphabet("ARNDCQEGHILKMFPSTWYVBZX")


class SubMatrix(AlphabeticArray):
    """A two dimensional array indexed by an Alphabet. Used to hold substitution
    matrices and similar information. 
    
    Various standard substitution matrices are available from the data package
    >>> from corebio import data
    >>> mat = SubMatrix.read(data.data_stream('blosum100'))   
    
    Attr:
    - alphabet     -- An Alphabet
    - array        -- A numpy array
    - name         -- The name of this matrix (if any) as a string.
    - description  -- The description, if any.