def read_seq_data(lines,
                  input_parser=seq_io.read,
                  alphabet=None,
                  ignore_lower_case=False,
                  max_file_size=0):
    """Parse sequence data and return the parsed sequence collection.

    Args:
        - lines -- the input handed unchanged to ``input_parser``.
        - input_parser -- callable that turns ``lines`` into a sequence
            collection (defaults to ``seq_io.read``).
        - alphabet -- alphabet to attach to the result. When falsy, the
            alphabet is chosen with ``Alphabet.which``.
        - ignore_lower_case -- when true, every sequence is replaced by the
            result of its ``mask()`` method (intended to keep lower case
            letters from being counted).
        - max_file_size -- accepted for interface compatibility; this
            function body never reads it, so no size limit is enforced here.

    Returns:
        The collection produced by ``input_parser`` with its ``alphabet``
        attribute set.

    Raises:
        ValueError: if the parser returns None or an empty collection.
    """
    parsed = input_parser(lines)

    if parsed is None or not len(parsed):
        raise ValueError("Please provide a multiple sequence alignment")

    if ignore_lower_case:
        # Case is significant: swap each sequence for its masked version.
        for index, sequence in enumerate(parsed):
            parsed[index] = sequence.mask()

    # Attach the caller's alphabet, or fall back to automatic detection.
    parsed.alphabet = alphabet if alphabet else Alphabet.which(parsed)
    return parsed
def read_seq_data(lines, input_parser=seq_io.read, alphabet=None,
                  ignore_lower_case=False, max_file_size=0):
    """Read sequences from ``lines`` and return them with an alphabet set.

    ``lines`` is passed to ``input_parser``. The parsed result must be a
    non-empty collection, otherwise ValueError is raised. If
    ``ignore_lower_case`` is true, each entry is replaced in place by
    ``entry.mask()``. Finally the ``alphabet`` attribute of the result is set
    to the ``alphabet`` argument when it is truthy, and to
    ``Alphabet.which(result)`` otherwise.

    Note: ``max_file_size`` is part of the signature but is not consulted
    anywhere in this function.
    """
    sequences = input_parser(lines)

    has_data = sequences is not None and len(sequences) != 0
    if not has_data:
        raise ValueError("Please provide a multiple sequence alignment")

    if ignore_lower_case:
        # Lower case letters should not be counted, so mask every entry.
        for position, entry in enumerate(sequences):
            masked = entry.mask()
            sequences[position] = masked

    # An explicit alphabet wins; otherwise let Alphabet pick one.
    chosen_alphabet = alphabet or Alphabet.which(sequences)
    sequences.alphabet = chosen_alphabet

    return sequences
def complement(self):
    """Complement nucleic acid sequence.

    The array is relabelled with the complemented alphabet and then
    reindexed back into the original alphabet order, so that afterwards
    ``self.array`` holds the complemented data while ``self.alphabets`` is
    ``(None, self.alphabet)``. The object is modified in place; nothing is
    returned.

    If reindexing fails, ``self.alphabets`` is restored to the value it had
    on entry so the object is not left labelled with the complement
    alphabet, and the exception is re-raised.
    """
    from corebio.seq import Seq, Alphabet

    alphabet = self.alphabet
    complement_alphabet = Alphabet(Seq(alphabet, alphabet).complement())

    # Temporarily relabel the second dimension with the complemented
    # letters; reindex() then permutes the data back into `alphabet` order.
    previous_alphabets = self.alphabets
    self.alphabets = (None, complement_alphabet)
    try:
        m = self.reindex(alphabet)
    except Exception:
        # Do not leave the temporary labels behind on failure.
        self.alphabets = previous_alphabets
        raise

    self.alphabets = (None, alphabet)
    self.array = m.array
def __init__(self, alphabets, values=None, dtype=None):
    """
    Args:
    - alphabets -- a list of alphabets (as string or Alphabet objects) to
        be used to convert strings into indices. The lengths of the
        alphabets match the shape of the indexed array. Alternatively, an
        integer or None in the list indicate a non-alphabetic dimension.
        If None the dimension length is taken from values argument.
    - values -- An array of values to be indexed. If None a new array of
        zeros is created, in which case every dimension must have an
        explicit length (alphabets cannot contain None). Anything else
        is converted with ``asarray``.
    - dtype -- An optional numpy type code.

    Raises:
        ValueError: if ``values`` does not have the number of dimensions
            or the dimension lengths implied by ``alphabets``, or if
            ``values`` is None while ``alphabets`` contains None.
    """

    # A dummy object to be used in place of None in the alphabets list
    # so that we get meaningful error messages if we try to index a
    # nonalphabetic dimension with a string.
    class NullAlphabet(object):
        def ord(self, key):
            raise IndexError('This dimension does not have an alphabet')

        def ords(self, key):
            raise IndexError('This dimension does not have an alphabet')

    alpha = []
    shape = []
    for a in alphabets:
        if isinstance(a, str):
            a = Alphabet(a)

        if a is None:
            # Length unknown for now; it is taken from `values` below.
            shape.append(None)
            alpha.append(NullAlphabet())
        elif isinstance(a, Alphabet):
            shape.append(len(a))
            alpha.append(a)
        else:
            # Integer length: a non-alphabetic dimension stored as None.
            shape.append(int(a))
            alpha.append(None)

    shape = tuple(shape)
    if values is None:
        if None in shape:
            # Without values there is nothing to take the length from;
            # fail clearly instead of passing None on to numpy.
            raise ValueError(
                "Cannot create a new array: every dimension needs an "
                "explicit length when no values are given.")
        values = na.zeros(shape=shape, dtype=dtype)
    else:
        values = na.asarray(values, dtype=dtype)
        vshape = values.shape
        if len(shape) != len(vshape):
            raise ValueError("The values array is the wrong shape.")
        for s1, s2 in zip(shape, vshape):
            # None matches any length.
            if s1 is not None and s1 != s2:
                raise ValueError("The values array is the wrong shape.")

    self.array = values
    self.alphabets = tuple(alpha)
def __init__(self,
             alphabet,
             array=None,
             typeof=None,
             name=None,
             description=None,
             scale=None):
    """Create a square matrix indexed on both axes by ``alphabet``.

    Args:
    - alphabet -- the alphabet used for both dimensions; it is also
        stored, wrapped in ``Alphabet``, as ``self.alphabet``.
    - array -- forwarded to ``AlphabeticArray.__init__`` as its second
        positional argument.
    - typeof -- forwarded to ``AlphabeticArray.__init__`` as its third
        positional argument.
    - name, description, scale -- stored unchanged as attributes.
    """
    # The same alphabet labels both dimensions.
    both_axes = (alphabet, alphabet)
    AlphabeticArray.__init__(self, both_axes, array, typeof)

    self.alphabet = Alphabet(alphabet)

    # Plain metadata, kept exactly as given.
    self.scale = scale
    self.description = description
    self.name = name
def read_transfac(fin, alphabet=None):
    """Parse a sequence matrix from a file.

    Lines are skipped until one whose first token is 'PO' or 'P0'; that
    line is the header. Reading stops at a line starting with 'XX' or '//'.
    Blank lines and lines starting with '#' are ignored.

    The header holds either letters (each following row then starts with an
    integer position) or integer positions (each following row then starts
    with a letter). In the second case the numeric matrix is transposed
    after parsing.

    Args:
        fin -- an iterable of text lines.
        alphabet -- optional alphabet the result is reindexed to. If not
            given, the first of the unambiguous RNA, DNA and protein
            alphabets compatible with the letters found in the file is
            used, falling back to the letters found in the file.

    Returns:
        ``Motif(defacto_alphabet, matrix).reindex(alphabet)``, where
        ``defacto_alphabet`` is built from the letters found in the file.

    Raises:
        ValueError: if fewer than two usable lines are found, the header
            cannot be interpreted, rows differ in length, a row header is
            malformed, or the file's letters are incompatible with
            ``alphabet``.
    """
    items = []
    start = True
    for line in fin:
        # `not line` guards the line[0] access against empty strings.
        if not line or line.isspace() or line[0] == '#':
            continue
        stuff = line.split()
        if start and stuff[0] != 'PO' and stuff[0] != 'P0':
            continue
        if stuff[0] == 'XX' or stuff[0] == '//':
            break
        start = False
        items.append(stuff)
    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)
    hcols = len(header)
    cols = len(items[0])
    if not (header[0] == 'PO' or header[0] == 'P0' or
            hcols == cols - 1 or hcols == cols - 2):
        raise ValueError("Missing header line!")

    # Do all lines (except the first) contain the same number of items?
    for i in range(1, len(items)):
        if cols != len(items[i]):
            raise ValueError("Inconsistant length, row %d: " % i)

    # Vertical or horizontal arrangement?
    if header[0] == 'PO' or header[0] == 'P0':
        header.pop(0)

    position_header = all(isint(h) for h in header)
    alphabet_header = all(h.isalpha() for h in header)

    if not position_header and not alphabet_header:
        raise ValueError("Can't parse header: %s" % str(header))

    if position_header and alphabet_header:
        raise ValueError("Can't parse header")

    # Check row headers
    if alphabet_header:
        for i, r in enumerate(items):
            if not isint(r[0]):
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            r.pop(0)
        defacto_alphabet = ''.join(header)
    else:
        letters = []
        for i, r in enumerate(items):
            if not ischar(r[0]):
                raise ValueError(
                    "Expected alphabet character as first item on line %d"
                    % i)
            letters.append(r.pop(0))
        defacto_alphabet = ''.join(letters)

    # Check defacto_alphabet
    defacto_alphabet = Alphabet(defacto_alphabet)

    if alphabet:
        if not defacto_alphabet.alphabetic(alphabet):
            raise ValueError("Incompatible alphabets: %s , %s (defacto)" % (
                alphabet, defacto_alphabet))
    else:
        alphabets = (unambiguous_rna_alphabet,
                     unambiguous_dna_alphabet,
                     unambiguous_protein_alphabet,
                     )
        for a in alphabets:
            if defacto_alphabet.alphabetic(a):
                alphabet = a
                break
        if not alphabet:
            alphabet = defacto_alphabet

    # The last item of each row may be extra cruft. Remove
    if len(items[0]) == len(header) + 1:
        for r in items:
            r.pop()

    # items should now be a list of lists of numbers (as strings)
    matrix = na.array([[float(value) for value in row] for row in items],
                      dtype=na.float64)

    if position_header:
        # transpose() returns a new view rather than working in place, so
        # the result has to be kept.
        matrix = matrix.transpose()

    return Motif(defacto_alphabet, matrix).reindex(alphabet)
def read_transfac(fin, alphabet=None):
    """Parse a sequence matrix from a file.

    Lines are skipped until one whose first token is 'PO' or 'P0'; that
    line is the header. Reading stops at a line starting with 'XX' or '//'.
    Blank lines and lines starting with '#' are ignored.

    The header holds either letters (each following row then starts with an
    integer position) or integer positions (each following row then starts
    with a letter). In the second case the numeric matrix is transposed
    after parsing.

    Args:
        fin -- an iterable of text lines.
        alphabet -- optional alphabet the result is reindexed to. If not
            given, the first of the unambiguous RNA, DNA and protein
            alphabets compatible with the letters found in the file is
            used, falling back to the letters found in the file.

    Returns:
        ``Motif(defacto_alphabet, matrix).reindex(alphabet)``, where
        ``defacto_alphabet`` is built from the letters found in the file.

    Raises:
        ValueError: if fewer than two usable lines are found, the header
            cannot be interpreted, rows differ in length, a row header is
            malformed, or the file's letters are incompatible with
            ``alphabet``.
    """
    items = []
    start = True
    for line in fin:
        # `not line` guards the line[0] access against empty strings.
        if not line or line.isspace() or line[0] == '#':
            continue
        stuff = line.split()
        if start and stuff[0] != 'PO' and stuff[0] != 'P0':
            continue
        if stuff[0] == 'XX' or stuff[0] == '//':
            break
        start = False
        items.append(stuff)
    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)
    hcols = len(header)
    cols = len(items[0])
    if not (header[0] == 'PO' or header[0] == 'P0' or
            hcols == cols - 1 or hcols == cols - 2):
        raise ValueError("Missing header line!")

    # Do all lines (except the first) contain the same number of items?
    for i in range(1, len(items)):
        if cols != len(items[i]):
            raise ValueError("Inconsistant length, row %d: " % i)

    # Vertical or horizontal arrangement?
    if header[0] == 'PO' or header[0] == 'P0':
        header.pop(0)

    position_header = all(isint(h) for h in header)
    alphabet_header = all(h.isalpha() for h in header)

    if not position_header and not alphabet_header:
        raise ValueError("Can't parse header: %s" % str(header))

    if position_header and alphabet_header:
        raise ValueError("Can't parse header")

    # Check row headers
    if alphabet_header:
        for i, r in enumerate(items):
            if not isint(r[0]):
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            r.pop(0)
        defacto_alphabet = ''.join(header)
    else:
        letters = []
        for i, r in enumerate(items):
            if not ischar(r[0]):
                raise ValueError(
                    "Expected alphabet character as first item on line %d"
                    % i)
            letters.append(r.pop(0))
        defacto_alphabet = ''.join(letters)

    # Check defacto_alphabet
    defacto_alphabet = Alphabet(defacto_alphabet)

    if alphabet:
        if not defacto_alphabet.alphabetic(alphabet):
            raise ValueError("Incompatible alphabets: %s , %s (defacto)" % (
                alphabet, defacto_alphabet))
    else:
        alphabets = (unambiguous_rna_alphabet,
                     unambiguous_dna_alphabet,
                     unambiguous_protein_alphabet,
                     )
        for a in alphabets:
            if defacto_alphabet.alphabetic(a):
                alphabet = a
                break
        if not alphabet:
            alphabet = defacto_alphabet

    # The last item of each row may be extra cruft. Remove
    if len(items[0]) == len(header) + 1:
        for r in items:
            r.pop()

    # items should now be a list of lists of numbers (as strings)
    matrix = na.array([[float(value) for value in row] for row in items],
                      dtype=na.float64)

    if position_header:
        # transpose() returns a new view rather than working in place, so
        # the result has to be kept.
        matrix = matrix.transpose()

    return Motif(defacto_alphabet, matrix).reindex(alphabet)
try: return object.__getattr__(self, name) except AttributeError: return getattr(self.array, name) def __setattr__(self, name, value): try: return object.__setattr__(self, name, value) except AttributeError: return setattr(self.array, name, value) # End class AlphabeticArray #TODO: move to seq? submatrix_alphabet = Alphabet("ARNDCQEGHILKMFPSTWYVBZX") class SubMatrix(AlphabeticArray): """A two dimensional array indexed by an Alphabet. Used to hold substitution matrices and similar information. Various standard substitution matrices are available from the data package >>> from corebio import data >>> mat = SubMatrix.read(data.data_stream('blosum100')) Attr: - alphabet -- An Alphabet - array -- A numpy array - name -- The name of this matrix (if any) as a string. - description -- The description, if any.