def create_logo(self, seqs=()):
    """Create a sequence logo for the input sequences.

    Args:
        seqs: Iterable of ``(header, sequence)`` pairs. Only the sequences
            are used to build the logo; the headers are discarded.

    Returns:
        The output of the weblogo formatter selected by
        ``self.output_format`` ('png', 'png_print' or 'jpeg'; any other
        value falls back to the EPS formatter).

    Raises:
        ValueError: If ``seqs`` is empty.
    """
    pairs = list(seqs)
    if not pairs:
        # Fail with a readable message instead of an opaque unpacking error.
        raise ValueError(
            "create_logo() needs at least one (header, sequence) pair")

    # Separate the headers from the sequences; only the latter are used.
    _headers, instances = [list(column) for column in zip(*pairs)]

    # Compare by value: identity (`is`) on string literals only works by
    # accident of interning and silently selected the DNA alphabet otherwise.
    sequence_type = self.options.sequence_type
    if sequence_type == 'rna':
        alphabet = Alphabet('ACGU')
    elif sequence_type == 'protein':
        alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
    else:
        alphabet = Alphabet('AGCT')

    motif_corebio = SeqList(alist=instances, alphabet=alphabet)
    data = wbl.LogoData().from_seqs(motif_corebio)
    logo_format = wbl.LogoFormat(data, self.options)

    if self.output_format == 'png':
        return wbl.png_formatter(data, logo_format)
    elif self.output_format == 'png_print':
        return wbl.png_print_formatter(data, logo_format)
    elif self.output_format == 'jpeg':
        return wbl.jpeg_formatter(data, logo_format)
    else:
        return wbl.eps_formatter(data, logo_format)
def __init__(self, alphabet, array=None, typeof=None, name=None,
             description=None, scale=None):
    """Initialise a matrix that uses the same alphabet on both axes.

    Args:
        alphabet: Used for both dimensions of the underlying
            AlphabeticArray and stored, wrapped in ``Alphabet``, as
            ``self.alphabet``.
        array: Passed to ``AlphabeticArray.__init__`` as the values.
        typeof: Passed to ``AlphabeticArray.__init__`` as the type code.
        name: Stored as ``self.name``.
        description: Stored as ``self.description``.
        scale: Stored as ``self.scale``.
    """
    both_axes = (alphabet, alphabet)
    AlphabeticArray.__init__(self, both_axes, array, typeof)
    self.alphabet = Alphabet(alphabet)
    # Plain metadata, stored as given.
    self.name, self.description, self.scale = name, description, scale
def complement(self):
    """Complement nucleic acid sequence.

    Works in place: ``self.array`` is replaced by the array of the
    reindexed motif. Nothing is returned.
    """
    from corebio.seq import Seq, Alphabet

    alphabet = self.alphabet
    complement_alphabet = Alphabet(Seq(alphabet, alphabet).complement())

    # Temporarily relabel the alphabetic axis with the complemented letters,
    # then reindex back into the original letter order.
    self.alphabets = (None, complement_alphabet)
    try:
        m = self.reindex(alphabet)
    finally:
        # Always restore the labels, so a failed reindex cannot leave the
        # object tagged with the complement alphabet.
        self.alphabets = (None, alphabet)
    self.array = m.array
def __init__(self, alphabets, values=None, dtype=None):
    """Build the array and the per-dimension alphabets.

    Args:
        alphabets: A list with one entry per dimension. A string or
            Alphabet entry makes that dimension alphabetic, and its length
            must match the array. An integer gives the length of a
            non-alphabetic dimension. None marks a non-alphabetic
            dimension whose length is taken from ``values``.
        values: An array of values to be indexed. If None a new
            zero-filled array is created, in which case no entry of
            ``alphabets`` may be None.
        dtype: An optional numpy type code.

    Raises:
        ValueError: If ``values`` does not match the shape implied by
            ``alphabets``.
    """

    class NullAlphabet(object):
        """Placeholder for a dimension declared as None.

        Indexing such a dimension with a string then fails with a
        meaningful message.
        """

        def ord(self, key):
            raise IndexError('This dimension does not have an alphabet')

        def ords(self, key):
            raise IndexError('This dimension does not have an alphabet')

    # One (expected length, alphabet object) pair per dimension.
    dims = []
    for entry in alphabets:
        if isinstance(entry, str):
            entry = Alphabet(entry)

        if entry is None:
            dims.append((None, NullAlphabet()))
        elif isinstance(entry, Alphabet):
            dims.append((len(entry), entry))
        else:
            dims.append((int(entry), None))

    shape = tuple(length for length, _ in dims)

    if values is not None:
        values = na.asarray(values, dtype=dtype)
    else:
        values = na.zeros(shape=shape, dtype=dtype)

    actual = values.shape
    if len(actual) != len(shape):
        raise ValueError("The values array is the wrong shape.")
    # A None length means "take whatever the values array has".
    if any(want is not None and want != got
           for want, got in zip(shape, actual)):
        raise ValueError("The values array is the wrong shape.")

    self.array = values
    self.alphabets = tuple(alphabet for _, alphabet in dims)
def read_seq_data(fin, input_parser=seq_io.read, alphabet=None,
                  ignore_lower_case=False, max_file_size=0):
    """Read sequence data from the input stream and return a seqs object.

    The environment variable WEBLOGO_MAX_FILE_SIZE overrides the
    max_file_size argument. Used to limit the load on the WebLogo webserver.

    Args:
        fin: Readable input stream.
        input_parser: Callable that turns the stream into a sequence list.
        alphabet: Alphabet to attach to the result; when falsy it is
            chosen with ``Alphabet.which``.
        ignore_lower_case: If true, each sequence is replaced by its
            ``mask()``.
        max_file_size: Size limit passed to ``fin.read``; 0 disables it.

    Raises:
        IOError: If the stream holds more data than the size limit.
        ValueError: If the parser returns None or no sequences.
    """
    limit = int(os.environ.get("WEBLOGO_MAX_FILE_SIZE", max_file_size))

    # With a size limit, or when reading stdin (which is non-seekable),
    # buffer the data and continue from an in-memory StringIO.
    if limit > 0:
        payload = fin.read(limit)
        # Anything still readable past the limit means the input is too big.
        if fin.read(2) != "":
            raise IOError("File exceeds maximum allowed size: %d bytes"
                          % limit)
        fin = StringIO(payload)
    elif fin == sys.stdin:
        fin = StringIO(fin.read())
    fin.seek(0)

    seqs = input_parser(fin)
    if seqs is None or len(seqs) == 0:
        raise ValueError("Please provide a multiple sequence alignment")

    if ignore_lower_case:
        # Case is significant. Do not count lower case letters.
        for position, sequence in enumerate(seqs):
            seqs[position] = sequence.mask()

    # Attach the alphabet: the requested one, or else the one that
    # Alphabet.which picks for the data.
    seqs.alphabet = Alphabet(alphabet) if alphabet else Alphabet.which(seqs)
    return seqs
def write_weblogo(self, filepath):
    """Render this matrix as a print-quality PNG sequence logo.

    The logo is built from ``self.values`` (one row per distribution) and
    ``self.alphabet``, titled "PWM", and written to ``filepath``.

    Args:
        filepath: Path of the file to create; an existing file is
            overwritten.
    """
    # One row per distribution in self.values.
    counts = np.array(
        tuple(tuple(distribution) for distribution in self.values))
    alph = Alphabet(''.join(self.alphabet))
    weblogo_data = LogoData.from_counts(alph, counts)

    weblogo_options = LogoOptions(color_scheme=classic)
    weblogo_options.title = "PWM"
    weblogo_format = LogoFormat(weblogo_data, weblogo_options)

    # Render before touching the file, so a formatter failure does not
    # leave an empty or truncated file behind.
    image = png_print_formatter(weblogo_data, weblogo_format)

    # PNG is binary data, so open in binary mode. The context manager
    # closes the handle even if the write fails.
    with open(filepath, 'wb') as weblogo_file:
        weblogo_file.write(image)
def read_seq_data(fin, input_parser=seq_io.read, alphabet=None,
                  ignore_lower_case=False, max_file_size=0):
    """Parse sequences from ``fin`` and return them with an alphabet set.

    The environment variable WEBLOGO_MAX_FILE_SIZE, when present, takes
    precedence over the ``max_file_size`` argument. It exists to limit the
    load on the WebLogo webserver.

    Raises:
        IOError: If more data is available than the size limit allows.
        ValueError: If ``input_parser`` yields None or an empty result.
    """
    size_cap = int(os.environ.get("WEBLOGO_MAX_FILE_SIZE", max_file_size))

    if size_cap <= 0:
        # No limit. stdin is non-seekable, so read it fully into memory.
        if fin == sys.stdin:
            fin = StringIO(fin.read())
    else:
        head = fin.read(size_cap)
        tail = fin.read(2)
        # Any data beyond the cap means the input is over the limit.
        if tail != "":
            raise IOError(
                "File exceeds maximum allowed size: %d bytes" % size_cap)
        fin = StringIO(head)
    fin.seek(0)

    seqs = input_parser(fin)
    if seqs is None or len(seqs) == 0:
        raise ValueError("Please provide a multiple sequence alignment")

    if ignore_lower_case:
        # Case is significant. Do not count lower case letters.
        for idx, entry in enumerate(seqs):
            seqs[idx] = entry.mask()

    # Add alphabet to seqs: the given one, or else the one Alphabet.which
    # picks for the data.
    if not alphabet:
        seqs.alphabet = Alphabet.which(seqs)
    else:
        seqs.alphabet = Alphabet(alphabet)
    return seqs
>>> from corebio.secstruc import * >>> record = dssp.DsspRecord( open('test_corebio/data/1crn.dssp') ) >>> record.secondary() ' EE SSHHHHHHHHHHHTTT HHHHHHHHS EE SSS GGG ' >>> fa_reduce_secstruc_to_ehl(record.secondary()) 'LEELLLHHHHHHHHHHHLLLLLHHHHHHHHLLEELLLLLLLLLLLL' """ __all__ = ['dssp', 'stride','secstruc_alphabet','secstruc_ehl_alphabet', 'fa_reduce_secstruc_to_ehl', 'ehl_reduce_secstruc_to_ehl'] from corebio.seq import Alphabet, Seq from corebio.transform import Transform # ------------------- SECONDARY STRUCTURE ALPHABETS ------------------- secstruc_alphabet = Alphabet("HGIEBbTSC _-L?X") secstruc_ehl_alphabet = Alphabet("EHLX") fa_reduce_secstruc_to_ehl = \ Transform( Seq("HGIEBbTSC _-L?X", secstruc_alphabet), Seq("HLLELLLLLLLLLXX", secstruc_ehl_alphabet) ) ehl_reduce_secstruc_to_ehl = \ Transform( Seq("HGIEBbTSC _-L?X", secstruc_alphabet), Seq("HHHEEELLLLLLLXX", secstruc_ehl_alphabet) )
def read_transfac(cls, fin, alphabet=None):
    """Parse a TRANSFAC-format PWM from a file.

    Args:
        fin: Iterable of text lines.
        alphabet: Optional alphabet, passed to ``Alphabet.infer_alphabet``
            together with the alphabet found in the file.

    Returns:
        A Motif object, representing the provided PWM along with an
        inferred or provided alphabet.

    Raises:
        ValueError: If fewer than two usable lines are found, the header
            is missing or malformed, rows differ in length, or a row
            label is not of the expected kind.
    """
    items = []

    start = False
    for line in fin:
        if line.isspace() or line[0] == '#':
            continue

        stuff = line.split()

        if stuff[0] == 'PO' or stuff[0] == 'P0':
            start = True

        # 'XX' delimiters may precede the first motif
        if start:
            if stuff[0] in cls._TRANSFAC_DELIM_LINES:
                break
            items.append(stuff)

    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)
    hcols = len(header)
    cols = len(items[0])
    if not (header[0] == 'PO' or header[0] == 'P0' or
            hcols == cols - 1 or hcols == cols - 2):
        raise ValueError("Missing header line!")

    # Do all lines (except the first) contain the same number of items?
    for i in range(1, len(items)):
        if cols != len(items[i]):
            raise ValueError("Inconsistant length, row: {}".format(i))

    # Vertical or horizontal arrangement?
    if header[0] == 'PO' or header[0] == 'P0':
        header.pop(0)

    # The header holds positions only if every item is an integer;
    # otherwise it is taken to hold the alphabet.
    position_header = True
    for h in header:
        if not ischar(h):
            raise ValueError("Expected a single character per header "
                             "item, but got \"{}\" as one item".format(h))
        if not isint(h):
            position_header = False

    alphabet_header = not position_header

    # Check row headers
    if alphabet_header:
        for i, r in enumerate(items):
            if not isint(r[0]) and r[0][0] != 'P':
                raise ValueError("Expected position "
                                 "as first item on line {}".format(i))
            r.pop(0)
        defacto_alphabet = ''.join(header)
    else:
        a = []
        for i, r in enumerate(items):
            if not ischar(r[0]) and r[0][0] != 'P':
                raise ValueError("Expected position "
                                 "as first item on line {}".format(i))
            a.append(r.pop(0))
        defacto_alphabet = ''.join(a)

    # check the de facto alphabet, guessing the correct one
    inferred_alphabet = Alphabet.infer_alphabet(alphabet,
                                                Alphabet(defacto_alphabet))

    # The last item of each row may be extra cruft. Remove
    if len(items[0]) == len(header) + 1:
        for r in items:
            r.pop()

    # items should now be a list of lists of numbers (as strings)
    matrix = na.array([[float(value) for value in row] for row in items],
                      dtype=na.float64)

    if position_header:
        # ndarray.transpose() returns a new view and leaves the original
        # untouched, so the result has to be kept. Rows must end up as
        # positions and columns as letters.
        matrix = matrix.transpose()

    # returns Motif with the de facto alphabet, if alphabet is set to do so
    return Motif(defacto_alphabet, matrix).reindex(inferred_alphabet)
try: return object.__getattr__(self, name) except AttributeError: return getattr(self.array, name) def __setattr__(self, name, value): try: return object.__setattr__(self, name, value) except AttributeError: return setattr(self.array, name, value) # End class AlphabeticArray # TODO: move to seq? submatrix_alphabet = Alphabet("ARNDCQEGHILKMFPSTWYVBZX") class SubMatrix(AlphabeticArray): """A two dimensional array indexed by an Alphabet. Used to hold substitution matrices and similar information. Various standard substitution matrices are available from the data package >>> from corebio import data >>> mat = SubMatrix.read(data.data_stream('blosum100')) Attr: - alphabet -- An Alphabet - array -- A numpy array - name -- The name of this matrix (if any) as a string. - description -- The description, if any.
def read_transfac(fin, alphabet=None):
    """Parse a sequence matrix from a file.

    Reading starts at the first line whose first field is 'PO' or 'P0'
    and stops at the first 'XX' or '//' line.

    Args:
        fin: Iterable of text lines.
        alphabet: Optional alphabet the matrix must be compatible with.
            When omitted, the unambiguous RNA, DNA and protein alphabets
            are tried in that order, falling back to the alphabet found
            in the file.

    Returns:
        A Motif built with the alphabet found in the file and reindexed
        to the chosen alphabet.

    Raises:
        ValueError: If the input is empty, has no usable header, has rows
            of differing length, has unexpected row labels, or uses an
            alphabet incompatible with ``alphabet``.
    """
    items = []

    start = True
    for line in fin:
        if line.isspace() or line[0] == '#':
            continue
        stuff = line.split()
        if start and stuff[0] != 'PO' and stuff[0] != 'P0':
            continue
        if stuff[0] == 'XX' or stuff[0] == '//':
            break
        start = False
        items.append(stuff)

    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)
    hcols = len(header)
    cols = len(items[0])
    if not (header[0] == 'PO' or header[0] == 'P0' or
            hcols == cols - 1 or hcols == cols - 2):
        raise ValueError("Missing header line!")

    # Do all lines (except the first) contain the same number of items?
    for i in range(1, len(items)):
        if cols != len(items[i]):
            raise ValueError("Inconsistant length, row %d: " % i)

    # Vertical or horizontal arrangement?
    if header[0] == 'PO' or header[0] == 'P0':
        header.pop(0)

    position_header = True
    alphabet_header = True
    for h in header:
        if not isint(h):
            position_header = False
        if not str.isalpha(h):
            alphabet_header = False

    if not position_header and not alphabet_header:
        raise ValueError("Can't parse header: %s" % str(header))

    if position_header and alphabet_header:
        raise ValueError("Can't parse header")

    # Check row headers
    if alphabet_header:
        for i, r in enumerate(items):
            if not isint(r[0]):
                # The message must be %-formatted: the former
                # `raise E, msg, i` form treated `i` as a traceback.
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            r.pop(0)
        defacto_alphabet = ''.join(header)
    else:
        a = []
        for i, r in enumerate(items):
            if not ischar(r[0]):
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            a.append(r.pop(0))
        defacto_alphabet = ''.join(a)

    # Check defacto_alphabet
    defacto_alphabet = Alphabet(defacto_alphabet)

    if alphabet:
        if not defacto_alphabet.alphabetic(alphabet):
            raise ValueError("Incompatible alphabets: %s , %s (defacto)" % (
                alphabet, defacto_alphabet))
    else:
        alphabets = (unambiguous_rna_alphabet,
                     unambiguous_dna_alphabet,
                     unambiguous_protein_alphabet,
                     )
        for a in alphabets:
            if defacto_alphabet.alphabetic(a):
                alphabet = a
                break
        if not alphabet:
            alphabet = defacto_alphabet

    # The last item of each row may be extra cruft. Remove
    if len(items[0]) == len(header) + 1:
        for r in items:
            r.pop()

    # items should now be a list of lists of numbers (as strings)
    matrix = na.array([[float(value) for value in row] for row in items],
                      dtype=na.float64)

    if position_header:
        # transpose() returns a new view rather than working in place,
        # so the result has to be kept.
        matrix = matrix.transpose()

    return Motif(defacto_alphabet, matrix).reindex(alphabet)
def read_transfac(fin, alphabet=None):
    """Parse a sequence matrix from a file.

    Lines before the first 'PO'/'P0' line are skipped, and parsing stops
    at the first 'XX' or '//' line. Blank lines and lines starting with
    '#' are ignored.

    Args:
        fin: Iterable of text lines.
        alphabet: Optional alphabet the matrix must be compatible with.
            If not given, the unambiguous RNA, DNA and protein alphabets
            are tried in turn, then the alphabet found in the file.

    Returns:
        A Motif created with the alphabet found in the file and reindexed
        to the selected alphabet.

    Raises:
        ValueError: On empty input, a missing or unparsable header, rows
            of inconsistent length, unexpected row labels, or an
            incompatible ``alphabet``.
    """
    items = []

    start = True
    for line in fin:
        if line.isspace() or line[0] == '#':
            continue
        stuff = line.split()
        if start and stuff[0] != 'PO' and stuff[0] != 'P0':
            continue
        if stuff[0] == 'XX' or stuff[0] == '//':
            break
        start = False
        items.append(stuff)

    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)
    hcols = len(header)
    cols = len(items[0])
    if not (header[0] == 'PO' or header[0] == 'P0' or
            hcols == cols - 1 or hcols == cols - 2):
        raise ValueError("Missing header line!")

    # Do all lines (except the first) contain the same number of items?
    for i in range(1, len(items)):
        if cols != len(items[i]):
            raise ValueError("Inconsistant length, row %d: " % i)

    # Vertical or horizontal arrangement?
    if header[0] == 'PO' or header[0] == 'P0':
        header.pop(0)

    position_header = True
    alphabet_header = True
    for h in header:
        if not isint(h):
            position_header = False
        if not str.isalpha(h):
            alphabet_header = False

    if not position_header and not alphabet_header:
        raise ValueError("Can't parse header: %s" % str(header))

    if position_header and alphabet_header:
        raise ValueError("Can't parse header")

    # Check row headers
    if alphabet_header:
        for i, r in enumerate(items):
            if not isint(r[0]) and r[0][0] != 'P':
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            r.pop(0)
        defacto_alphabet = ''.join(header)
    else:
        a = []
        for i, r in enumerate(items):
            if not ischar(r[0]) and r[0][0] != 'P':
                raise ValueError(
                    "Expected position as first item on line %d" % i)
            a.append(r.pop(0))
        defacto_alphabet = ''.join(a)

    # Check defacto_alphabet
    defacto_alphabet = Alphabet(defacto_alphabet)

    if alphabet:
        if not defacto_alphabet.alphabetic(alphabet):
            raise ValueError("Incompatible alphabets: %s , %s (defacto)"
                             % (alphabet, defacto_alphabet))
    else:
        alphabets = (
            unambiguous_rna_alphabet,
            unambiguous_dna_alphabet,
            unambiguous_protein_alphabet,
        )
        for a in alphabets:
            if defacto_alphabet.alphabetic(a):
                alphabet = a
                break
        if not alphabet:
            alphabet = defacto_alphabet

    # The last item of each row may be extra cruft. Remove
    if len(items[0]) == len(header) + 1:
        for r in items:
            r.pop()

    # items should now be a list of lists of numbers (as strings)
    matrix = na.array([[float(value) for value in row] for row in items],
                      dtype=na.float64)

    if position_header:
        # The transposed view must be assigned back: ndarray.transpose()
        # does not modify the array it is called on.
        matrix = matrix.transpose()

    return Motif(defacto_alphabet, matrix).reindex(alphabet)
This module provides an interface to STRIDE, a program used to recognize secondary structural elements in proteins from their atomic coordinates. Refs: - http://wolf.bi.umist.ac.uk/unix/stride.html """ from corebio.seq import Seq, protein_alphabet, Alphabet from corebio.db.astral import to_one_letter_code from subprocess import * from StringIO import StringIO from corebio.utils import stdrepr, find_command # alphabet for stride secondary structure stride_alphabet = Alphabet("HGIEBC12345678@&T") # Dictionary for conversion between names and alphabet stride_alphabet_names = { "H": "AlphaHelix", "G": "310Helix", "I": "PiHelix", "E": "Strand", "b": "Bridge", "B": "Bridge", "C": "Coil", "1": "TurnI", "2": "TurnI'", "3": "TurnII", "4": "TurnII'", "5": "TurnVIa",
def read_swissRegulon(fin, alphabet=None):
    """Parse a SwissRegulon-style position matrix from a file.

    Reading starts at the first line whose first field is 'P0' or 'PO'.
    Blank lines, lines starting with '#' and '//' lines are skipped. Only
    the first five fields of each line are kept.

    Args:
        fin: Iterable of text lines.
        alphabet: Optional alphabet the matrix must be compatible with.
            If not given, the unambiguous RNA, DNA and protein alphabets
            are tried in turn, then the alphabet found in the header.

    Returns:
        A Motif created with the alphabet found in the header and
        reindexed to the selected alphabet.

    Raises:
        ValueError: On empty input, a missing or unparsable header, a
            header that lists positions instead of letters, rows of
            inconsistent length, unexpected row labels, or an
            incompatible ``alphabet``.
    """
    items = []

    start = True
    for line in fin:
        # `line in ('//', '//\n')` matches exactly what the former
        # re.search('^//$', line) matched.
        if line.isspace() or line[0] == '#' or line in ('//', '//\n'):
            continue
        stuff = line.split()
        if start and stuff[0] != 'P0' and stuff[0] != 'PO':
            continue
        start = False
        items.append(stuff[0:5])

    if len(items) < 2:
        raise ValueError("Vacuous file.")

    # Is the first line a header line?
    header = items.pop(0)
    hcols = len(header)
    cols = len(items[0])
    if not (header[0] == 'P0' or header[0] == 'PO' or
            hcols == cols - 1 or hcols == cols - 2):
        raise ValueError("Missing header line!")

    # Do all lines (except the first) contain the same number of items?
    for i in range(1, len(items)):
        if cols != len(items[i]):
            raise ValueError("Inconsistant length, row %d: " % i)

    # Vertical or horizontal arrangement?
    if header[0] == 'PO' or header[0] == 'P0':
        header.pop(0)

    position_header = True
    alphabet_header = True
    for h in header:
        if not isint(h):
            position_header = False
        if not str.isalpha(h):
            alphabet_header = False

    if not position_header and not alphabet_header:
        raise ValueError("Can't parse header: %s" % str(header))

    if position_header and alphabet_header:
        raise ValueError("Can't parse header")

    if not alphabet_header:
        # A header of positions was never handled here: it used to fail
        # further down with an UnboundLocalError on `defacto_alphabet`.
        # Report it as a parse error instead.
        raise ValueError(
            "Unsupported layout, header lists positions: %s" % str(header))

    # Check row headers
    for i, r in enumerate(items):
        if not isint(r[0]) and r[0][0] != 'P':
            raise ValueError(
                "Expected position as first item on line %d" % i)
        r.pop(0)
    defacto_alphabet = Alphabet(''.join(header))

    # Check defacto_alphabet
    if alphabet:
        if not defacto_alphabet.alphabetic(alphabet):
            raise ValueError("Incompatible alphabets: %s , %s (defacto)"
                             % (alphabet, defacto_alphabet))
    else:
        alphabets = (unambiguous_rna_alphabet,
                     unambiguous_dna_alphabet,
                     unambiguous_protein_alphabet,
                     )
        for a in alphabets:
            if defacto_alphabet.alphabetic(a):
                alphabet = a
                break
        if not alphabet:
            alphabet = defacto_alphabet

    # items should now be a list of lists of numbers (as strings).
    # Rows are positions and columns are letters, so no transpose is needed.
    matrix = na.array([[float(value) for value in row] for row in items],
                      dtype=na.float64)

    return Motif(defacto_alphabet, matrix).reindex(alphabet)
"""Command and control of the program DSSP: Dictionary of protein secondary structure. The program DSSP defines secondary structure, geometrical features and solvent exposure of proteins, given atomic coordinates in Protein Data Bank (PDB) format. See also : - http://swift.cmbi.ru.nl/gv/dssp/ """ from corebio.seq import Seq, protein_alphabet, Alphabet from corebio.utils import stdrepr, find_command from subprocess import * from StringIO import StringIO # alphabet for stride secondary structure dssp_alphabet = Alphabet("HBEGITS ") # Dictionary for conversion between alphabet and secondary structure names dssp_alphabet_names = { 'H' : 'alpha helix', 'B' : 'residue in isolated beta-bridge', 'E' : 'extended strand, participates in beta ladder', 'G' : '3-helix (3/10 helix)', 'I' : '5 helix (pi helix)', 'T' : 'hydrogen bonded turn', 'S' : 'bend', ' ' : 'loop or irregular', } _dssp_header = "==== Secondary Structure Definition by the program DSSP, updated CMBI version by ElmK / April 1,2000"
GeneticCode( 23, "Thraustochytrium Mitochondrial", "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "--------------------------------M--M---------------M------------", "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG", "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG", "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG", ), ) reduced_protein_alphabets = { # "LiB2": Transform(Seq("CFYWMLIV-GPATSNHQEDRKX*-", std_protein_alphabet), Seq("IIIIIIII-SSSSSSSSSSSSX*-", Alphabet("ISX*-")), "Li et al (2003), table II, group 2"), # "LiB3": Transform(Seq("CFYWMLIV-GPATS-NHQEDRKX*-", std_protein_alphabet), Seq("IIIIIIII-SSSSS-EEEEEEEX*-", Alphabet("ISEX*-")), "Li et al (2003), table II, group 3"), # "LiB4": Transform(Seq("CFYW-MLIV-GPATS-NHQEDRKX*-", std_protein_alphabet), Seq("YYYY-IIII-SSSSS-EEEEEEEX*-", Alphabet("YISEX*-")), "Li et al (2003), table II, group 4"), # "LiB5": Transform(Seq("CFYW-MLIV-G-PATS-NHQEDRKX*-", std_protein_alphabet), Seq("YYYY-IIII-G-SSSS-EEEEEEEX*-", Alphabet("YIGSEX*-")),