def __init__(self, prot_sequence, monoisotopic=False):
    """Store the protein sequence and initialise the instance attributes.

    prot_sequence - protein sequence as a string.  A string that is
                    entirely lower case is converted to upper case; any
                    other string (including mixed case) is stored as given.
    monoisotopic  - value stored unchanged as self.monoisotopic.
    """
    if prot_sequence.islower():
        letters = prot_sequence.upper()
    else:
        letters = prot_sequence
    self.sequence = Seq(letters, IUPAC.protein)
    self.length = len(self.sequence)
    self.monoisotopic = monoisotopic
    # Both composition attributes start out unset.
    self.amino_acids_content = None
    self.amino_acids_percent = None
def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
    """Generator function yielding one SeqRecord per FASTA entry in handle.

    handle - input file
    alphabet - optional alphabet, passed to every Seq that is created
    title2ids - optional function that, given the title line of a record
                (without the leading >), returns the id, name and
                description (in that order) as a tuple of strings.

    Without title2ids the whole title line becomes the description and
    its first whitespace-separated word becomes both the id and the name:

    >>> with open("Fasta/dups.fasta") as handle:
    ...     for record in FastaIterator(handle):
    ...         print(record.id)
    ...
    alpha
    beta
    gamma
    alpha
    delta

    However, you can supply a title2ids function to alter this:

    >>> def take_upper(title):
    ...     return title.split(None, 1)[0].upper(), "", title
    >>> with open("Fasta/dups.fasta") as handle:
    ...     for record in FastaIterator(handle, title2ids=take_upper):
    ...         print(record.id)
    ...
    ALPHA
    BETA
    GAMMA
    ALPHA
    DELTA

    """
    for title, sequence in SimpleFastaParser(handle):
        if title2ids:
            record_id, record_name, description = title2ids(title)
        else:
            words = title.split(None, 1)
            if words:
                record_id = words[0]
            else:
                # No word at all is only expected for an empty title.
                assert not title, repr(title)
                # Should we use SeqRecord default for no ID?
                record_id = ""
            record_name = record_id
            description = title
        yield SeqRecord(Seq(sequence, alphabet),
                        id=record_id, name=record_name,
                        description=description)
def _from_jaspar_sites(self, stream):
    """Read the motif instances from a JASPAR .sites stream.

    The stream is read as pairs of lines: a header line starting with
    ">" followed by a sequence line.  Reading stops at end of file or at
    the first header position that does not start with ">".

    From every sequence line the leading run of characters that equal
    their own lower-case form is skipped, and the following run of
    characters that equal their own upper-case form is added as an
    instance.  The mask is then set to one "*" per letter of the last
    instance read.

    Returns self.

    Raises ValueError if the stream contains no record at all (there is
    then no instance to derive the mask length from).
    """
    inst = None
    while True:
        header = stream.readline()  # read the header ">...."
        if header == "" or header[0] != ">":
            break

        ln = stream.readline().strip()  # read the actual sequence
        i = 0
        # Bounds check: a line without any upper-case letter must not
        # run off the end of the string.
        while i < len(ln) and ln[i] == ln[i].lower():
            i += 1
        start = i
        while i < len(ln) and ln[i] == ln[i].upper():
            i += 1
        inst = Seq(ln[start:i], self.alphabet)
        self.add_instance(inst)

    if inst is None:
        raise ValueError("No motif instances found in JASPAR sites data")
    self.set_mask("*" * len(inst))
    return self
def make_instances_from_counts(self):
    """Create "fake" instances for a motif created from a count matrix.

    Every column of the count matrix is expanded into a string holding
    each letter repeated by its count.  The expected column height is the
    sum of the counts in the first column; shorter columns trigger a
    warning on stdout and are padded by cycling through the alphabet
    letters.  Row i of the resulting columns becomes instance i.

    Sets self.has_instances and resets self.instances before adding the
    new instances; returns self.instances.
    """
    letters = self.alphabet.letters
    padding_source = "".join(letters)
    self.has_instances = True
    self.instances = []
    height = sum(self.counts[nuc][0] for nuc in letters)

    # columns[i] is a column taken from aligned motif instances
    columns = []
    for position in range(self.length):
        column = "".join(nuc * self.counts[nuc][position] for nuc in letters)
        if len(column) < height:
            print("WARNING, column too short %i %i" % (len(column), height))
            missing = height - len(column)
            column += (padding_source * height)[:missing]
        columns.append(column)

    # Read the columns row by row: one row is one instance.
    for row in range(height):
        text = "".join(column[row] for column in columns)
        self.add_instance(Seq(text, self.alphabet))
    return self.instances
def read(handle, format):
    """Read a single motif from a JASPAR file.

    handle - iterable of lines (an open file)
    format - "pfm" for a count matrix file or "sites" for a sites file

    For "pfm" the first four lines are taken as the count rows for A, C,
    G and T (in that order); a leading letter label on a row is ignored.
    For "sites" the handle is read as header/sequence line pairs until a
    line not starting with ">" is met; from each sequence line only the
    characters equal to their own upper-case form are kept.

    Returns the Motif, with its mask set to one "*" per position.

    Raises ValueError for any other format.
    """
    alphabet = IUPAC.unambiguous_dna
    counts = {}
    if format == "pfm":
        # reads the motif from Jaspar .pfm file
        letters = "ACGT"
        for letter, line in zip(letters, handle):
            words = line.split()
            # if there is a letter in the beginning, ignore it
            if words[0] == letter:
                words = words[1:]
            # A real list, not a lazy map object, so the counts can be
            # indexed and measured on Python 3 as well.
            counts[letter] = [float(word) for word in words]
        motif = Motif(alphabet, counts=counts)
    elif format == "sites":
        # reads the motif from Jaspar .sites file
        instances = []
        for line in handle:
            if not line.startswith(">"):
                break
            # line contains the header ">...."
            # now read the actual sequence; the next() builtin works
            # for Python 2 and Python 3 iterators alike.
            line = next(handle)
            instance = "".join(c for c in line.strip() if c == c.upper())
            instances.append(Seq(instance, alphabet))
        instances = Instances(instances, alphabet)
        motif = Motif(alphabet, instances=instances)
    else:
        raise ValueError("Unknown format %s" % format)
    motif.mask = "*" * motif.length
    return motif
def read(handle):
    """Parse AlignACE output from handle into a Record.

    The first two lines are stored verbatim as record.ver and
    record.cmd_line.  Every following line is dispatched on its content:

    - blank lines are ignored
    - "Para..." starts a new record.param_dict
    - "#..." appends the second tab-separated field to record.seq_dict
    - lines containing "=" store a parameter name/value pair
    - "Input..." resets record.seq_dict to an empty list
    - "Motif..." starts a new motif and appends it to record.motifs
    - "MAP..." stores the last word as the current motif's score
    - lines with four tab-separated fields add the first field as an
      instance of the current motif
    - lines containing "*" set the mask of the current motif

    Raises ValueError (with the offending line) for anything else.
    """
    record = Record()
    record.ver = next(handle)
    record.cmd_line = next(handle)
    for line in handle:
        if line.strip() == "":
            continue
        if line[:4] == "Para":
            record.param_dict = {}
        elif line[0] == "#":
            seq_name = line.split("\t")[1]
            record.seq_dict.append(seq_name)
        elif "=" in line:
            fields = line.split("=")
            record.param_dict[fields[0].strip()] = fields[1].strip()
        elif line[:5] == "Input":
            record.seq_dict = []
        elif line[:5] == "Motif":
            record.current_motif = Motif()
            record.motifs.append(record.current_motif)
            record.current_motif.alphabet = IUPAC.unambiguous_dna
        elif line[:3] == "MAP":
            record.current_motif.score = float(line.split()[-1])
        elif len(line.split("\t")) == 4:
            seq = Seq(line.split("\t")[0], IUPAC.unambiguous_dna)
            record.current_motif.add_instance(seq)
        elif "*" in line:
            # Strip the line terminator only.  The former "\n\c" was an
            # invalid escape: it stripped backslashes and the letter c
            # and left a trailing carriage return in the mask.
            record.current_motif.set_mask(line.strip("\r\n"))
        else:
            raise ValueError(line)
    return record
def read(self, input_handle):
    """Read patterns from the specified handle.

    Every line (trailing whitespace removed) is one pattern.  A line
    containing self.separator is split on it and stored as a tuple (a
    signature); any other line is stored as a plain string.

    When self._alphabet is not None every pattern part is checked
    against that alphabet and a ValueError is raised on a mismatch.

    Returns the list of patterns in file order.
    """
    patterns = []
    line = input_handle.readline()
    while line:
        pattern = line.rstrip()
        # split up signatures
        if self.separator in pattern:
            pattern = tuple(pattern.split(self.separator))

        if self._alphabet is not None:
            # Treat a single pattern as a one-element signature so both
            # kinds can be checked the same way.
            if isinstance(pattern, tuple):
                parts = pattern
            else:
                parts = [pattern]
            for part in parts:
                candidate = Seq(part, self._alphabet)
                if not _verify_alphabet(candidate):
                    raise ValueError(
                        "Pattern %s not matching alphabet %s"
                        % (pattern, self._alphabet))

        patterns.append(pattern)
        line = input_handle.readline()
    return patterns
def __init__(self, instances=None, alphabet=None):
    """Build the collection from instances of equal length.

    instances - iterable of Seq objects or plain strings (default: none).
    alphabet  - alphabet to use; when None it is taken from the first
                instance that has an .alphabet attribute.

    If no alphabet is found, or the alphabet has no letters, unambiguous
    DNA is assumed.  Instances that are not Seq objects are converted to
    Seq with the final alphabet before being appended.

    Raises ValueError if the instances differ in length or carry
    inconsistent alphabets.
    """
    from SAP.Bio.Alphabet import IUPAC
    from SAP.Bio.Seq import Seq
    # None instead of a shared mutable default; materialise the input
    # because it is walked twice below (a generator would otherwise be
    # exhausted after the validation pass).
    instances = [] if instances is None else list(instances)
    self.length = None
    for instance in instances:
        if self.length is None:
            self.length = len(instance)
        elif self.length != len(instance):
            message = "All instances should have the same length (%d found, %d expected)" % (
                len(instance), self.length)
            raise ValueError(message)
        try:
            a = instance.alphabet
        except AttributeError:
            # The instance is a plain string
            continue
        if alphabet is None:
            alphabet = a
        elif alphabet != a:
            raise ValueError("Alphabets are inconsistent")
    if alphabet is None or alphabet.letters is None:
        # If we didn't get a meaningful alphabet from the instances,
        # assume it is DNA.
        alphabet = IUPAC.unambiguous_dna
    for instance in instances:
        if not isinstance(instance, Seq):
            instance = Seq(str(instance), alphabet=alphabet)
        self.append(instance)
    self.alphabet = alphabet
def _read_sites(handle):
    """Read the motif from a JASPAR .sites file.

    The handle is consumed as header/sequence line pairs until a line
    that does not start with ">" is found.  From each sequence line only
    the characters equal to their own upper-case form are kept.

    Returns a Record holding the single resulting Motif, whose mask is
    one "*" per motif position.
    """
    alphabet = dna
    sites = []
    for header in handle:
        if not header.startswith(">"):
            break
        # header is the ">...." line; the sequence is on the next line
        sequence_line = next(handle)
        kept = "".join(c for c in sequence_line.strip() if c == c.upper())
        sites.append(Seq(kept, alphabet))

    instances = motifs.Instances(sites, alphabet)
    motif = Motif(matrix_id=None, name=None,
                  alphabet=alphabet, instances=instances)
    motif.mask = "*" * motif.length

    record = Record()
    record.append(motif)
    return record
def _elem_AAseq(self, node, record):
    """Parse protein sequence.

    Stores the text of the node's first child on record.seq as a generic
    protein Seq.  Raises ValueError when the node has no children or the
    first child's text is empty.
    """
    has_sequence = node.hasChildNodes() and len(node.firstChild.data) > 0
    if not has_sequence:
        raise ValueError("Sequence length should be greater than 0.")
    record.seq = Seq(node.firstChild.data, Alphabet.generic_protein)
def TabIterator(handle, alphabet=single_letter_alphabet):
    """Iterates over tab separated lines (as SeqRecord objects).

    Each line of the file should contain one tab only, dividing the line
    into an identifier and the full sequence.

    handle - input file
    alphabet - optional alphabet

    The first field is taken as the record's .id and .name (regardless of
    any spaces within the text) and the second field is the sequence.

    Any blank lines are ignored.

    Raises ValueError for a non-blank line that does not contain exactly
    one tab.
    """
    for line in handle:
        try:
            title, seq = line.split("\t")  # will fail if more than one tab!
        except ValueError:
            # Only the unpacking failure is handled here; anything else
            # (e.g. KeyboardInterrupt) must propagate unchanged.
            if line.strip() == "":
                # It's a blank line, ignore it
                continue
            raise ValueError("Each line should have one tab separating the"
                             " title and sequence, this line has %i tabs: %s"
                             % (line.count("\t"), repr(line)))
        title = title.strip()
        seq = seq.strip()  # removes the trailing new line
        yield SeqRecord(Seq(seq, alphabet),
                        id=title, name=title,
                        description="")
def to_generic(self, alphabet):
    """Retrieve generic alignment object for the given alignment.

    Instead of the tuples, this returns a MultipleSeqAlignment object,
    through which you can manipulate and query the alignment.

    alphabet is the specified alphabet for the sequences in the code
    (for example IUPAC.IUPACProtein).

    self.alignment is read as (name, start, seq, end) tuples.  Each entry
    named 'QUERY' opens a new block; the first block defines the rows and
    every later block extends the rows in order.

    Thanks to James Casbon for the code.
    """
    # TODO - Switch to new Bio.Align.MultipleSeqAlignment class?
    row_names = []
    row_texts = []
    block_count = 0
    row_index = 0
    for name, _start, fragment, _end in self.alignment:
        if name == 'QUERY':
            # QUERY is the first in each alignment block
            block_count += 1
            row_index = 0

        if block_count == 1:
            # first block: create the rows
            row_texts.append(fragment)
            row_names.append(name)
        else:
            # any other block: extend the existing rows
            row_texts[row_index] += fragment
            row_index += 1

    generic = MultipleSeqAlignment([], alphabet)
    for name, text in zip(row_names, row_texts):
        generic.append(SeqRecord(Seq(text, alphabet), name))
    return generic
def __init__(self, data='', alphabet=default_codon_alphabet,
             gap_char="-", rf_table=None):
    """Initialise the sequence from upper-cased data and a reading frame.

    rf_table should be a tuple or list giving the start of every codon
    along the sequence.  For example:

        sequence = 'AAATTTGGGCCAAATTT'
        rf_table = (0, 3, 6, 8, 11, 14)

    the translated protein sequences will be

        AAA TTT GGG GCC AAA TTT
        K   F   G   A   K   F

    Notice: a supplied rf_table applies to the ungapped sequence, which
    keeps it independent of where the codon sequence appears in an
    alignment.  When rf_table is None it defaults to every third
    position of the ungapped sequence.

    Raises ValueError if a codon is not among alphabet.letters.
    """
    Seq.__init__(self, data.upper(), alphabet=alphabet)
    self.gap_char = gap_char

    if rf_table is not None:
        # if gap_char in self._data:
        #     assert len(self) % 3 == 0, \
        #         "Gapped sequence length is not a triple number"
        assert isinstance(rf_table, (tuple, list)), \
            "rf_table should be a tuple or list object"
        assert all(isinstance(i, int) for i in rf_table), \
            "elements in rf_table should be int that specify " \
            + "the codon positions of the sequence"
        ungapped = self._data.replace(gap_char, "")
        for position in rf_table:
            codon = ungapped[position:position + 3]
            if codon not in alphabet.letters:
                raise ValueError("Sequence contain undefined letters "
                                 "from alphabet "
                                 "({0})!".format(codon))
        self.rf_table = rf_table
        return

    # No reading frame given: check the length to be a triple and use
    # every third position.
    ungapped = self._data.replace(gap_char, "")
    assert len(self) % 3 == 0, "Sequence length is not a triple number"
    self.rf_table = list(range(0, len(ungapped), 3))
    # check alphabet
    # Not use Alphabet._verify_alphabet function because it
    # only works for single alphabet
    # NOTE(review): these positions come from the ungapped length but are
    # applied to the gapped data - confirm this is intended.
    for position in self.rf_table:
        codon = self._data[position:position + 3]
        if codon not in alphabet.letters:
            raise ValueError("Sequence contain undefined letters from"
                             " alphabet "
                             "({0})! ".format(codon))
def _get_signature_dict(self, seq_records, sig_size, max_gap):
    """Return a dictionary with all signatures and their counts.

    This internal function does all of the hard work for the
    find_signatures function.

    A signature is a pair of sig_size-long fragments of one sequence.
    Every candidate pair is handed to self._add_sig together with the
    dictionary collected so far.  When self._alphabet_strict is set, the
    alphabet of the first record is required of every record and both
    fragments must pass _verify_alphabet to be counted.
    """
    if self._alphabet_strict:
        alphabet = seq_records[0].seq.alphabet
    else:
        alphabet = None
    strict = alphabet is not None

    # the span needed for the widest possible signature
    largest_sig_size = sig_size * 2 + max_gap

    all_sigs = {}
    for seq_record in seq_records:
        # if we are working with alphabets, make sure we are consistent
        if strict:
            assert seq_record.seq.alphabet == alphabet, \
                "Working with alphabet %s and got %s" % \
                (alphabet, seq_record.seq.alphabet)

        last_start = len(seq_record.seq) - (largest_sig_size - 1)
        for start in range(last_start):
            # the first part of the signature
            first_sig = str(seq_record.seq[start:start + sig_size])

            # all of the second parts of the signature
            for second in range(start + 1, (start + 1) + max_gap):
                second_sig = str(seq_record.seq[second:second + sig_size])

                if strict:
                    # both parts have to fall within the alphabet
                    first_seq = Seq(first_sig, alphabet)
                    second_seq = Seq(second_sig, alphabet)
                    accepted = _verify_alphabet(first_seq) \
                        and _verify_alphabet(second_seq)
                    if not accepted:
                        continue

                all_sigs = self._add_sig(all_sigs, (first_sig, second_sig))

    return all_sigs
def getSeqBySid(self, domain):
    """Get the sequence of a given domain from its sid.

    domain - the sid to look up.

    When self.db_handle is None the sequence is taken from
    self.fasta_dict[domain].seq; otherwise it is fetched from the
    "astral" table through a cursor on self.db_handle and wrapped in a
    Seq.
    """
    if self.db_handle is None:
        return self.fasta_dict[domain].seq

    cur = self.db_handle.cursor()
    # DB-API parameters must be a sequence (or mapping); a bare string
    # is itself a sequence of characters and is not portable across
    # drivers.
    cur.execute("SELECT seq FROM astral WHERE sid=%s", (domain,))
    return Seq(cur.fetchone()[0])
def gap_consensus(self, threshold=.7, ambiguous="X",
                  consensus_alpha=None, require_multiple=0):
    """Return a consensus Seq in which every character counts as an atom.

    No character is treated specially, so a gap character in the
    alignment can end up in the consensus like any other letter.

    For each column the most frequent character is used when it is the
    unique maximum and its share of the column is at least threshold;
    otherwise the ambiguous character is used.  With require_multiple
    set, a column containing a single character also yields the
    ambiguous character.  When consensus_alpha is None the alphabet is
    obtained from self._guess_consensus_alphabet(ambiguous).

    Things to do:
        - Let the user define that with only one gap, the result
          character in consensus is gap.
        - Let the user select gap character, now
          it takes the same is input.
    """
    # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X"
    columns = []

    # find the length of the consensus we are creating
    con_len = self.alignment.get_alignment_length()

    for n in range(con_len):
        # count the different atoms found in this column
        atom_dict = {}
        num_atoms = 0
        for record in self.alignment._records:
            # sequences may differ in length; skip the ones that have
            # already ended
            if n >= len(record.seq):
                continue
            atom = record.seq[n]
            atom_dict[atom] = atom_dict.get(atom, 0) + 1
            num_atoms += 1

        # all atoms sharing the highest count
        max_size = max(atom_dict.values()) if atom_dict else 0
        max_atoms = [atom for atom in atom_dict
                     if atom_dict[atom] == max_size]

        if require_multiple and num_atoms == 1:
            symbol = ambiguous
        elif len(max_atoms) == 1 and \
                float(max_size) / float(num_atoms) >= threshold:
            symbol = max_atoms[0]
        else:
            symbol = ambiguous
        columns.append(symbol)

    # we need to guess a consensus alphabet if one isn't specified
    if consensus_alpha is None:
        # TODO - Should we make this into a Gapped alphabet?
        consensus_alpha = self._guess_consensus_alphabet(ambiguous)

    return Seq(''.join(columns), consensus_alpha)
def get_sequence(self):
    """Return the AA sequence as a Seq object.

    Each residue name is translated with the three-to-one letter table
    in SCOPData; names missing from the table become 'X'.

    @return: polypeptide sequence
    @rtype: L{Seq}
    """
    to_one_letter = SCOPData.protein_letters_3to1
    letters = "".join(to_one_letter.get(residue.get_resname(), 'X')
                      for residue in self)
    return Seq(letters, generic_protein)
def add_sequence(self, descriptor, sequence, start=None, end=None,
                 weight=1.0):
    """Add a sequence to the alignment.

    This doesn't do any kind of alignment, it just adds in the sequence
    object, which is assumed to be prealigned with the existing
    sequences.

    Arguments:
     - descriptor - The descriptive id of the sequence being added.
                    This is used as the resulting SeqRecord's
                    .id property (and, for historical compatibility,
                    also the .description property)
     - sequence - A string with sequence info.
     - start - You can explicitly set the start point of the sequence.
               This is useful (at least) for BLAST alignments, which can
               just be partial alignments of sequences.
     - end - Specify the end of the sequence, which is important
             for the same reason as the start.
     - weight - The weight to place on the sequence in the alignment.
                By default, all sequences have the same weight. (0.0 =>
                no weight, 1.0 => highest weight)
    """
    # The SeqRecord's .id is the primary identifier (e.g. in Bio.SeqIO),
    # so it gets the descriptor; for backwards compatibility the
    # description gets it as well.
    new_record = SeqRecord(Seq(sequence, self._alphabet),
                           id=descriptor,
                           description=descriptor)

    # hack! Start, end and weight are kept in the generic annotations
    # dictionary until there is a better way to deal with annotations
    # and features.
    # NOTE(review): a start or end of 0 is falsy and is therefore not
    # stored - confirm this is intended.
    annotations = new_record.annotations
    if start:
        annotations['start'] = start
    if end:
        annotations['end'] = end
    annotations['weight'] = weight

    self._records.append(new_record)
def consensus(self):
    """Return the consensus sequence of a motif.

    For every position the letter with the highest value is taken; the
    letters are visited in sorted order and only a strictly higher value
    replaces the current choice.  "X" is used when no value exceeds 0.
    """
    letters = []
    for position in range(self.length):
        column = self[position]
        best_value = 0
        best_letter = "X"
        for letter in sorted(column):
            value = column[letter]
            if value > best_value:
                best_value = value
                best_letter = letter
        letters.append(best_letter)
    return Seq("".join(letters), self.alphabet)
def anticonsensus(self):
    """Return the least probable pattern to be generated from this motif.

    For every position the letter with the lowest value is taken; the
    letters are visited in sorted order and only a strictly lower value
    replaces the current choice.  "X" is used when no value is below
    10.0.
    """
    letters = []
    for position in range(self.length):
        column = self[position]
        lowest_value = 10.0
        lowest_letter = "X"
        for letter in sorted(column):
            value = column[letter]
            if value < lowest_value:
                lowest_value = value
                lowest_letter = letter
        letters.append(lowest_letter)
    return Seq("".join(letters), self.alphabet)
def _read(self, stream):
    """Read the motif from the stream (in AlignAce format).

    The self.alphabet variable must be set beforehand.  Every line up to
    the first one containing an asterisk is added as an instance; that
    asterisk line is used for setting the mask and ends the reading.
    Reading also ends at end of file, in which case no mask is set.
    """
    while True:
        ln = stream.readline()
        if not ln:
            # End of file without a mask line: stop instead of adding
            # empty instances forever.
            break
        if "*" in ln:
            # Strip the line terminator only.  The former "\n\c" was an
            # invalid escape: it stripped backslashes and the letter c
            # and left a trailing carriage return in the mask.
            self.set_mask(ln.strip("\r\n"))
            break
        self.add_instance(Seq(ln.strip(), self.alphabet))
def anticonsensus(self):
    """Return a Seq made of the lowest-count letter of every position.

    The letters of self.alphabet are visited in order and only a
    strictly lower count replaces the current choice.
    """
    try:
        infinity = float("inf")
    except ValueError:
        # On Python 2.5 or older that was handled in C code,
        # and failed on Windows XP 32bit
        infinity = 1E400

    chosen = []
    for position in range(self.length):
        minimum = infinity
        for letter in self.alphabet.letters:
            count = self[letter][position]
            if count < minimum:
                minimum = count
                sequence_letter = letter
        chosen.append(sequence_letter)
    return Seq("".join(chosen), self.alphabet)
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment)

    One row is built per key of the first position's pos_align_dict by
    concatenating the .aa of that key over positions 1..len(align_dict).
    Rows are appended in sorted key order and identified by the pdb2 and
    chain2 values of the matching sum_dict entry.
    """
    rows = {}
    for key in align_dict.abs(1).pos_align_dict:
        rows[key] = ''

    for position in range(1, len(align_dict) + 1):
        # loop on positions
        column = align_dict.abs(position).pos_align_dict
        for key in column:
            # loop within a position
            rows[key] += column[key].aa

    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for key in sorted(rows):
        summary = sum_dict[key]
        fssp_align.append(SeqRecord(Seq(rows[key], alpha),
                                    summary.pdb2 + summary.chain2))
    return fssp_align
def read(handle):
    """Parse AlignACE output from handle into a Record of motifs.

    The first two lines (stripped) become record.version and
    record.command.  Every following line is stripped and dispatched on
    its content:

    - blank lines are ignored
    - "Para..." starts a new record.parameters dictionary
    - "#..." appends the second tab-separated field to record.sequences
    - lines containing "=" store a parameter name/value pair
    - "Input..." resets record.sequences to an empty list
    - "Motif N" starts collecting instances for motif number N
    - lines with four tab-separated fields add the first field as an
      instance
    - lines containing "*" are remembered as the mask
    - "MAP..." builds the Motif from the collected instances, number and
      mask, takes the last word as its score and appends it to the record

    Raises ValueError (with the offending line) for anything else.
    """
    record = Record()
    record.version = next(handle).strip()
    record.command = next(handle).strip()

    for raw_line in handle:
        line = raw_line.strip()
        if line == "":
            continue
        if line[:4] == "Para":
            record.parameters = {}
        elif line[0] == "#":
            record.sequences.append(line.split("\t")[1])
        elif "=" in line:
            par_name, par_value = line.split("=")
            record.parameters[par_name.strip()] = par_value.strip()
        elif line[:5] == "Input":
            record.sequences = []
        elif line[:5] == "Motif":
            words = line.split()
            assert words[0] == "Motif"
            number = int(words[1])
            instances = []
        elif line[:3] == "MAP":
            alphabet = IUPAC.unambiguous_dna
            instances = Instances(instances, alphabet)
            motif = Motif(alphabet, instances)
            motif.score = float(line.split()[-1])
            motif.number = number
            motif.mask = mask
            record.append(motif)
        elif len(line.split("\t")) == 4:
            fields = line.split("\t")
            instances.append(Seq(fields[0], IUPAC.unambiguous_dna))
        elif "*" in line:
            mask = line.strip("\r\n")
        else:
            raise ValueError(line)
    return record
def degenerate_consensus(self):
    """Return the consensus as a Seq of degenerate (ambiguous) DNA codes.

    Following the rules adapted from
    D. R. Cavener: "Comparison of the consensus sequence flanking
    translational start sites in Drosophila and vertebrates."
    Nucleic Acids Research 15(4): 1353-1361. (1987).
    The same rules are used by TRANSFAC.
    """
    degenerate_nucleotide = {
        'A': 'A',
        'C': 'C',
        'G': 'G',
        'T': 'T',
        'AC': 'M',
        'AG': 'R',
        'AT': 'W',
        'CG': 'S',
        'CT': 'Y',
        'GT': 'K',
        'ACG': 'V',
        'ACT': 'H',
        'AGT': 'D',
        'CGT': 'B',
        'ACGT': 'N',
    }
    codes = []
    for i in range(self.length):
        # nucleotides ordered from the highest to the lowest count
        ranked = sorted(self, key=lambda letter: self[letter][i],
                        reverse=True)
        counts = [self[letter][i] for letter in ranked]
        total = sum(counts)
        # Follow the Cavener rules:
        if counts[0] >= sum(counts[1:]) and counts[0] >= 2 * counts[1]:
            key = ranked[0]
        elif 4 * sum(counts[:2]) > 3 * total:
            key = "".join(sorted(ranked[:2]))
        elif counts[3] == 0:
            key = "".join(sorted(ranked[:3]))
        else:
            key = "ACGT"
        codes.append(degenerate_nucleotide[key])
    return Seq("".join(codes), alphabet=IUPAC.ambiguous_dna)
def _set_seq(self, seq, seq_type):
    """Check and normalise the given sequence for attribute setting.

    Arguments:
    seq -- String or SeqRecord to check
    seq_type -- String of sequence type, must be 'hit' or 'query'

    Returns None for a None input.  Otherwise returns a SeqRecord whose
    id, name, description and features come from the matching
    attributes of this object and whose alphabet is self.alphabet: a
    SeqRecord input is updated in place, a string input is wrapped in a
    new SeqRecord.

    Raises TypeError for any other input type and ValueError when the
    length differs from the already stored opposite sequence.
    """
    assert seq_type in ('hit', 'query')
    if seq is None:
        return seq  # nothing to check

    if not isinstance(seq, (basestring, SeqRecord)):
        raise TypeError("%s sequence must be a string or a SeqRecord"
                        " object." % seq_type)

    # the length has to match the opposite sequence, if there is one
    if seq_type == 'query':
        opp_type = 'hit'
    else:
        opp_type = 'query'
    opp_seq = getattr(self, '_%s' % opp_type, None)
    if opp_seq is not None and len(seq) != len(opp_seq):
        raise ValueError("Sequence lengths do not match. Expected: "
                         "%r (%s); found: %r (%s)."
                         % (len(opp_seq), opp_type, len(seq), seq_type))

    seq_id = getattr(self, '%s_id' % seq_type)
    seq_desc = getattr(self, '%s_description' % seq_type)
    seq_feats = getattr(self, '%s_features' % seq_type)
    seq_name = 'aligned %s sequence' % seq_type

    if isinstance(seq, SeqRecord):
        seq.id = seq_id
        seq.description = seq_desc
        seq.name = seq_name
        seq.features = seq_feats
        seq.seq.alphabet = self.alphabet
    elif isinstance(seq, basestring):
        seq = SeqRecord(Seq(seq, self.alphabet), id=seq_id,
                        name=seq_name, description=seq_desc,
                        features=seq_feats)

    return seq
def add_sequence(self, descriptor, sequence, start=None, end=None,
                 weight=1.0):
    """Add a sequence to the alignment (DEPRECATED).

    The start, end, and weight arguments are not supported! This method
    only provides limited backwards compatibility with the old
    Bio.Align.Generic.Alignment object. Please use the append method with
    a SeqRecord instead, since add_sequence is likely to be removed in a
    future release of Biopython.

    A deprecation warning is always issued.  The sequence is appended as
    a SeqRecord whose id and description are both the descriptor.

    Raises ValueError if start, end or weight differ from their defaults.
    """
    import warnings
    import Bio
    warnings.warn("The start, end, and weight arguments are not supported! This method only provides limited backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the append method with a SeqRecord instead, as the add_sequence method is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning)
    # Should we handle start/end/strand information somehow? What for?
    # TODO - Should we handle weights somehow? See also AlignInfo code...
    if start is not None or end is not None or weight != 1.0:
        # Message fixed: the literals used to join into "Thestart" and
        # contained the misspellings "compatibily" and "add_Sequence".
        raise ValueError("The add_sequence method is obsolete, and only "
                         "provides limited backwards compatibility. The "
                         "start, end and weight arguments are not "
                         "supported.")
    self.append(SeqRecord(Seq(sequence, self._alphabet),
                          id=descriptor, description=descriptor))
def _get_motif_dict(self, seq_records, motif_size):
    """Return a dictionary with information on motifs.

    This internal function essentially does all of the hard work for
    finding motifs, and returns a dictionary containing the found motifs
    and their counts. This is internal so it can be reused by
    find_motif_differences.

    Every motif_size-long window of every sequence is handed to
    self._add_motif together with the dictionary collected so far.  When
    self.alphabet_strict is set, the alphabet of the first record is
    required of every record and a window is only counted if it passes
    _verify_alphabet.
    """
    if self.alphabet_strict:
        alphabet = seq_records[0].seq.alphabet
    else:
        alphabet = None
    strict = alphabet is not None

    all_motifs = {}
    for seq_record in seq_records:
        # if we are working with alphabets, make sure we are consistent
        if strict:
            assert seq_record.seq.alphabet == alphabet, \
                "Working with alphabet %s and got %s" % \
                (alphabet, seq_record.seq.alphabet)

        window_count = len(seq_record.seq) - (motif_size - 1)
        for start in range(window_count):
            motif = str(seq_record.seq[start:start + motif_size])

            # in strict mode skip windows outside of the alphabet
            if strict and not _verify_alphabet(Seq(motif, alphabet)):
                continue
            all_motifs = self._add_motif(all_motifs, motif)

    return all_motifs
def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False,
                     monoisotopic=False):
    """Calculates the molecular weight of a DNA, RNA or protein sequence.

    Only unambiguous letters are allowed. Nucleotide sequences are assumed to
    have a 5' phosphate.

    seq: String or Biopython sequence object.
    seq_type: The default (None) is to take the alphabet from the seq argument,
              or assume DNA if the seq argument is a string. Override this with
              a string 'DNA', 'RNA', or 'protein'.
    double_stranded: Calculate the mass for the double stranded molecule?
    circular: Is the molecule circular (has no ends)?
    monoisotopic: Use the monoisotopic mass tables?

    Note that for backwards compatibility, if the seq argument is a string,
    or Seq object with a generic alphabet, and no seq_type is specified
    (i.e. left as None), then DNA is assumed.

    >>> print("%0.2f" % molecular_weight("AGC"))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
    949.61

    However, it is better to be explicit - for example with strings:

    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
    949.61
    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
    997.61
    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
    249.29

    Or, with the sequence alphabet:

    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.Alphabet import generic_dna, generic_rna, generic_protein
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna)))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna)))
    997.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein)))
    249.29

    Also note that contradictory sequence alphabets and seq_type will also
    give an exception:

    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.Alphabet import generic_dna
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA"))
    Traceback (most recent call last):
      ...
    ValueError: seq_type='RNA' contradicts DNA from seq alphabet

    """
    # Rewritten by Markus Piotrowski, 2014

    # Find the alphabet type
    if isinstance(seq, (Seq, MutableSeq)):
        base_alphabet = Alphabet._get_base_alphabet(seq.alphabet)
        if isinstance(base_alphabet, Alphabet.DNAAlphabet):
            detected_type = 'DNA'
        elif isinstance(base_alphabet, Alphabet.RNAAlphabet):
            detected_type = 'RNA'
        elif isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            detected_type = 'protein'
        elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein):
            detected_type = 'protein'
            # Convert to one-letter sequence. Have to use a string for seq1
            seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet())
        elif not isinstance(base_alphabet, Alphabet.Alphabet):
            raise TypeError("%s is not a valid alphabet for mass calculations"
                            % base_alphabet)
        else:
            detected_type = "DNA"  # backward compatibity
        if seq_type and detected_type and detected_type != seq_type:
            raise ValueError("seq_type=%r contradicts %s from seq alphabet"
                             % (seq_type, detected_type))
        seq_type = detected_type
    elif isinstance(seq, str):
        if seq_type is None:
            seq_type = "DNA"  # backward compatibity
    else:
        raise TypeError("Expected a string or Seq object, not seq=%r" % seq)

    seq = ''.join(str(seq).split()).upper()  # Do the minimum formatting

    # Pick the mass table for the sequence type
    if seq_type == 'DNA':
        weight_table = (IUPACData.monoisotopic_unambiguous_dna_weights
                        if monoisotopic
                        else IUPACData.unambiguous_dna_weights)
    elif seq_type == 'RNA':
        weight_table = (IUPACData.monoisotopic_unambiguous_rna_weights
                        if monoisotopic
                        else IUPACData.unambiguous_rna_weights)
    elif seq_type == 'protein':
        weight_table = (IUPACData.monoisotopic_protein_weights
                        if monoisotopic
                        else IUPACData.protein_weights)
    else:
        raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r"
                         % seq_type)

    water = 18.010565 if monoisotopic else 18.0153

    def linear_mass(letters):
        """Sum of the letter masses minus one water per bond in a chain."""
        return (sum(weight_table[x] for x in letters)
                - (len(letters) - 1) * water)

    try:
        weight = linear_mass(seq)
        if circular:
            weight -= water
    except KeyError as e:
        raise ValueError('%s is not a valid unambiguous letter for %s'
                         % (e, seq_type))

    if double_stranded:
        if seq_type in ('DNA', 'RNA'):
            # add the mass of the complementary strand
            seq = str(Seq(seq).complement())
            weight += linear_mass(seq)
            if circular:
                weight -= water
        elif seq_type == 'protein':
            raise ValueError('double-stranded proteins await their discovery')

    return weight
def to_seqrecord(self):
    """Create a SeqRecord object from this Sequence instance.

    Items whose value is None are left out of every dictionary built
    here, including the keyword arguments given to SeqRecord.

    The seqrecord.annotations dictionary is packed like so::

        { # Sequence attributes with no SeqRecord equivalent:
          'id_ref':     self.id_ref,
          'id_source':  self.id_source,
          'location':   self.location,
          'uri':        { 'value': self.uri.value,
                          'desc': self.uri.desc,
                          'type': self.uri.type },
          # Sequence.annotations attribute (list of Annotations)
          'annotations': [{ 'ref':      ann.ref,
                            'source':   ann.source,
                            'evidence': ann.evidence,
                            'type':     ann.type,
                            'confidence': [ ann.confidence.value,
                                            ann.confidence.type ],
                            'properties': [{ 'value': prop.value,
                                             'ref': prop.ref,
                                             'applies_to': prop.applies_to,
                                             'datatype':   prop.datatype,
                                             'unit':       prop.unit,
                                             'id_ref':     prop.id_ref }
                                           for prop in ann.properties],
                          } for ann in self.annotations],
        }
    """
    def clean_dict(dct):
        """Remove None-valued items from a dictionary."""
        cleaned = {}
        for key, val in dct.items():
            if val is not None:
                cleaned[key] = val
        return cleaned

    def pack_property(prop):
        """Pack one annotation property into a plain dictionary."""
        return clean_dict({
            'value': prop.value,
            'ref': prop.ref,
            'applies_to': prop.applies_to,
            'datatype': prop.datatype,
            'unit': prop.unit,
            'id_ref': prop.id_ref,
        })

    def pack_annotation(ann):
        """Pack one Annotation into a plain dictionary."""
        return clean_dict({
            'ref': ann.ref,
            'source': ann.source,
            'evidence': ann.evidence,
            'type': ann.type,
            'confidence': ann.confidence and [ann.confidence.value,
                                              ann.confidence.type],
            'properties': [pack_property(prop) for prop in ann.properties],
        })

    sequence = Seq(self.mol_seq.value, self.get_alphabet())
    record_fields = clean_dict({
        'id': str(self.accession),
        'name': self.symbol,
        'description': self.name,
        # 'dbxrefs': None,
    })
    seqrec = SeqRecord(sequence, **record_fields)

    if self.domain_architecture:
        seqrec.features = [dom.to_seqfeature()
                           for dom in self.domain_architecture.domains]

    # Sequence attributes with no SeqRecord equivalent
    seqrec.annotations = clean_dict({
        'id_ref': self.id_ref,
        'id_source': self.id_source,
        'location': self.location,
        'uri': self.uri and clean_dict({
            'value': self.uri.value,
            'desc': self.uri.desc,
            'type': self.uri.type,
        }),
        'annotations': self.annotations and [
            pack_annotation(ann) for ann in self.annotations
        ],
    })
    return seqrec
def __next__(self):
    """Parse and return the next alignment read from ``self.handle``.

    The header line is taken from ``self._header`` when a previous call
    stashed one there (concatenated alignments), otherwise it is read from
    the handle.  The first block of sequence lines defines the identifiers
    and the column range of the sequence data; every following block must
    list the same identifiers in the same order and is appended to the
    sequences collected so far.  An optional consensus line (a line
    starting with a space) is accumulated alongside.

    Returns:
        A MultipleSeqAlignment holding one SeqRecord per identifier, in
        file order.  If a version number was found in the header it is
        stored as ``alignment._version``; a consensus, if present, is
        stored as ``alignment._star_info``.

    Raises:
        StopIteration: at end of file, when the header is followed by
            nothing but blank lines, or when no sequence data was found.
        ValueError: for an unknown (or blank) header line, a line that
            does not have two or three fields, identifiers out of order,
            a bad or inconsistent trailing letter count, or a record count
            that differs from ``self.records_per_alignment``.
        AssertionError: for layout inconsistencies checked with ``assert``
            (column drift, unequal sequence lengths, stray text around the
            consensus).
    """
    handle = self.handle
    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        raise StopIteration

    # Whitelisted headers we know about
    known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS']
    header_words = line.split()
    # A whitespace-only line has no words at all; report it through the
    # same ValueError as any other unknown header instead of letting an
    # IndexError escape from header_words[0].
    program = header_words[0] if header_words else ""
    if program not in known_headers:
        raise ValueError(
            "%s is not a known CLUSTAL header: %s"
            % (program, ", ".join(known_headers)))

    # find the clustal version in the header line
    version = None
    for word in header_words:
        if word[0] == '(' and word[-1] == ')':
            word = word[1:-1]
            # "()" leaves an empty string here - nothing to inspect.
            if word and word[0] in '0123456789':
                version = word
                break

    # There should be two blank lines after the header line.
    # readline() returns "" at end of file, which also strips to "", so
    # the loop must test for EOF explicitly or it would never terminate
    # on a file that ends right after the header.
    line = handle.readline()
    while line and line.strip() == "":
        line = handle.readline()
    if not line:
        # Header with no alignment data behind it: same outcome as the
        # "no sequences found" check at the end of this method.
        raise StopIteration

    # Identifiers and sequences are kept in two parallel lists, so entries
    # that share an identifier (not a good idea - but seems possible) stay
    # separate; later blocks are matched to them by position.
    ids = []
    seqs = []
    consensus = ""
    seq_cols = None  # Used to extract the consensus

    # Use the first block to get the sequence identifiers
    while True:
        if line[0] != " " and line.strip() != "":
            # Sequences identifier...
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            ids.append(fields[0])
            seqs.append(fields[1])

            # Record the sequence position to get the consensus
            if seq_cols is None:
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end
            assert fields[1] == line[seq_cols]

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                if len(fields[1].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)
        elif line[0] == " ":
            # Sequence consensus line...
            assert len(ids) == len(seqs)
            assert len(ids) > 0
            assert seq_cols is not None
            consensus = line[seq_cols]
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Check for blank line (or end of file)
            line = handle.readline()
            assert line.strip() == ""
            break
        else:
            # No consensus
            break
        line = handle.readline()
        if not line:
            break  # end of file

    assert line.strip() == ""
    assert seq_cols is not None

    # Confirm all same length
    for s in seqs:
        assert len(s) == len(seqs[0])
    if consensus:
        assert len(consensus) == len(seqs[0])

    # Loop over any remaining blocks...
    done = False
    while not done:
        # There should be a blank line between each block.
        # Also want to ignore any consensus line from the
        # previous block.
        while (not line) or line.strip() == "":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if line.split(None, 1)[0] in known_headers:
            # Found concatenated alignment.  Keep its header for the next
            # call, which will pick it up from self._header.
            done = True
            self._header = line
            break

        for i in range(len(ids)):
            assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % repr(line))

            if fields[0] != ids[i]:
                raise ValueError(
                    "Identifiers out of order? Got '%s' but expected '%s'"
                    % (fields[0], ids[i]))

            if fields[1] != line[seq_cols]:
                # The block is a different width (e.g. a shorter final
                # block); it must still start in the same column.
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                assert start == seq_cols.start, 'Old location %s -> %i:XX' % (
                    seq_cols, start)
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end

            # Append the sequence
            seqs[i] += fields[1]
            assert len(seqs[i]) == len(seqs[0])

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                # Here the count is cumulative over all blocks so far.
                if len(seqs[i].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)

            # Read in the next line
            line = handle.readline()
        # There should now be a consensus line
        if consensus:
            assert line[0] == " "
            assert seq_cols is not None
            consensus += line[seq_cols]
            assert len(consensus) == len(seqs[0])
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Read in the next line
            line = handle.readline()

    assert len(ids) == len(seqs)
    if len(seqs) == 0 or len(seqs[0]) == 0:
        raise StopIteration

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i)
               for (i, s) in zip(ids, seqs))
    alignment = MultipleSeqAlignment(records, self.alphabet)
    # TODO - Handle alignment annotation better, for now
    # mimic the old parser in Bio.Clustalw
    if version:
        alignment._version = version
    if consensus:
        alignment_length = len(seqs[0])
        assert len(consensus) == alignment_length, \
            "Alignment length is %i, consensus length is %i, '%s'" \
            % (alignment_length, len(consensus), consensus)
        alignment._star_info = consensus
    return alignment
class ProteinAnalysis(object):
    """Class containing methods for protein analysis.

    The constructor takes two arguments.

    The first is the protein sequence as a string.  It is upper-cased and
    wrapped in a Bio.Seq sequence object with the IUPAC protein alphabet.

    The second argument is optional. If set to True, the weight of the amino
    acids will be calculated using their monoisotopic mass (the weight of the
    most abundant isotopes for each element), instead of the average molecular
    mass (the averaged weight of all stable isotopes for each element).
    If set to False (the default value) or left out, the IUPAC average
    molecular mass will be used for the calculation.
    """

    def __init__(self, prot_sequence, monoisotopic=False):
        """Store the upper-cased sequence and reset the cached results."""
        # Upper-case unconditionally.  The residue letters this class looks
        # up ('YWF', 'VIYFWL', ...) are upper-case, so mixed-case input such
        # as "AcDe" has to be normalised too, not only all-lower-case input.
        self.sequence = Seq(prot_sequence.upper(), IUPAC.protein)
        # Caches filled lazily by count_amino_acids() and
        # get_amino_acids_percent().
        self.amino_acids_content = None
        self.amino_acids_percent = None
        self.length = len(self.sequence)
        self.monoisotopic = monoisotopic

    def count_amino_acids(self):
        """Count standard amino acids, returns a dict.

        Counts the number times each amino acid is in the protein
        sequence. Returns a dictionary {AminoAcid:Number}, with one key per
        letter of IUPACData.protein_letters.

        The return value is cached in self.amino_acids_content.
        It is not recalculated upon subsequent calls.
        """
        if self.amino_acids_content is None:
            self.amino_acids_content = dict(
                (aa, self.sequence.count(aa))
                for aa in IUPACData.protein_letters)

        return self.amino_acids_content

    def get_amino_acids_percent(self):
        """Calculate the relative amino acid content.

        The same as count_amino_acids, only each count is divided by the
        sequence length, so the values are fractions between 0 and 1 (not
        values out of 100).  Returns a dictionary of {AminoAcid:fraction}.

        The return value is cached in self.amino_acids_percent.

        Raises ZeroDivisionError for an empty sequence.
        """
        if self.amino_acids_percent is None:
            aa_counts = self.count_amino_acids()
            total = float(self.length)
            self.amino_acids_percent = dict(
                (aa, count / total) for aa, count in aa_counts.items())

        return self.amino_acids_percent

    def molecular_weight(self):
        """Calculate MW from Protein sequence.

        Uses the monoisotopic weight table when self.monoisotopic is true,
        the average weight table otherwise.  Raises KeyError for a residue
        letter that is missing from the selected table.
        """
        if self.monoisotopic:
            water = 18.01
            iupac_weights = IUPACData.monoisotopic_protein_weights
        else:
            water = 18.02
            iupac_weights = IUPACData.protein_weights

        # Local table of residue weights with one water already removed
        # from each amino acid, built once so the loop below is a plain sum.
        aa_weights = dict(
            (aa, weight - water) for aa, weight in iupac_weights.items())

        total_weight = water  # add just one water molecule for the whole sequence
        for aa in self.sequence:
            total_weight += aa_weights[aa]

        return total_weight

    def aromaticity(self):
        """Calculate the aromaticity according to Lobry, 1994.

        Calculates the aromaticity value of a protein according to Lobry, 1994.
        It is simply the relative frequency of Phe+Trp+Tyr.
        """
        aromatic_aas = 'YWF'
        aa_percentages = self.get_amino_acids_percent()

        return sum(aa_percentages[aa] for aa in aromatic_aas)

    def instability_index(self):
        """Calculate the instability index according to Guruprasad et al 1990.

        Implementation of the method of Guruprasad et al. 1990 to test a
        protein for stability. Any value above 40 means the protein is unstable
        (has a short half life).

        See: Guruprasad K., Reddy B.V.B., Pandit M.W.
        Protein Engineering 4:155-161(1990).

        Raises ZeroDivisionError for an empty sequence.
        """
        index = ProtParamData.DIWV
        score = 0.0

        # Sum the table value of every pair of adjacent residues.
        for i in range(self.length - 1):
            first, second = self.sequence[i:i+2]
            score += index[first][second]

        return (10.0 / self.length) * score

    def flexibility(self):
        """Calculate the flexibility according to Vihinen, 1994.

        No argument to change window size because parameters are specific for
        a window=9. The parameters used are optimized for determining the
        flexibility.

        Returns a list with one score per 9-residue window, i.e.
        ``length - 8`` values (an empty list for sequences shorter than 9).
        """
        flexibilities = ProtParamData.Flex
        window_size = 9
        weights = [0.25, 0.4375, 0.625, 0.8125, 1]
        half = window_size // 2  # index of the central residue (4)
        scores = []

        # "+ 1" so that the window ending on the last residue is scored too.
        for i in range(self.length - window_size + 1):
            subsequence = self.sequence[i:i+window_size]
            score = 0.0

            # Pair residues from the outside in: (0, 8), (1, 7), (2, 6), (3, 5).
            for j in range(half):
                front = subsequence[j]
                back = subsequence[window_size - j - 1]
                score += (flexibilities[front] + flexibilities[back]) * weights[j]

            # The centre of the window is index 4 and carries the last
            # weight (1).  Index 5 is already covered by the pair (3, 5).
            middle = subsequence[half]
            score += flexibilities[middle] * weights[half]

            # 5.25 is the sum of all nine weights: 2 * 2.125 + 1.
            scores.append(score / 5.25)

        return scores

    def gravy(self):
        """Calculate the gravy according to Kyte and Doolittle.

        Raises ZeroDivisionError for an empty sequence.
        """
        total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence)

        return total_gravy / self.length

    def _weight_list(self, window, edge):
        """Makes a list of relative weight of the
        window edges compared to the window center.

        The weights are linear. Only one half of the window is generated,
        from the outermost position inwards, without the centre.
        For a window of size 9 and edge 0.4 you get a list of
        [0.4, 0.55, 0.7, 0.85].  A window with no edge positions
        (size 1) gives an empty list.
        """
        half = window // 2
        if half == 0:
            # Nothing but the centre: avoids dividing by window - 1 == 0.
            return []

        unit = 2 * (1.0 - edge) / (window - 1)
        return [edge + unit * i for i in range(half)]

    def protein_scale(self, param_dict, window, edge=1.0):
        """Compute a profile by any amino acid scale.

        An amino acid scale is defined by a numerical value assigned to each type of
        amino acid. The most frequently used scales are the hydrophobicity or
        hydrophilicity scales and the secondary structure conformational parameters
        scales, but many other scales exist which are based on different chemical and
        physical properties of the amino acids.  You can set several parameters that
        control the computation  of a scale profile, such as the window size and the
        window edge relative weight value.

        WindowSize: The window size is the length of the interval to use for
        the profile computation. For a window size n, the score for residue i
        is computed from residue i and the (n-1)/2 neighboring residues on each
        side of it. The score is the sum of the scaled values for these amino
        acids, optionally weighted according to their position in the window,
        divided by the sum of the weights.

        Edge: The central amino acid of the window always has a weight of 1.
        By default, the amino acids at the remaining window positions have the
        same weight, but you can make the residue at the center of the window
        have a larger weight than the others by setting the edge value for the
        residues at the beginning and end of the interval to a value between
        0 and 1. For instance, for Edge=0.4 and a window size of 5 the weights
        will be: 0.4, 0.7, 1.0, 0.7, 0.4.

        The method returns a list of values, one per window position
        (``length - window + 1`` of them), which can be plotted to view the
        change along a protein sequence.  Residues missing from param_dict
        contribute nothing to their window and a warning is written to
        sys.stderr.  Many scales exist. Just add your favorites to the
        ProtParamData modules.

        NOTE: the window is expected to be odd. With an even window the
        residue at index window // 2 is counted both as an edge residue and
        as the middle one.

        Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl
        """
        # _weight_list returns only one tail. If the list should be
        # [0.4,0.7,1.0,0.7,0.4] what you actually get is [0.4,0.7].
        # The mirrored half and the centre are handled in the loop.
        weights = self._weight_list(window, edge)
        half = window // 2
        scores = []

        # the score in each Window is divided by the sum of weights
        # (* 2 + 1) since the weight list is one sided:
        sum_of_weights = sum(weights) * 2 + 1

        for i in range(self.length - window + 1):
            subsequence = self.sequence[i:i+window]
            score = 0.0

            for j in range(half):
                # walk from the outside of the Window towards the middle.
                # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid
                try:
                    front = param_dict[subsequence[j]]
                    back = param_dict[subsequence[window - j - 1]]
                    score += weights[j] * front + weights[j] * back
                except KeyError:
                    sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' %
                             (subsequence[j], subsequence[window - j - 1]))

            # Now add the middle value, which always has a weight of 1.
            middle = subsequence[half]
            if middle in param_dict:
                score += param_dict[middle]
            else:
                sys.stderr.write('warning: %s  is not a standard amino acid.\n' % (middle))

            scores.append(score / sum_of_weights)

        return scores

    def isoelectric_point(self):
        """Calculate the isoelectric point.

        Uses the module IsoelectricPoint to calculate the pI of a protein.
        """
        aa_content = self.count_amino_acids()

        ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
        return ie_point.pi()

    def secondary_structure_fraction(self):
        """Calculate fraction of helix, turn and sheet.

        Returns the fraction of amino acids which tend
        to be in Helix, Turn or Sheet.

        Amino acids in helix: V, I, Y, F, W, L.
        Amino acids in Turn: N, P, G, S.
        Amino acids in sheet: E, M, A, L.

        Returns a tuple of three floats (Helix, Turn, Sheet), each a
        fraction of the sequence length. L is counted in both Helix and
        Sheet.
        """
        aa_percentages = self.get_amino_acids_percent()

        helix = sum(aa_percentages[r] for r in 'VIYFWL')
        turn = sum(aa_percentages[r] for r in 'NPGS')
        sheet = sum(aa_percentages[r] for r in 'EMAL')

        return helix, turn, sheet