def _write_seq(self, record):
    """Emit the sequence of ``record`` as a single SeqXML element.

    The element is named ``RNAseq``, ``DNAseq`` or ``AAseq`` depending on
    the record's base alphabet (the alphabet underneath any Gapped or
    StopCodon encoding).

    Raises TypeError if the sequence is an UnknownSeq, and ValueError if
    the sequence is empty or its alphabet is not DNA, RNA or protein.
    """
    if isinstance(record.seq, UnknownSeq):
        raise TypeError(
            "Sequence type is UnknownSeq but SeqXML requires sequence")

    text = str(record.seq)
    if len(text) <= 0:
        raise ValueError("The sequence length should be greater than 0")

    # Look underneath any Gapped or StopCodon encoding.
    base = Alphabet._get_base_alphabet(record.seq.alphabet)

    # Probed in the same order as always: RNA, then DNA, then protein.
    candidates = (
        (Alphabet.RNAAlphabet, "RNAseq"),
        (Alphabet.DNAAlphabet, "DNAseq"),
        (Alphabet.ProteinAlphabet, "AAseq"),
    )
    for alphabet_class, tag in candidates:
        if isinstance(base, alphabet_class):
            break
    else:
        raise ValueError("Need a DNA, RNA or Protein alphabet")

    self.xml_generator.startElement(tag, AttributesImpl({}))
    self.xml_generator.characters(text)
    self.xml_generator.endElement(tag)
def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
    """Generator function to parse UniProt XML as SeqRecord objects.

    Parses one XML entry at a time from a UniProt XML file and yields the
    record built by ``Parser(...).parse()`` for every ``entry`` element.
    This generator can be used in Bio.SeqIO.

    Arguments:
     - handle - an object with a ``read`` method, or a string holding the
       XML itself (it is then wrapped in a StringIO).
     - alphabet - alphabet passed on to the Parser. Nucleotide alphabets,
       also when wrapped in a Gapped alphabet, are rejected.
     - return_raw_comments - if True, comment fields are returned as
       complete XML to allow further processing.

    Raises ValueError for a nucleotide alphabet, TypeError when ``handle``
    is neither readable nor a string, and MissingExternalDependencyError
    when no ElementTree implementation is available.
    """
    if isinstance(alphabet, Alphabet.NucleotideAlphabet):
        raise ValueError("Wrong alphabet %r" % alphabet)
    if isinstance(alphabet, Alphabet.Gapped) \
            and isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet):
        raise ValueError("Wrong alphabet %r" % alphabet)

    if not hasattr(handle, "read"):
        if isinstance(handle, str):
            handle = StringIO(handle)
        else:
            # TypeError is still an Exception, so existing callers that
            # catch Exception keep working.
            raise TypeError(
                'An XML-containing handler or an XML string must be passed')

    if ElementTree is None:
        from SAP.Bio import MissingExternalDependencyError
        raise MissingExternalDependencyError(
            "No ElementTree module was found. "
            "Use Python 2.5+, lxml or elementtree if you "
            "want to use Bio.SeqIO.UniprotIO.")

    for event, elem in ElementTree.iterparse(handle, events=("start", "end")):
        if event == "end" and elem.tag == NS + "entry":
            yield Parser(elem, alphabet=alphabet,
                         return_raw_comments=return_raw_comments).parse()
            # Release the children of the entry that was just converted.
            elem.clear()
def _guess_consensus_alphabet(self, ambiguous):
    """Pick an (ungapped) alphabet for an alignment consensus sequence.

    Starts from the un-gapped alignment alphabet, checks that every
    record's un-gapped alphabet is an instance of the same class, and
    then makes sure the ``ambiguous`` character can be represented:

     - if the alphabet has no ``letters`` or already contains
       ``ambiguous``, it is returned unchanged;
     - an unambiguous IUPAC DNA/RNA alphabet is widened to the ambiguous
       IUPAC alphabet when that contains ``ambiguous``, otherwise to the
       generic DNA/RNA alphabet;
     - an IUPAC protein alphabet is widened to the extended IUPAC protein
       alphabet when that contains ``ambiguous``, otherwise to the
       generic protein alphabet;
     - anything else falls back to the single letter alphabet.

    Raises ValueError if a record has an incompatible alphabet.
    """
    # Start with the (un-gapped version of) the alignment alphabet
    a = Alphabet._get_base_alphabet(self.alignment._alphabet)

    # Now check it is compatible with all the rest of the sequences
    expected_class = a.__class__
    for record in self.alignment:
        # Get the (un-gapped version of) the sequence's alphabet
        alt = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(alt, expected_class):
            raise ValueError("Alignment contains a sequence with "
                             "an incompatible alphabet.")

    # Nothing to do if the alphabet places no restriction on its letters
    # or the ambiguous character is already a valid letter.
    if not hasattr(a, "letters") or a.letters is None \
            or ambiguous in a.letters:
        return a

    # We need to pick a more generic alphabet. The candidate has to be
    # the *ambiguous* IUPAC alphabet: testing the unambiguous one again
    # could never succeed for the standard alphabets.
    if isinstance(a, IUPAC.IUPACUnambiguousDNA):
        if ambiguous in IUPAC.IUPACAmbiguousDNA().letters:
            return IUPAC.IUPACAmbiguousDNA()
        return Alphabet.generic_dna
    if isinstance(a, IUPAC.IUPACUnambiguousRNA):
        if ambiguous in IUPAC.IUPACAmbiguousRNA().letters:
            return IUPAC.IUPACAmbiguousRNA()
        return Alphabet.generic_rna
    if isinstance(a, IUPAC.IUPACProtein):
        if ambiguous in IUPAC.ExtendedIUPACProtein().letters:
            return IUPAC.ExtendedIUPACProtein()
        return Alphabet.generic_protein
    return Alphabet.single_letter_alphabet
def __init__(self, in_dict, dict_type, alphabet=None):
    """Build the table from a dictionary of counts or of frequencies.

    Arguments:
     - in_dict - the input dictionary.
     - dict_type - COUNT or FREQ. With COUNT, ``in_dict`` is kept as
       ``self.count`` and ``_freq_from_count()`` is called; with FREQ,
       ``self.count`` starts empty and ``in_dict`` is passed to
       ``self.update``. Any other value raises ValueError.
     - alphabet - alphabet to store. If none is given a new
       ``Alphabet.Alphabet()`` is created and its letters are taken from
       ``_alphabet_from_input()``.
    """
    self.alphabet = alphabet

    if dict_type == FREQ:
        self.count = {}
        self.update(in_dict)
    elif dict_type == COUNT:
        self.count = in_dict
        self._freq_from_count()
    else:
        raise ValueError("bad dict_type")

    if alphabet:
        return
    # No alphabet supplied: derive one from the data that was loaded.
    self.alphabet = Alphabet.Alphabet()
    self.alphabet.letters = self._alphabet_from_input()
def __init__(self, data=None, alphabet=None, mat_name='', build_later=0):
    """Set up the matrix, its alphabet and its bookkeeping attributes.

    Arguments:
     - data - the matrix itself. A false value (e.g. None) leaves the
       matrix empty; anything else is handed to ``self.update``.
     - alphabet - an instance of Bio.Alphabet, or a subclass. If not
       supplied a plain ``Alphabet.Alphabet()`` is used, and if the
       alphabet has no letters they are taken from the matrix itself.
     - mat_name - the matrix name, a string like "BLOSUM62" or "PAM250".
     - build_later - skip the matrix size assertion; the user will build
       the matrix after creating the instance, so it is initialised via
       ``_init_zero()``.
    """
    assert isinstance(mat_name, str)

    # Mutable values are created here rather than as parameter defaults,
    # because defaults are evaluated once, when the function is defined.
    if data:
        try:
            self.update(data)
        except ValueError:
            raise ValueError("Failed to store data in a dictionary")

    if alphabet is None:
        alphabet = Alphabet.Alphabet()
    assert Alphabet.generic_alphabet.contains(alphabet)
    self.alphabet = alphabet

    # If the passed alphabet is empty, use the letters in the matrix itself
    if not self.alphabet.letters:
        self._alphabet_from_matrix()

    # Assert matrix size: either a full (N*N) or a half (N*(N+1)/2) matrix
    if not build_later:
        size = len(self.alphabet.letters)
        assert len(self) in (size ** 2, size * (size + 1) // 2)

    self.ab_list = sorted(self.alphabet.letters)
    self.mat_name = mat_name

    if build_later:
        self._init_zero()
    else:
        # Convert full to half
        self._full_to_half()
        self._correct_matrix()

    self.sum_letters = {}
    self.relative_entropy = 0
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment).

    One row is built per key found in the first alignment position, by
    collecting the ``aa`` attribute stored under that key at every
    position ``1 .. len(align_dict)``. Rows are added in sorted key order
    and are identified by ``pdb2 + chain2`` of the matching ``sum_dict``
    entry. The alignment uses a gapped extended IUPAC protein alphabet.
    """
    # The keys of the first position define the rows of the alignment.
    rows = {}
    for key in align_dict.abs(1).pos_align_dict:
        rows[key] = []

    for position in range(1, len(align_dict) + 1):  # loop on positions
        # Look the position up once instead of once per key.
        column = align_dict.abs(position).pos_align_dict
        for key in column:  # loop within a position
            rows[key].append(column[key].aa)

    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for key in sorted(rows):
        # Join once per row rather than growing a string per residue.
        fssp_align.append(
            SeqRecord(Seq("".join(rows[key]), alpha),
                      sum_dict[key].pdb2 + sum_dict[key].chain2))
    return fssp_align
def _classify_alphabet_for_nexus(self, alphabet):
    """Return 'protein', 'dna' or 'rna' based on the alphabet (PRIVATE).

    The decision is made on the base alphabet (underneath any Gapped or
    StopCodon encoding).

    Raises TypeError if the base alphabet is not an Alphabet instance,
    and ValueError if it is not a protein, DNA or RNA alphabet (for
    example NucleotideAlphabet or the generic Alphabet, which is the
    default for fasta files).
    """
    base = Alphabet._get_base_alphabet(alphabet)
    if not isinstance(base, Alphabet.Alphabet):
        raise TypeError("Invalid alphabet")

    # Probed in order: protein, DNA, RNA.
    labels = (
        (Alphabet.ProteinAlphabet, "protein"),
        (Alphabet.DNAAlphabet, "dna"),
        (Alphabet.RNAAlphabet, "rna"),
    )
    for alphabet_class, label in labels:
        if isinstance(base, alphabet_class):
            return label

    raise ValueError("Need a DNA, RNA or Protein alphabet")
def _append(self, record, expected_length=None):
    """Validate ``record`` and add it to the alignment (PRIVATE).

    get_alignment_length() is currently expensive, so __init__ and extend
    work it out once and pass it in as ``expected_length`` rather than
    having it recomputed for every record. When ``expected_length`` is
    None the length is not checked.

    Raises TypeError if ``record`` is not a SeqRecord, and ValueError if
    its length or its alphabet does not fit the alignment.
    """
    if not isinstance(record, SeqRecord):
        raise TypeError("New sequence is not a SeqRecord object")

    length_is_fixed = expected_length is not None
    if length_is_fixed and len(record) != expected_length:
        # TODO - report the expected length in this message (the unit
        # tests would need updating).
        raise ValueError("Sequences must all be the same length")

    # self.alphabet.contains(...) does not yet cope with AlphabetEncoders
    # (e.g. gapped versus ungapped), hence the type compatibility check.
    alphabets = [self._alphabet, record.seq.alphabet]
    if not Alphabet._check_type_compatible(alphabets):
        raise ValueError("New sequence's alphabet is incompatible")

    self._records.append(record)
def _append(self, record, expected_length=None):
    """Check a record and store it in the alignment (PRIVATE).

    The caller (__init__ or extend) supplies ``expected_length`` so that
    the expensive get_alignment_length() call is not repeated for each
    record; passing None skips the length check.

    Raises TypeError if ``record`` is not a SeqRecord, and ValueError if
    its length or its alphabet does not fit the alignment.
    """
    if not isinstance(record, SeqRecord):
        raise TypeError("New sequence is not a SeqRecord object")

    if expected_length is not None:
        if len(record) != expected_length:
            # TODO - mention the expected length in this message once
            # the unit tests have been updated.
            raise ValueError("Sequences must all be the same length")

    # A plain self.alphabet.contains(...) test still needs fixing for
    # AlphabetEncoders (e.g. gapped versus ungapped).
    compatible = Alphabet._check_type_compatible(
        [self._alphabet, record.seq.alphabet])
    if not compatible:
        raise ValueError("New sequence's alphabet is incompatible")

    self._records.append(record)
def get_alphabet(self):
    """Return the alphabet registered for ``self.type``.

    Falls back to the generic alphabet when the type has no entry in
    ``self.alphabets``. The alphabet is wrapped in ``Alphabet.Gapped``
    when there is a ``mol_seq`` and it is flagged as aligned.
    """
    base = self.alphabets.get(self.type, Alphabet.generic_alphabet)
    is_aligned = self.mol_seq and self.mol_seq.is_aligned
    return Alphabet.Gapped(base) if is_aligned else base
def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False,
                     monoisotopic=False):
    """Calculates the molecular weight of a DNA, RNA or protein sequence.

    Only unambiguous letters are allowed. Nucleotide sequences are assumed to
    have a 5' phosphate.

    seq: String or Biopython sequence object.
    seq_type: The default (None) is to take the alphabet from the seq argument,
              or assume DNA if the seq argument is a string. Override this with
              a string 'DNA', 'RNA', or 'protein'.
    double_stranded: Calculate the mass for the double stranded molecule?
    circular: Is the molecule circular (has no ends)?
    monoisotopic: Use the monoisotopic mass tables?

    Raises TypeError if seq is neither a string nor a Seq/MutableSeq, or if
    its alphabet is not a valid alphabet. Raises ValueError for an unknown or
    contradictory seq_type, for a letter missing from the weight table, and
    for a double stranded protein.

    Note that for backwards compatibility, if the seq argument is a string,
    or Seq object with a generic alphabet, and no seq_type is specified
    (i.e. left as None), then DNA is assumed. If a seq_type is given for a
    Seq object with a generic alphabet, that seq_type is used.

    >>> print("%0.2f" % molecular_weight("AGC"))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
    949.61

    However, it is better to be explicit - for example with strings:

    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
    949.61
    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
    997.61
    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
    249.29

    Or, with the sequence alphabet:

    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.Alphabet import generic_dna, generic_rna, generic_protein
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna)))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna)))
    997.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein)))
    249.29

    Also note that contradictory sequence alphabets and seq_type will also
    give an exception:

    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.Alphabet import generic_dna
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA"))
    Traceback (most recent call last):
      ...
    ValueError: seq_type='RNA' contradicts DNA from seq alphabet

    """
    # Rewritten by Markus Piotrowski, 2014

    # Find the alphabet type
    if isinstance(seq, (Seq, MutableSeq)):
        # Stays empty when the alphabet does not say what the sequence is.
        alphabet_type = ''
        base_alphabet = Alphabet._get_base_alphabet(seq.alphabet)
        if isinstance(base_alphabet, Alphabet.DNAAlphabet):
            alphabet_type = 'DNA'
        elif isinstance(base_alphabet, Alphabet.RNAAlphabet):
            alphabet_type = 'RNA'
        elif isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            alphabet_type = 'protein'
        elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein):
            alphabet_type = 'protein'
            # Convert to one-letter sequence. Have to use a string for seq1
            seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet())
        elif not isinstance(base_alphabet, Alphabet.Alphabet):
            raise TypeError("%s is not a valid alphabet for mass calculations"
                            % base_alphabet)

        if alphabet_type:
            if seq_type and alphabet_type != seq_type:
                raise ValueError("seq_type=%r contradicts %s from seq alphabet"
                                 % (seq_type, alphabet_type))
            seq_type = alphabet_type
        elif not seq_type:
            # Generic alphabet and nothing requested: backward compatibility.
            # An explicit seq_type is honoured instead of being reported as
            # contradicting an assumed DNA alphabet.
            seq_type = "DNA"
    elif isinstance(seq, str):
        if seq_type is None:
            seq_type = "DNA"  # backward compatibility
    else:
        raise TypeError("Expected a string or Seq object, not seq=%r" % seq)

    seq = ''.join(str(seq).split()).upper()  # Do the minimum formatting

    if seq_type == 'DNA':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_unambiguous_dna_weights
        else:
            weight_table = IUPACData.unambiguous_dna_weights
    elif seq_type == 'RNA':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_unambiguous_rna_weights
        else:
            weight_table = IUPACData.unambiguous_rna_weights
    elif seq_type == 'protein':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_protein_weights
        else:
            weight_table = IUPACData.protein_weights
    else:
        raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r"
                         % seq_type)

    if monoisotopic:
        water = 18.010565
    else:
        water = 18.0153

    # One water is lost per bond between residues; closing a circular
    # molecule loses one more.
    try:
        weight = sum(weight_table[x] for x in seq) - (len(seq) - 1) * water
    except KeyError as e:
        raise ValueError('%s is not a valid unambiguous letter for %s'
                         % (e, seq_type))
    if circular:
        weight -= water

    if double_stranded:
        if seq_type == 'protein':
            raise ValueError('double-stranded proteins await their discovery')
        # Complement with an explicit alphabet: a Seq without one guesses
        # DNA for RNA that has no U, giving T's the RNA table lacks.
        if seq_type == 'RNA':
            strand_alphabet = Alphabet.generic_rna
        else:
            strand_alphabet = Alphabet.generic_dna
        other = str(Seq(seq, strand_alphabet).complement())
        weight += sum(weight_table[x] for x in other) - (len(other) - 1) * water
        if circular:
            weight -= water

    return weight
def information_content(self, start=0, end=None, e_freq_table=None,
                        log_base=2, chars_to_ignore=None):
    """Calculate the information content for each residue along an alignment.

    Arguments:
        o start, end - The starting and ending points to calculate the
        information content. These points should be relative to the first
        sequence in the alignment, starting at zero (ie. even if the 'real'
        first position in the seq is 203 in the initial sequence, for
        the info content, we need to use zero). This defaults to the entire
        length of the first sequence.
        o e_freq_table - A FreqTable object specifying the expected
        frequencies for each letter in the alphabet we are using (e.g.
        {'G' : 0.4, 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters
        should not be included, since these should not have expected
        frequencies. If omitted, random expected frequencies are chosen
        from the alignment alphabet (protein or nucleotide).
        o log_base - The base of the logarithm to use in calculating the
        information content. This defaults to 2 so the info is in bits.
        o chars_to_ignore - A listing of characters which should be ignored
        in calculating the info content. Defaults to no characters.

    Returns:
        o A number representing the info content for the specified region.
        The per-column values are also stored in self.ic_vector.

    Raises ValueError if start/end fall outside the first sequence, if
    e_freq_table is not a FreqTable, or if no table is given and the
    alignment alphabet is neither nucleotide nor protein.

    Please see the Biopython manual for more information on how information
    content is calculated.
    """
    # A fresh list per call; a list literal default would be shared.
    if chars_to_ignore is None:
        chars_to_ignore = []

    # if no end was specified, then we default to the end of the sequence
    seq_length = len(self.alignment._records[0].seq)
    if end is None:
        end = seq_length

    if start < 0 or end > seq_length:
        raise ValueError("Start (%s) and end (%s) are not in the range %s to %s"
                         % (start, end, 0, seq_length))

    # determine random expected frequencies, if necessary
    random_expected = None
    if not e_freq_table:
        # TODO - What about ambiguous alphabets?
        base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
        if isinstance(base_alpha, Alphabet.ProteinAlphabet):
            random_expected = Protein20Random
        elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
            random_expected = Nucleotide4Random
        else:
            errstr = "Error in alphabet: not Nucleotide or Protein, "
            errstr += "supply expected frequencies"
            raise ValueError(errstr)
    elif not isinstance(e_freq_table, FreqTable.FreqTable):
        raise ValueError("e_freq_table should be a FreqTable object")

    # determine all of the letters we have to deal with
    all_letters = self._get_all_letters()
    for char in chars_to_ignore:
        all_letters = all_letters.replace(char, '')

    info_content = {}
    for residue_num in range(start, end):
        freq_dict = self._get_letter_freqs(residue_num,
                                           self.alignment._records,
                                           all_letters, chars_to_ignore)
        column_score = self._get_column_info_content(freq_dict, e_freq_table,
                                                     log_base,
                                                     random_expected)
        info_content[residue_num] = column_score

    # sum up the score
    total_info = sum(info_content.values())

    # fill in the ic_vector member: holds IC for each column
    for i in info_content:
        self.ic_vector[i] = info_content[i]
    return total_info
def __init__(self, records, alphabet=None, annotations=None):
    """Initialize a new MultipleSeqAlignment object.

    Arguments:
     - records - A list (or iterator) of SeqRecord objects, whose
                 sequences are all the same length. This may be an
                 empty list.
     - alphabet - The alphabet for the whole alignment, typically a gapped
                  alphabet, which should be a super-set of the individual
                  record alphabets. If omitted, a consensus alphabet is
                  used.
     - annotations - Information about the whole alignment (dictionary).

    Raises ValueError for an invalid records or alphabet argument, and
    TypeError if annotations is not a dictionary.

    You would normally load a MSA from a file using Bio.AlignIO, but you
    can do this from a list of SeqRecord objects too:

    >>> from SAP.Bio.Alphabet import generic_dna
    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.SeqRecord import SeqRecord
    >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
    >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
    >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
    >>> align = MultipleSeqAlignment([a, b, c], annotations={"tool": "demo"})
    >>> print(align)
    DNAAlphabet() alignment with 3 rows and 7 columns
    AAAACGT Alpha
    AAA-CGT Beta
    AAAAGGT Gamma
    >>> align.annotations
    {'tool': 'demo'}

    NOTE - The older Bio.Align.Generic.Alignment class only accepted a
    single argument, an alphabet.  This is still supported via a backwards
    compatible "hack" so as not to disrupt existing scripts and users, but
    is deprecated and will be removed in a future release.
    """
    alphabet_types = (Alphabet.Alphabet, Alphabet.AlphabetEncoder)

    if isinstance(records, alphabet_types):
        if alphabet is not None:
            raise ValueError("Invalid records argument")
        # TODO - Remove this backwards compatible mode!
        alphabet = records
        records = []
        import warnings
        from SAP.Bio import BiopythonDeprecationWarning
        warnings.warn("Invalid records argument: While the old "
                      "Bio.Align.Generic.Alignment class only "
                      "accepted a single argument (the alphabet), the "
                      "newer Bio.Align.MultipleSeqAlignment class "
                      "expects a list/iterator of SeqRecord objects "
                      "(which can be an empty list) and an optional "
                      "alphabet argument", BiopythonDeprecationWarning)

    if alphabet is None:
        # Default while we add sequences, will take a consensus later
        self._alphabet = Alphabet.single_letter_alphabet
    elif isinstance(alphabet, alphabet_types):
        self._alphabet = alphabet
    else:
        raise ValueError("Invalid alphabet argument")

    self._records = []
    if records:
        self.extend(records)
        if alphabet is None:
            # No alphabet was given, take a consensus alphabet
            self._alphabet = Alphabet._consensus_alphabet(
                rec.seq.alphabet for rec in self._records
                if rec.seq is not None)

    # Annotations about the whole alignment
    if annotations is None:
        annotations = {}
    elif not isinstance(annotations, dict):
        raise TypeError("annotations argument should be a dict")
    self.annotations = annotations
def __add__(self, other):
    """Combine two alignments with the same number of rows by adding them.

    If you have two multiple sequence alignments (MSAs), there are two ways
    to think about adding them - by row or by column. Using the extend
    method adds by row. Using the addition operator adds by column. For
    example,

    >>> from SAP.Bio.Alphabet import generic_dna
    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.SeqRecord import SeqRecord
    >>> from SAP.Bio.Align import MultipleSeqAlignment
    >>> a1 = SeqRecord(Seq("AAAAC", generic_dna), id="Alpha")
    >>> b1 = SeqRecord(Seq("AAA-C", generic_dna), id="Beta")
    >>> c1 = SeqRecord(Seq("AAAAG", generic_dna), id="Gamma")
    >>> a2 = SeqRecord(Seq("GT", generic_dna), id="Alpha")
    >>> b2 = SeqRecord(Seq("GT", generic_dna), id="Beta")
    >>> c2 = SeqRecord(Seq("GT", generic_dna), id="Gamma")
    >>> left = MultipleSeqAlignment([a1, b1, c1],
    ...                             annotations={"tool": "demo", "name": "start"})
    >>> right = MultipleSeqAlignment([a2, b2, c2],
    ...                             annotations={"tool": "demo", "name": "end"})

    Now, let's look at these two alignments:

    >>> print(left)
    DNAAlphabet() alignment with 3 rows and 5 columns
    AAAAC Alpha
    AAA-C Beta
    AAAAG Gamma
    >>> print(right)
    DNAAlphabet() alignment with 3 rows and 2 columns
    GT Alpha
    GT Beta
    GT Gamma

    And add them:

    >>> combined = left + right
    >>> print(combined)
    DNAAlphabet() alignment with 3 rows and 7 columns
    AAAACGT Alpha
    AAA-CGT Beta
    AAAAGGT Gamma

    For this to work, both alignments must have the same number of records
    (here they both have 3 rows):

    >>> len(left)
    3
    >>> len(right)
    3
    >>> len(combined)
    3

    The individual rows are SeqRecord objects, and these can be added
    together. Refer to the SeqRecord documentation for details of how the
    annotation is handled. This example is a special case in that both
    original alignments shared the same names, meaning when the rows are
    added they also get the same name.

    Any common annotations are preserved, but differing annotation is lost.
    This is the same behaviour used in the SeqRecord annotations and is
    designed to prevent accidental propagation of inappropriate values:

    >>> combined.annotations
    {'tool': 'demo'}

    Raises NotImplementedError if ``other`` is not a MultipleSeqAlignment,
    and ValueError if the two alignments differ in their number of rows.
    """
    if not isinstance(other, MultipleSeqAlignment):
        raise NotImplementedError
    if len(self) != len(other):
        raise ValueError("When adding two alignments they must have the same length"
                         " (i.e. same number or rows)")

    consensus = Alphabet._consensus_alphabet([self._alphabet, other._alphabet])

    # Rows are joined pairwise, lazily, as the new alignment consumes them.
    joined_rows = (mine + theirs for mine, theirs in zip(self, other))

    # Keep only the annotation both alignments agree on.
    shared = dict(
        (key, value) for key, value in self.annotations.items()
        if key in other.annotations and other.annotations[key] == value)

    return MultipleSeqAlignment(joined_rows, consensus, shared)
def __init__(self, records, alphabet=None, annotations=None):
    """Initialize a new MultipleSeqAlignment object.

    Arguments:
     - records - A list (or iterator) of SeqRecord objects, whose
                 sequences are all the same length. This may be an
                 empty list.
     - alphabet - The alphabet for the whole alignment, typically a gapped
                  alphabet, which should be a super-set of the individual
                  record alphabets. If omitted, a consensus alphabet is
                  used.
     - annotations - Information about the whole alignment (dictionary).

    Raises ValueError for an invalid records or alphabet argument, and
    TypeError if annotations is not a dictionary.

    You would normally load a MSA from a file using Bio.AlignIO, but you
    can do this from a list of SeqRecord objects too:

    >>> from SAP.Bio.Alphabet import generic_dna
    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.SeqRecord import SeqRecord
    >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
    >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
    >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
    >>> align = MultipleSeqAlignment([a, b, c], annotations={"tool": "demo"})
    >>> print(align)
    DNAAlphabet() alignment with 3 rows and 7 columns
    AAAACGT Alpha
    AAA-CGT Beta
    AAAAGGT Gamma
    >>> align.annotations
    {'tool': 'demo'}

    NOTE - The older Bio.Align.Generic.Alignment class only accepted a
    single argument, an alphabet.  This is still supported via a backwards
    compatible "hack" so as not to disrupt existing scripts and users, but
    is deprecated and will be removed in a future release.
    """
    valid_alphabet_types = (Alphabet.Alphabet, Alphabet.AlphabetEncoder)

    # Legacy call style: the first argument is really the alphabet.
    if isinstance(records, valid_alphabet_types):
        if alphabet is not None:
            raise ValueError("Invalid records argument")
        # TODO - Remove this backwards compatible mode!
        alphabet = records
        records = []
        import warnings
        from SAP.Bio import BiopythonDeprecationWarning
        warnings.warn(
            "Invalid records argument: While the old "
            "Bio.Align.Generic.Alignment class only "
            "accepted a single argument (the alphabet), the "
            "newer Bio.Align.MultipleSeqAlignment class "
            "expects a list/iterator of SeqRecord objects "
            "(which can be an empty list) and an optional "
            "alphabet argument",
            BiopythonDeprecationWarning,
        )

    use_consensus = alphabet is None
    if use_consensus:
        # Default while we add sequences, will take a consensus later
        self._alphabet = Alphabet.single_letter_alphabet
    elif isinstance(alphabet, valid_alphabet_types):
        self._alphabet = alphabet
    else:
        raise ValueError("Invalid alphabet argument")

    self._records = []
    if records:
        self.extend(records)
        if use_consensus:
            # No alphabet was given, take a consensus alphabet
            self._alphabet = Alphabet._consensus_alphabet(
                rec.seq.alphabet for rec in self._records
                if rec.seq is not None
            )

    # Annotations about the whole alignment
    if annotations is None:
        annotations = {}
    elif not isinstance(annotations, dict):
        raise TypeError("annotations argument should be a dict")
    self.annotations = annotations
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.

    Yields at most one SeqRecord per file, and nothing at all for an
    empty file.

    Arguments:
     - handle - seekable handle on the ABI data. If it has a ``mode``
       attribute, the file must have been opened in 'rb' mode.
     - alphabet - optional alphabet for the sequence; protein and RNA
       alphabets are rejected. If omitted, ambiguous DNA is used when the
       base calls contain any of 'KYWMRS', unambiguous DNA otherwise.
     - trim - if True, the record is passed through ``_abi_trim`` before
       being yielded.

    Raises ValueError for an unsuitable alphabet or file mode, and
    IOError if the data does not start with the ABIF marker.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # Handle an empty file gracefully. A plain return ends the
        # generator; raising StopIteration here would be turned into a
        # RuntimeError on Python 3.7+ (PEP 479).
        return
    if marker != _as_bytes('ABIF'):
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': '', }

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        # TODO - stop iterating once all desired tags have been extracted
        # (4 tags from _EXTRACT + 2 time tags from _SPCTAGS - 3,
        # and seq, qual, id)
        key = tag_name + str(tag_number)

        if key == 'PBAS2':
            # PBAS2 is base-called sequence
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        elif key == 'PCON2':
            # PCON2 is quality values of base-called sequence
            qual = [ord(val) for val in tag_data]
        elif key == 'SMPL1':
            # SMPL1 is sample id entered before sequencing run
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        elif key in _EXTRACT:
            # extract sequence annotation as defined in _EXTRACT
            annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # use the file name as SeqRecord.name if available (best effort, but
    # without swallowing KeyboardInterrupt/SystemExit)
    try:
        file_name = basename(handle.name).replace('.ab1', '')
    except Exception:
        file_name = ""

    record = SeqRecord(Seq(seq, alphabet),
                       id=sample_id, name=file_name,
                       description='',
                       annotations=annot,
                       letter_annotations={'phred_quality': qual})

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
def information_content(self, start=0, end=None, e_freq_table=None,
                        log_base=2, chars_to_ignore=None):
    """Calculate the information content for each residue along an alignment.

    Arguments:
        o start, end - The starting and ending points to calculate the
        information content. These points should be relative to the first
        sequence in the alignment, starting at zero (ie. even if the 'real'
        first position in the seq is 203 in the initial sequence, for
        the info content, we need to use zero). This defaults to the entire
        length of the first sequence.
        o e_freq_table - A FreqTable object specifying the expected
        frequencies for each letter in the alphabet we are using (e.g.
        {'G' : 0.4, 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters
        should not be included, since these should not have expected
        frequencies. If omitted, random expected frequencies are chosen
        from the alignment alphabet (protein or nucleotide).
        o log_base - The base of the logarithm to use in calculating the
        information content. This defaults to 2 so the info is in bits.
        o chars_to_ignore - A listing of characters which should be ignored
        in calculating the info content. Defaults to no characters.

    Returns:
        o A number representing the info content for the specified region.
        The per-column values are also stored in self.ic_vector.

    Raises ValueError if start/end fall outside the first sequence, if
    e_freq_table is not a FreqTable, or if no table is given and the
    alignment alphabet is neither nucleotide nor protein.

    Please see the Biopython manual for more information on how information
    content is calculated.
    """
    # Use a fresh list per call instead of a shared list literal default.
    if chars_to_ignore is None:
        chars_to_ignore = []

    # if no end was specified, then we default to the end of the sequence
    first_length = len(self.alignment._records[0].seq)
    if end is None:
        end = first_length

    if start < 0 or end > first_length:
        raise ValueError(
            "Start (%s) and end (%s) are not in the range %s to %s"
            % (start, end, 0, first_length))

    # determine random expected frequencies, if necessary
    random_expected = None
    if not e_freq_table:
        # TODO - What about ambiguous alphabets?
        base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
        if isinstance(base_alpha, Alphabet.ProteinAlphabet):
            random_expected = Protein20Random
        elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
            random_expected = Nucleotide4Random
        else:
            errstr = "Error in alphabet: not Nucleotide or Protein, "
            errstr += "supply expected frequencies"
            raise ValueError(errstr)
    elif not isinstance(e_freq_table, FreqTable.FreqTable):
        raise ValueError("e_freq_table should be a FreqTable object")

    # determine all of the letters we have to deal with
    all_letters = self._get_all_letters()
    for char in chars_to_ignore:
        all_letters = all_letters.replace(char, '')

    info_content = {}
    for residue_num in range(start, end):
        freq_dict = self._get_letter_freqs(residue_num,
                                           self.alignment._records,
                                           all_letters, chars_to_ignore)
        column_score = self._get_column_info_content(
            freq_dict, e_freq_table, log_base, random_expected)
        info_content[residue_num] = column_score

    # sum up the score
    total_info = sum(info_content.values())

    # fill in the ic_vector member: holds IC for each column
    for i in info_content:
        self.ic_vector[i] = info_content[i]
    return total_info
def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
    """Remember the XML element to parse and the parsing options.

    Arguments:
     - elem - the element to parse, stored as ``self.entry``.
     - alphabet - stored as ``self.alphabet``.
     - return_raw_comments - stored as ``self.return_raw_comments``.
    """
    # The three attributes are independent of one another.
    self.return_raw_comments = return_raw_comments
    self.alphabet = alphabet
    self.entry = elem
# Print the gap consensus of the current summary (using "N" as the
# ambiguous character) and the position specific score matrix along it.
consensus = summary.gap_consensus(ambiguous="N")
print(consensus)
print("")
print(
    summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                      axis_seq=consensus))
print("")
# We have a generic alphabet, without a declared gap char, so we must
# provide the frequencies and chars to ignore explicitly.
print(
    summary.information_content(e_freq_table=expected,
                                chars_to_ignore=['-']))
print("")

# Same kind of summary for a protein alignment whose alphabet declares
# both a gap character ("-") and a stop symbol ("*").
print("Trying a protein sequence with gaps and stops")
alpha = Alphabet.HasStopCodon(
    Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
a = Alignment(alpha)
a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
print(a)
# Ruler as wide as the alignment.
print("=" * a.get_alignment_length())
s = SummaryInfo(a)
c = s.dumb_consensus(ambiguous="X")
print(c)
c = s.gap_consensus(ambiguous="X")
print(c)
print("")
# Both the gap and the stop symbol are left out of the score matrix.
print(s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c))
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterate over an ABI file, yielding its record as a SeqRecord.

    Arguments:
     - handle - binary file-like object positioned anywhere; it is rewound
       with ``seek(0)`` before reading.
     - alphabet - optional alphabet for the sequence.  If None, it is set
       to ``ambiguous_dna`` when the base-called sequence contains any of
       the letters ``KYWMRS`` and to ``unambiguous_dna`` otherwise.
     - trim - if true, the record is passed through ``_abi_trim`` before
       being yielded.

    Yields a single SeqRecord whose id is the SMPL1 tag, whose name is the
    handle's file name without ".ab1" (or "" if unavailable), whose
    annotations come from the tags listed in ``_EXTRACT`` plus
    ``run_start``/``run_finish``, and whose ``phred_quality`` letter
    annotation comes from the PCON2 tag.  An empty file yields nothing.

    Raises ValueError for a protein or RNA alphabet, or if the handle
    exposes a ``mode`` that is not 'rb'; raises IOError if the file does
    not start with the ABIF marker.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, "mode"):
        if set("rb") != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # Handle an empty file gracefully.  A plain return ends the
        # generator; raising StopIteration here would be turned into a
        # RuntimeError on Python 3.7+ (PEP 479).
        return
    if marker != _as_bytes("ABIF"):
        raise IOError("File should start ABIF, not %r" % marker)

    # dirty hack for handling time information
    times = {"RUND1": "", "RUND2": "", "RUNT1": "", "RUNT2": ""}

    # initialize annotations: every name in _EXTRACT starts as None
    annot = dict.fromkeys(_EXTRACT.values())

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        # TODO: stop iterating once all desired tags have been extracted
        # (4 tags from _EXTRACT + 2 time tags from _SPCTAGS - 3,
        # and seq, qual, id)
        key = tag_name + str(tag_number)

        if key == "PBAS2":
            # PBAS2 is base-called sequence
            seq = tag_data
            ambigs = "KYWMRS"
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        elif key == "PCON2":
            # PCON2 is quality values of base-called sequence
            qual = [ord(val) for val in tag_data]
        elif key == "SMPL1":
            # SMPL1 is sample id entered before sequencing run
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            # extract sequence annotation as defined in _EXTRACT
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot["run_start"] = "%s %s" % (times["RUND1"], times["RUNT1"])
    annot["run_finish"] = "%s %s" % (times["RUND2"], times["RUNT2"])

    # use the file name as SeqRecord.name if available; handles without a
    # usable string ``name`` attribute simply get an empty name
    try:
        file_name = basename(handle.name).replace(".ab1", "")
    except (AttributeError, TypeError):
        file_name = ""

    # NOTE: seq, sample_id and qual are only bound if the PBAS2, SMPL1 and
    # PCON2 tags were seen above; a file lacking one of them fails here
    # with an unbound-local NameError.
    record = SeqRecord(
        Seq(seq, alphabet),
        id=sample_id,
        name=file_name,
        description="",
        annotations=annot,
        letter_annotations={"phred_quality": qual},
    )

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
def __add__(self, other):
    """Combine two alignments with the same number of rows by adding them.

    If you have two multiple sequence alignments (MSAs), there are two ways
    to think about adding them - by row or by column. Using the extend
    method adds by row. Using the addition operator adds by column. For
    example,

    >>> from SAP.Bio.Alphabet import generic_dna
    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.SeqRecord import SeqRecord
    >>> from SAP.Bio.Align import MultipleSeqAlignment
    >>> a1 = SeqRecord(Seq("AAAAC", generic_dna), id="Alpha")
    >>> b1 = SeqRecord(Seq("AAA-C", generic_dna), id="Beta")
    >>> c1 = SeqRecord(Seq("AAAAG", generic_dna), id="Gamma")
    >>> a2 = SeqRecord(Seq("GT", generic_dna), id="Alpha")
    >>> b2 = SeqRecord(Seq("GT", generic_dna), id="Beta")
    >>> c2 = SeqRecord(Seq("GT", generic_dna), id="Gamma")
    >>> left = MultipleSeqAlignment([a1, b1, c1],
    ...                             annotations={"tool": "demo", "name": "start"})
    >>> right = MultipleSeqAlignment([a2, b2, c2],
    ...                             annotations={"tool": "demo", "name": "end"})

    Now, let's look at these two alignments:

    >>> print(left)
    DNAAlphabet() alignment with 3 rows and 5 columns
    AAAAC Alpha
    AAA-C Beta
    AAAAG Gamma
    >>> print(right)
    DNAAlphabet() alignment with 3 rows and 2 columns
    GT Alpha
    GT Beta
    GT Gamma

    And add them:

    >>> combined = left + right
    >>> print(combined)
    DNAAlphabet() alignment with 3 rows and 7 columns
    AAAACGT Alpha
    AAA-CGT Beta
    AAAAGGT Gamma

    For this to work, both alignments must have the same number of records
    (here they both have 3 rows):

    >>> len(left)
    3
    >>> len(right)
    3
    >>> len(combined)
    3

    The individual rows are SeqRecord objects, and these can be added
    together. Refer to the SeqRecord documentation for details of how the
    annotation is handled. This example is a special case in that both
    original alignments shared the same names, meaning when the rows are
    added they also get the same name.

    Any common annotations are preserved, but differing annotation is lost.
    This is the same behaviour used in the SeqRecord annotations and is
    designed to prevent accidental propagation of inappropriate values:

    >>> combined.annotations
    {'tool': 'demo'}

    Raises NotImplementedError if other is not a MultipleSeqAlignment, and
    ValueError if the two alignments differ in their number of rows.
    """
    if not isinstance(other, MultipleSeqAlignment):
        raise NotImplementedError
    if len(self) != len(other):
        raise ValueError("When adding two alignments they must have the same length"
                         " (i.e. same number or rows)")

    consensus_alpha = Alphabet._consensus_alphabet(
        [self._alphabet, other._alphabet])

    # Row-wise concatenation, evaluated lazily by the new alignment.
    joined_rows = (row_a + row_b for row_a, row_b in zip(self, other))

    # Keep only the annotation entries on which both alignments agree.
    shared = dict(
        (key, value)
        for key, value in self.annotations.items()
        if key in other.annotations and other.annotations[key] == value
    )

    return MultipleSeqAlignment(joined_rows, consensus_alpha, shared)