Ejemplo n.º 1
0
    def _write_seq(self, record):
        """Emit the record's sequence as a SeqXML sequence element.

        The element name is picked from the base alphabet of the sequence
        (i.e. ignoring any Gapped or StopCodon wrapper): "RNAseq", "DNAseq"
        or "AAseq".

        Raises TypeError if the sequence is an UnknownSeq, and ValueError if
        the sequence is empty or its alphabet is not DNA, RNA or protein.
        """
        if isinstance(record.seq, UnknownSeq):
            raise TypeError(
                "Sequence type is UnknownSeq but SeqXML requires sequence")

        text = str(record.seq)
        if not text:
            raise ValueError("The sequence length should be greater than 0")

        # Look underneath any Gapped or StopCodon encoding for the real type.
        base = Alphabet._get_base_alphabet(record.seq.alphabet)

        # Checked in this order; the first matching class wins.
        element_names = (
            (Alphabet.RNAAlphabet, "RNAseq"),
            (Alphabet.DNAAlphabet, "DNAseq"),
            (Alphabet.ProteinAlphabet, "AAseq"),
        )
        for alphabet_class, element_name in element_names:
            if isinstance(base, alphabet_class):
                break
        else:
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        self.xml_generator.startElement(element_name, AttributesImpl({}))
        self.xml_generator.characters(text)
        self.xml_generator.endElement(element_name)
Ejemplo n.º 2
0
    def _write_seq(self, record):
        """Write the sequence of ``record`` through the XML generator.

        SeqXML needs to know the molecule type, so the sequence must carry a
        DNA, RNA or protein alphabet (possibly wrapped in a Gapped or
        StopCodon encoding).

        Raises:
         - TypeError if the sequence is an UnknownSeq.
         - ValueError if the sequence is empty, or if its base alphabet is
           not a DNA, RNA or protein alphabet.
        """
        sequence = record.seq
        if isinstance(sequence, UnknownSeq):
            raise TypeError(
                "Sequence type is UnknownSeq but SeqXML requires sequence")

        letters = str(sequence)
        if len(letters) <= 0:
            raise ValueError("The sequence length should be greater than 0")

        # Strip any Gapped or StopCodon encoding to reach the base alphabet.
        base_alphabet = Alphabet._get_base_alphabet(record.seq.alphabet)

        is_rna = isinstance(base_alphabet, Alphabet.RNAAlphabet)
        if is_rna:
            tag = "RNAseq"
        else:
            if isinstance(base_alphabet, Alphabet.DNAAlphabet):
                tag = "DNAseq"
            else:
                if not isinstance(base_alphabet, Alphabet.ProteinAlphabet):
                    raise ValueError("Need a DNA, RNA or Protein alphabet")
                tag = "AAseq"

        writer = self.xml_generator
        writer.startElement(tag, AttributesImpl({}))
        writer.characters(letters)
        writer.endElement(tag)
Ejemplo n.º 3
0
    def _guess_consensus_alphabet(self, ambiguous):
        """Pick an (ungapped) alphabet for an alignment consensus sequence.

        Starts from the base (un-gapped) alphabet of the alignment, checks
        that the base alphabet of every record is an instance of the same
        class, and then, if the ambiguous character is not one of that
        alphabet's letters, falls back to a more generic alphabet.

        Arguments:
         - ambiguous - the character used for ambiguous positions in the
           consensus.

        Returns the chosen alphabet object.

        Raises ValueError if a record's base alphabet is not an instance of
        the alignment's base alphabet class.
        """
        # Start with the (un-gapped version of) the alignment alphabet
        a = Alphabet._get_base_alphabet(self.alignment._alphabet)

        # Now check it is compatible with all the rest of the sequences
        for record in self.alignment:
            # Get the (un-gapped version of) the sequence's alphabet
            alt = Alphabet._get_base_alphabet(record.seq.alphabet)
            if not isinstance(alt, a.__class__):
                # Adjacent literals are used (not a backslash inside the
                # string) so the message carries no run of indent spaces.
                raise ValueError("Alignment contains a sequence with "
                                 "an incompatible alphabet.")

        # If the alphabet defines no letters, or already contains the
        # ambiguous character, it can be used unchanged.
        letters = getattr(a, "letters", None)
        if letters is None or ambiguous in letters:
            return a

        # We'll need to pick a more generic alphabet...
        # NOTE(review): the DNA and RNA branches re-test the *unambiguous*
        # letter sets; if a.letters is that same set the inner test cannot
        # succeed here. Possibly the ambiguous IUPAC alphabets were
        # intended - confirm before changing, behaviour is kept as-is.
        if isinstance(a, IUPAC.IUPACUnambiguousDNA):
            if ambiguous in IUPAC.IUPACUnambiguousDNA().letters:
                a = IUPAC.IUPACUnambiguousDNA()
            else:
                a = Alphabet.generic_dna
        elif isinstance(a, IUPAC.IUPACUnambiguousRNA):
            if ambiguous in IUPAC.IUPACUnambiguousRNA().letters:
                a = IUPAC.IUPACUnambiguousRNA()
            else:
                a = Alphabet.generic_rna
        elif isinstance(a, IUPAC.IUPACProtein):
            if ambiguous in IUPAC.ExtendedIUPACProtein().letters:
                a = IUPAC.ExtendedIUPACProtein()
            else:
                a = Alphabet.generic_protein
        else:
            a = Alphabet.single_letter_alphabet
        return a
Ejemplo n.º 4
0
    def _guess_consensus_alphabet(self, ambiguous):
        """Pick an (ungapped) alphabet for an alignment consensus sequence.

        This looks at the alignment's base alphabet, verifies that each
        record's base alphabet is an instance of the same class, and returns
        an alphabet that can hold the given ambiguous character.

        Arguments:
         - ambiguous - the character that will mark ambiguous positions in
           the consensus sequence.

        Returns the alphabet to use for the consensus.

        Raises ValueError if any record's base alphabet is not an instance
        of the alignment's base alphabet class.
        """
        # Start with the (un-gapped version of) the alignment alphabet
        a = Alphabet._get_base_alphabet(self.alignment._alphabet)

        # Now check it is compatible with all the rest of the sequences
        for record in self.alignment:
            # Get the (un-gapped version of) the sequence's alphabet
            alt = Alphabet._get_base_alphabet(record.seq.alphabet)
            if not isinstance(alt, a.__class__):
                # Implicit literal concatenation keeps the source indentation
                # out of the message text (a backslash inside the string
                # literal would embed it).
                raise ValueError(
                    "Alignment contains a sequence with "
                    "an incompatible alphabet."
                )

        # Check the ambiguous character we are going to use in the consensus
        # is in the alphabet's list of valid letters (if defined).
        if (hasattr(a, "letters") and a.letters is not None
                and ambiguous not in a.letters):
            # We'll need to pick a more generic alphabet...
            # NOTE(review): for DNA/RNA the inner tests look at the
            # unambiguous letter sets again; when a.letters is that same set
            # they can never be true. The ambiguous IUPAC alphabets may have
            # been intended - confirm. Behaviour is left unchanged here.
            if isinstance(a, IUPAC.IUPACUnambiguousDNA):
                if ambiguous in IUPAC.IUPACUnambiguousDNA().letters:
                    a = IUPAC.IUPACUnambiguousDNA()
                else:
                    a = Alphabet.generic_dna
            elif isinstance(a, IUPAC.IUPACUnambiguousRNA):
                if ambiguous in IUPAC.IUPACUnambiguousRNA().letters:
                    a = IUPAC.IUPACUnambiguousRNA()
                else:
                    a = Alphabet.generic_rna
            elif isinstance(a, IUPAC.IUPACProtein):
                if ambiguous in IUPAC.ExtendedIUPACProtein().letters:
                    a = IUPAC.ExtendedIUPACProtein()
                else:
                    a = Alphabet.generic_protein
            else:
                a = Alphabet.single_letter_alphabet
        return a
Ejemplo n.º 5
0
    def _classify_alphabet_for_nexus(self, alphabet):
        """Map an alphabet to 'protein', 'dna' or 'rna' (PRIVATE).

        The decision is made on the base alphabet, i.e. underneath any
        Gapped or StopCodon encoding.

        Raises TypeError if the base alphabet is not an Alphabet.Alphabet
        instance, and ValueError if it is neither a protein, DNA nor RNA
        alphabet (e.g. NucleotideAlphabet or the generic Alphabet).
        """
        base = Alphabet._get_base_alphabet(alphabet)
        if not isinstance(base, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")

        # Tested in this order; the first matching class decides the name.
        datatypes = (
            (Alphabet.ProteinAlphabet, "protein"),
            (Alphabet.DNAAlphabet, "dna"),
            (Alphabet.RNAAlphabet, "rna"),
        )
        for alphabet_class, datatype in datatypes:
            if isinstance(base, alphabet_class):
                return datatype

        # Must be something like NucleotideAlphabet or just the generic
        # Alphabet (the default for fasta files).
        raise ValueError("Need a DNA, RNA or Protein alphabet")
Ejemplo n.º 6
0
def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False,
                     monoisotopic=False):
    """Calculates the molecular weight of a DNA, RNA or protein sequence.

    Only unambiguous letters are allowed. Nucleotide sequences are assumed to
    have a 5' phosphate.

    seq: String or Biopython sequence object.
    seq_type: The default (None) is to take the alphabet from the seq argument,
              or assume DNA if the seq argument is a string. Override this with
              a string 'DNA', 'RNA', or 'protein'.
    double_stranded: Calculate the mass for the double stranded molecule?
    circular: Is the molecule circular (has no ends)?
    monoisotopic: Use the monoisotopic mass tables?

    Returns the weight as a number.

    Raises TypeError if seq is neither a string nor a Seq/MutableSeq object,
    or if its alphabet is not an Alphabet instance. Raises ValueError if
    seq_type contradicts a DNA, RNA or protein sequence alphabet, if seq_type
    is not one of the allowed strings, if the sequence contains a letter that
    is missing from the weight table, or if double_stranded is requested for
    a protein.

    Note that for backwards compatibility, if the seq argument is a string,
    or Seq object with a generic alphabet, and no seq_type is specified
    (i.e. left as None), then DNA is assumed. A generic alphabet carries no
    molecule type, so an explicit seq_type given together with such a Seq
    object is honoured rather than treated as a contradiction.

    >>> print("%0.2f" % molecular_weight("AGC"))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
    949.61

    However, it is better to be explicit - for example with strings:

    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
    949.61
    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
    997.61
    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
    249.29

    Or, with the sequence alphabet:

    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.Alphabet import generic_dna, generic_rna, generic_protein
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna)))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna)))
    997.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein)))
    249.29

    Also note that contradictory sequence alphabets and seq_type will also
    give an exception:

    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.Alphabet import generic_dna
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA"))
    Traceback (most recent call last):
      ...
    ValueError: seq_type='RNA' contradicts DNA from seq alphabet

    """
    # Rewritten by Markus Piotrowski, 2014

    # Find the alphabet type
    if isinstance(seq, (Seq, MutableSeq)):
        # Molecule type implied by the alphabet; stays empty when the
        # alphabet is generic and therefore says nothing about the type.
        tmp_type = ''
        base_alphabet = Alphabet._get_base_alphabet(seq.alphabet)
        if isinstance(base_alphabet, Alphabet.DNAAlphabet):
            tmp_type = 'DNA'
        elif isinstance(base_alphabet, Alphabet.RNAAlphabet):
            tmp_type = 'RNA'
        elif isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            tmp_type = 'protein'
        elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein):
            tmp_type = 'protein'
            # Convert to one-letter sequence. Have to use a string for seq1
            seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet())
        elif not isinstance(base_alphabet, Alphabet.Alphabet):
            raise TypeError("%s is not a valid alphabet for mass calculations"
                             % base_alphabet)

        if tmp_type:
            if seq_type and tmp_type != seq_type:
                raise ValueError("seq_type=%r contradicts %s from seq alphabet"
                                 % (seq_type, tmp_type))
            seq_type = tmp_type
        elif not seq_type:
            # Generic alphabet and no explicit type given.
            seq_type = "DNA"  # backward compatibility
    elif isinstance(seq, str):
        if seq_type is None:
            seq_type = "DNA"  # backward compatibility
    else:
        raise TypeError("Expected a string or Seq object, not seq=%r" % seq)

    seq = ''.join(str(seq).split()).upper()  # Do the minimum formatting

    if seq_type == 'DNA':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_unambiguous_dna_weights
        else:
            weight_table = IUPACData.unambiguous_dna_weights
    elif seq_type == 'RNA':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_unambiguous_rna_weights
        else:
            weight_table = IUPACData.unambiguous_rna_weights
    elif seq_type == 'protein':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_protein_weights
        else:
            weight_table = IUPACData.protein_weights
    else:
        raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r"
                         % seq_type)

    if monoisotopic:
        water = 18.010565
    else:
        water = 18.0153

    # One water is lost per bond between residues; a circular molecule has
    # one more bond than a linear one of the same length.
    try:
        weight = sum(weight_table[x] for x in seq) - (len(seq)-1) * water
    except KeyError as e:
        raise ValueError('%s is not a valid unambiguous letter for %s'
                         % (e, seq_type))
    if circular:
        weight -= water

    if seq_type in ('DNA', 'RNA') and double_stranded:
        # Add the weight of the complementary strand.
        seq = str(Seq(seq).complement())
        weight += sum(weight_table[x] for x in seq) - (len(seq)-1) * water
        if circular:
            weight -= water
    elif seq_type == 'protein' and double_stranded:
        raise ValueError('double-stranded proteins await their discovery')

    return weight
Ejemplo n.º 7
0
    def information_content(self, start=0,
                            end=None,
                            e_freq_table=None, log_base=2,
                            chars_to_ignore=None):
        """Calculate the information content for each residue along an alignment.

        Arguments:
        o start, end - The starting and ending points to calculate the
        information content. These points should be relative to the first
        sequence in the alignment, starting at zero (ie. even if the 'real'
        first position in the seq is 203 in the initial sequence, for
        the info content, we need to use zero). This defaults to the entire
        length of the first sequence.
        o e_freq_table - A FreqTable object specifying the expected frequencies
        for each letter in the alphabet we are using (e.g. {'G' : 0.4,
        'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
        included, since these should not have expected frequencies. If not
        given, random expected frequencies are chosen from the alignment's
        base alphabet (protein or nucleotide).
        o log_base - The base of the logarithm to use in calculating the
        information content. This defaults to 2 so the info is in bits.
        o chars_to_ignore - A listing of characters which should be ignored
        in calculating the info content. Defaults to no characters.

        Returns:
        o A number representing the info content for the specified region.

        As a side effect, the score of each column in the region is stored
        in self.ic_vector under the column index.

        Raises ValueError if start is negative, if end is beyond the length
        of the first sequence, if no e_freq_table is given and the alignment
        alphabet is neither protein nor nucleotide, or if e_freq_table is
        not a FreqTable object.

        Please see the Biopython manual for more information on how information
        content is calculated.
        """
        # A fresh list per call avoids sharing one mutable default object.
        if chars_to_ignore is None:
            chars_to_ignore = []

        first_seq_length = len(self.alignment._records[0].seq)

        # if no end was specified, then we default to the end of the sequence
        if end is None:
            end = first_seq_length

        if start < 0 or end > first_seq_length:
            # Adjacent literals keep source indentation out of the message.
            raise ValueError("Start (%s) and end (%s) are not in the "
                             "range %s to %s"
                             % (start, end, 0, first_seq_length))

        # determine random expected frequencies, if necessary
        random_expected = None
        if not e_freq_table:
            # TODO - What about ambiguous alphabets?
            base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
            if isinstance(base_alpha, Alphabet.ProteinAlphabet):
                random_expected = Protein20Random
            elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
                random_expected = Nucleotide4Random
            else:
                errstr = "Error in alphabet: not Nucleotide or Protein, "
                errstr += "supply expected frequencies"
                raise ValueError(errstr)
        elif not isinstance(e_freq_table, FreqTable.FreqTable):
            raise ValueError("e_freq_table should be a FreqTable object")

        # determine all of the letters we have to deal with
        all_letters = self._get_all_letters()
        for char in chars_to_ignore:
            all_letters = all_letters.replace(char, '')

        # Score every column first, so ic_vector is only touched once all
        # columns in the region have been computed successfully.
        info_content = {}
        for residue_num in range(start, end):
            freq_dict = self._get_letter_freqs(residue_num,
                                               self.alignment._records,
                                               all_letters, chars_to_ignore)
            column_score = self._get_column_info_content(freq_dict,
                                                         e_freq_table,
                                                         log_base,
                                                         random_expected)
            info_content[residue_num] = column_score

        # sum up the score
        total_info = sum(info_content.values())
        # fill in the ic_vector member: holds IC for each column
        for i in info_content:
            self.ic_vector[i] = info_content[i]
        return total_info
Ejemplo n.º 8
0
    def information_content(self,
                            start=0,
                            end=None,
                            e_freq_table=None,
                            log_base=2,
                            chars_to_ignore=None):
        """Calculate the information content for each residue along an alignment.

        Arguments:
        o start, end - The starting and ending points to calculate the
        information content. These points should be relative to the first
        sequence in the alignment, starting at zero (ie. even if the 'real'
        first position in the seq is 203 in the initial sequence, for
        the info content, we need to use zero). This defaults to the entire
        length of the first sequence.
        o e_freq_table - A FreqTable object specifying the expected frequencies
        for each letter in the alphabet we are using (e.g. {'G' : 0.4,
        'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
        included, since these should not have expected frequencies. When
        omitted, random expected frequencies are picked according to the
        alignment's base alphabet (protein or nucleotide).
        o log_base - The base of the logarithm to use in calculating the
        information content. This defaults to 2 so the info is in bits.
        o chars_to_ignore - A listing of characters which should be ignored
        in calculating the info content. Defaults to ignoring nothing.

        Returns:
        o A number representing the info content for the specified region.

        The per-column scores are also written into self.ic_vector, keyed
        by column index.

        Raises ValueError for a negative start, an end past the length of
        the first sequence, a missing e_freq_table when the alphabet is
        neither protein nor nucleotide, or an e_freq_table that is not a
        FreqTable object.

        Please see the Biopython manual for more information on how information
        content is calculated.
        """
        # Use None as the default and build the list here, so no mutable
        # default object is shared between calls.
        if chars_to_ignore is None:
            chars_to_ignore = []

        # if no end was specified, then we default to the end of the sequence
        if end is None:
            end = len(self.alignment._records[0].seq)

        if start < 0 or end > len(self.alignment._records[0].seq):
            # Implicit concatenation instead of a backslash inside the
            # literal, which would put the source indentation in the text.
            raise ValueError(
                "Start (%s) and end (%s) are not in the "
                "range %s to %s" %
                (start, end, 0, len(self.alignment._records[0].seq)))

        # determine random expected frequencies, if necessary
        random_expected = None
        if not e_freq_table:
            # TODO - What about ambiguous alphabets?
            base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
            if isinstance(base_alpha, Alphabet.ProteinAlphabet):
                random_expected = Protein20Random
            elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
                random_expected = Nucleotide4Random
            else:
                errstr = "Error in alphabet: not Nucleotide or Protein, "
                errstr += "supply expected frequencies"
                raise ValueError(errstr)
        elif not isinstance(e_freq_table, FreqTable.FreqTable):
            raise ValueError("e_freq_table should be a FreqTable object")

        # determine all of the letters we have to deal with
        all_letters = self._get_all_letters()
        for char in chars_to_ignore:
            all_letters = all_letters.replace(char, '')

        # Compute all column scores before storing any of them.
        info_content = {}
        for residue_num in range(start, end):
            freq_dict = self._get_letter_freqs(residue_num,
                                               self.alignment._records,
                                               all_letters, chars_to_ignore)
            info_content[residue_num] = self._get_column_info_content(
                freq_dict, e_freq_table, log_base, random_expected)

        # sum up the score
        total_info = sum(info_content.values())
        # fill in the ic_vector member: holds IC for each column
        for i in info_content:
            self.ic_vector[i] = info_content[i]
        return total_info
Ejemplo n.º 9
0
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.

    Arguments:
     - handle - file-like object positioned on an ABI file; if it has a
       ``mode`` attribute the file must have been opened in 'rb' mode.
     - alphabet - optional alphabet for the sequence. If None, ambiguous_dna
       is used when the base-called sequence contains any of "KYWMRS",
       otherwise unambiguous_dna.
     - trim - if true, the record is passed through _abi_trim before being
       yielded.

    Yields a single SeqRecord, or nothing at all if the file is empty.

    As this is a generator function, the checks below run (and their
    exceptions are raised) when iteration starts, not when it is called.

    Raises ValueError for a protein or RNA alphabet, or a handle whose mode
    is not 'rb', and IOError if the file does not start with "ABIF".
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, "mode"):
        if set("rb") != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # Handle empty file gracefully: end the generator with a plain
        # return. Raising StopIteration inside a generator is converted to
        # RuntimeError under PEP 479.
        return
    if marker != _as_bytes("ABIF"):
        raise IOError("File should start ABIF, not %r" % marker)

    # dirty hack for handling time information
    times = {"RUND1": "", "RUND2": "", "RUNT1": "", "RUNT2": ""}

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

    # NOTE(review): seq, qual and sample_id are only bound if the PBAS2,
    # PCON2 and SMPL1 tags are present; a file lacking one of them fails
    # with a NameError when the record is built below - confirm whether
    # such files need a clearer error.
    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        # stop iteration if all desired tags have been extracted
        # 4 tags from _EXTRACT + 2 time tags from _SPCTAGS - 3,
        # and seq, qual, id
        # todo

        key = tag_name + str(tag_number)

        # PBAS2 is base-called sequence
        if key == "PBAS2":
            seq = tag_data
            ambigs = "KYWMRS"
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == "PCON2":
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == "SMPL1":
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            # extract sequence annotation as defined in _EXTRACT
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot["run_start"] = "%s %s" % (times["RUND1"], times["RUNT1"])
    annot["run_finish"] = "%s %s" % (times["RUND2"], times["RUNT2"])

    # use the file name as SeqRecord.name if available
    try:
        file_name = basename(handle.name).replace(".ab1", "")
    except Exception:
        # Best effort only (e.g. the handle has no usable name); unlike a
        # bare except this lets KeyboardInterrupt and GeneratorExit through.
        file_name = ""

    record = SeqRecord(
        Seq(seq, alphabet),
        id=sample_id,
        name=file_name,
        description="",
        annotations=annot,
        letter_annotations={"phred_quality": qual},
    )

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
Ejemplo n.º 10
0
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.

    Arguments:
     - handle - file-like object for the ABI file. When it exposes a
       ``mode`` attribute, that mode has to be 'rb'.
     - alphabet - optional sequence alphabet. When None it is chosen from
       the base-called sequence: ambiguous_dna if any of 'KYWMRS' occurs,
       unambiguous_dna otherwise.
     - trim - when true, yield _abi_trim(record) instead of the record.

    Yields one SeqRecord; an empty file yields nothing.

    This is a generator function, so validation happens (and exceptions
    surface) on the first iteration step rather than at call time.

    Raises ValueError if the alphabet is a protein or RNA alphabet or the
    handle mode is not 'rb', and IOError if the first four bytes are not
    'ABIF'.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        # Resolve the base alphabet once for both checks.
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # Empty file: finish the generator quietly. A bare return is used
        # because PEP 479 turns StopIteration raised inside a generator
        # body into RuntimeError.
        return
    if marker != _as_bytes('ABIF'):
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': '', }

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT,
                           handle.read(struct.calcsize(_HEADFMT)))

    # NOTE(review): seq, qual and sample_id stay unbound when the PBAS2,
    # PCON2 or SMPL1 tag is absent, which surfaces as a NameError where the
    # record is created - confirm whether that needs a dedicated error.
    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        # stop iteration if all desired tags have been extracted
        # 4 tags from _EXTRACT + 2 time tags from _SPCTAGS - 3,
        # and seq, qual, id
        # todo

        key = tag_name + str(tag_number)

        # PBAS2 is base-called sequence
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            # extract sequence annotation as defined in _EXTRACT
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # use the file name as SeqRecord.name if available
    try:
        file_name = basename(handle.name).replace('.ab1', '')
    except Exception:
        # Still best effort, but no longer swallows KeyboardInterrupt or
        # GeneratorExit the way a bare except does.
        file_name = ""

    record = SeqRecord(Seq(seq, alphabet),
                       id=sample_id, name=file_name,
                       description='',
                       annotations=annot,
                       letter_annotations={'phred_quality': qual})

    if not trim:
        yield record
    else:
        yield _abi_trim(record)