コード例 #1
0
    def _write_seq(self, record):
        """Emit the record's sequence as a SeqXML sequence element.

        The element name is chosen from the base alphabet of
        ``record.seq``: ``RNAseq``, ``DNAseq`` or ``AAseq``.

        Raises:
            TypeError: if ``record.seq`` is an ``UnknownSeq``.
            ValueError: if the sequence string is empty, or the base
                alphabet is not an RNA, DNA or protein alphabet.
        """
        if isinstance(record.seq, UnknownSeq):
            raise TypeError(
                "Sequence type is UnknownSeq but SeqXML requires sequence")

        sequence_text = str(record.seq)
        if len(sequence_text) <= 0:
            raise ValueError("The sequence length should be greater than 0")

        # Look underneath any Gapped or StopCodon encoding.
        base_alphabet = Alphabet._get_base_alphabet(record.seq.alphabet)

        # Tried in this order; the first matching class decides the tag.
        element_by_class = (
            (Alphabet.RNAAlphabet, "RNAseq"),
            (Alphabet.DNAAlphabet, "DNAseq"),
            (Alphabet.ProteinAlphabet, "AAseq"),
        )
        for alphabet_class, element_name in element_by_class:
            if isinstance(base_alphabet, alphabet_class):
                break
        else:
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        writer = self.xml_generator
        writer.startElement(element_name, AttributesImpl({}))
        writer.characters(sequence_text)
        writer.endElement(element_name)
コード例 #2
0
ファイル: SeqXmlIO.py プロジェクト: bow/biopython
    def _write_seq(self, record):
        """Write the sequence of ``record`` as a single SeqXML element.

        SeqXML requires a DNA, RNA or protein alphabet; the base alphabet
        of the sequence selects the element name (``RNAseq``, ``DNAseq``
        or ``AAseq``).

        Raises:
            TypeError: when ``record.seq`` is an ``UnknownSeq``.
            ValueError: when the sequence string is empty, or the base
                alphabet is none of RNA, DNA or protein.
        """
        if isinstance(record.seq, UnknownSeq):
            raise TypeError("Sequence type is UnknownSeq but SeqXML requires sequence")

        residues = str(record.seq)
        if not residues:
            raise ValueError("The sequence length should be greater than 0")

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        underlying = Alphabet._get_base_alphabet(record.seq.alphabet)
        candidates = (
            (Alphabet.RNAAlphabet, "RNAseq"),
            (Alphabet.DNAAlphabet, "DNAseq"),
            (Alphabet.ProteinAlphabet, "AAseq"),
        )
        # First candidate whose class matches wins; None means no match.
        tag = next(
            (name for kind, name in candidates if isinstance(underlying, kind)),
            None,
        )
        if tag is None:
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        self.xml_generator.startElement(tag, AttributesImpl({}))
        self.xml_generator.characters(residues)
        self.xml_generator.endElement(tag)
コード例 #3
0
ファイル: NexusIO.py プロジェクト: Ambuj-UF/ConCat-1.0
    def _classify_alphabet_for_nexus(self, alphabet):
        """Return 'protein', 'dna' or 'rna' based on the alphabet (PRIVATE).

        The decision is made by substring matching on ``str(type(...))``
        of the base alphabet rather than with ``isinstance``; the
        condition was edited by Ambuj Kumar in order to make it align
        with ConCat.

        Raises:
            TypeError: if the type's text contains none of the accepted
                alphabet class names.
            ValueError: if the type is accepted but its text contains
                none of 'Protein', 'DNA' or 'RNA'.
        """
        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        base = Alphabet._get_base_alphabet(alphabet)
        type_text = str(type(base))

        # NOTE(review): only the concrete type's text is inspected, so a
        # subclass whose qualified name lacks these substrings is rejected
        # with TypeError - confirm this is intended.
        accepted_names = (
            'Alphabet.Alphabet',
            'Alphabet.ProteinAlphabet',
            'Alphabet.DNAAlphabet',
            'Alphabet.RNAAlphabet',
            'Alphabet.Gapped',
        )
        if not any(name in type_text for name in accepted_names):
            raise TypeError("Invalid alphabet")

        for marker, label in (('Protein', "protein"),
                              ('DNA', "dna"),
                              ('RNA', "rna")):
            if marker in type_text:
                return label

        # Must be something like NucleotideAlphabet or
        # just the generic Alphabet (default for fasta files)
        raise ValueError("Need a DNA, RNA or Protein alphabet")
コード例 #4
0
ファイル: InsdcIO.py プロジェクト: GJOHNSON2003/biopython
    def _write_the_first_lines(self, record):
        """Write the ID and AC lines."""
        if "." in record.id and record.id.rsplit(".", 1)[1].isdigit():
            version = "SV " + record.id.rsplit(".", 1)[1]
            accession = self._get_annotation_str(record, "accession",
                                                 record.id.rsplit(".", 1)[0],
                                                 just_first=True)
        else:
            version = ""
            accession = self._get_annotation_str(record, "accession",
                                                 record.id,
                                                 just_first=True)

        if ";" in accession:
            raise ValueError("Cannot have semi-colon in EMBL accession, %s"
                             % repr(str(accession)))
        if " " in accession:
            # This is out of practicallity... might it be allowed?
            raise ValueError("Cannot have spaces in EMBL accession, %s"
                             % repr(str(accession)))

        # Get the molecule type
        # TODO - record this explicitly in the parser?
        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        elif isinstance(a, Alphabet.DNAAlphabet):
            mol_type = "DNA"
            units = "BP"
        elif isinstance(a, Alphabet.RNAAlphabet):
            mol_type = "RNA"
            units = "BP"
        elif isinstance(a, Alphabet.ProteinAlphabet):
            mol_type = "PROTEIN"
            units = "AA"
        else:
            # Must be something like NucleotideAlphabet
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        # Get the taxonomy division
        division = self._get_data_division(record)

        # TODO - Full ID line
        handle = self.handle
        # ID   <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.
        # 1. Primary accession number
        # 2. Sequence version number
        # 3. Topology: 'circular' or 'linear'
        # 4. Molecule type
        # 5. Data class
        # 6. Taxonomic division
        # 7. Sequence length
        self._write_single_line("ID", "%s; %s; ; %s; ; %s; %i %s."
                                % (accession, version, mol_type,
                                   division, len(record), units))
        handle.write("XX\n")
        self._write_single_line("AC", accession + ";")
        handle.write("XX\n")
コード例 #5
0
    def _write_the_first_lines(self, record):
        """Write the ID and AC lines."""
        if "." in record.id and record.id.rsplit(".", 1)[1].isdigit():
            version = "SV " + record.id.rsplit(".", 1)[1]
            accession = self._get_annotation_str(record, "accession",
                                                 record.id.rsplit(".", 1)[0],
                                                 just_first=True)
        else :
            version = ""
            accession = self._get_annotation_str(record, "accession",
                                                 record.id,
                                                 just_first=True)
        
        if ";" in accession :
            raise ValueError("Cannot have semi-colon in EMBL accession, %s" \
                             % repr(str(accession)))
        if " " in accession :
            #This is out of practicallity... might it be allowed?
            raise ValueError("Cannot have spaces in EMBL accession, %s" \
                             % repr(str(accession)))

        #Get the molecule type
        #TODO - record this explicitly in the parser?
        #Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        elif isinstance(a, Alphabet.DNAAlphabet):
            mol_type = "DNA"
            units = "BP"
        elif isinstance(a, Alphabet.RNAAlphabet):
            mol_type = "RNA"
            units = "BP"
        elif isinstance(a, Alphabet.ProteinAlphabet):
            mol_type = "PROTEIN"
            units = "AA"
        else:
            #Must be something like NucleotideAlphabet
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        #Get the taxonomy division
        division = self._get_data_division(record)

        #TODO - Full ID line
        handle = self.handle
        #ID   <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.
        #1. Primary accession number
        #2. Sequence version number
        #3. Topology: 'circular' or 'linear'
        #4. Molecule type
        #5. Data class
        #6. Taxonomic division
        #7. Sequence length
        self._write_single_line("ID", "%s; %s; ; %s; ; %s; %i %s." \
                                % (accession, version, mol_type,
                                   division, len(record), units))
        handle.write("XX\n")
        self._write_single_line("AC", accession+";")
        handle.write("XX\n")
コード例 #6
0
    def _guess_consensus_alphabet(self, ambiguous):
        """Pick an (ungapped) alphabet for an alignment consensus sequence (PRIVATE).

        Starts from the base alphabet of the alignment, checks that every
        record's base alphabet is an instance of the same class, and then
        makes sure ``ambiguous`` (the character to be used in the
        consensus) is acceptable, falling back to a more generic alphabet
        when the starting alphabet's ``letters`` do not contain it.

        Raises:
            ValueError: if a record's base alphabet is not an instance of
                the alignment's base alphabet class.
        """
        # Start with the (un-gapped version of) the alignment alphabet
        chosen = Alphabet._get_base_alphabet(self.alignment._alphabet)
        required_class = chosen.__class__

        # Now check it is compatible with all the rest of the sequences
        for record in self.alignment:
            # Get the (un-gapped version of) the sequence's alphabet
            candidate = Alphabet._get_base_alphabet(record.seq.alphabet)
            if not isinstance(candidate, required_class):
                raise ValueError(
                    "Alignment contains a sequence with an incompatible alphabet."
                )

        # Nothing to adjust when the alphabet defines no letters, or when
        # the ambiguous character is already one of them.
        letters = getattr(chosen, "letters", None)
        if letters is None or ambiguous in letters:
            return chosen

        # We'll need to pick a more generic alphabet...
        # NOTE(review): the DNA and RNA branches re-test the letters of the
        # same unambiguous alphabet class that was just found lacking; an
        # ambiguous IUPAC alphabet may have been intended - confirm.
        if isinstance(chosen, IUPAC.IUPACUnambiguousDNA):
            if ambiguous in IUPAC.IUPACUnambiguousDNA().letters:
                return IUPAC.IUPACUnambiguousDNA()
            return Alphabet.generic_dna
        if isinstance(chosen, IUPAC.IUPACUnambiguousRNA):
            if ambiguous in IUPAC.IUPACUnambiguousRNA().letters:
                return IUPAC.IUPACUnambiguousRNA()
            return Alphabet.generic_rna
        if isinstance(chosen, IUPAC.IUPACProtein):
            if ambiguous in IUPAC.ExtendedIUPACProtein().letters:
                return IUPAC.ExtendedIUPACProtein()
            return Alphabet.generic_protein
        return Alphabet.single_letter_alphabet
コード例 #7
0
ファイル: InsdcIO.py プロジェクト: wxb263stu/parliament2
    def _write_sequence(self, record):
        """Write the SQ header line followed by the formatted sequence.

        An ``UnknownSeq`` is not written letter by letter: when the
        record has a "contig" annotation ``self._write_contig`` is called,
        otherwise only a bare ``SQ`` line is written.

        Any other sequence is written lower-cased, in lines of six blocks
        of ten letters; each line ends with the right-justified position
        of its last letter. The A/C/G/T/other composition summary is added
        to the SQ line only when the base alphabet is a DNA alphabet.
        """
        LETTERS_PER_BLOCK = 10
        BLOCKS_PER_LINE = 6
        LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE
        POSITION_PADDING = 10
        handle = self.handle  # save looking up this multiple times

        if isinstance(record.seq, UnknownSeq):
            # We have already recorded the length, and there is no need
            # to record a long sequence of NNNNNNN...NNN or whatever.
            if "contig" in record.annotations:
                self._write_contig(record)
            else:
                # TODO - Can the sequence just be left out as in GenBank files?
                handle.write("SQ   \n")
            return

        # Catches sequence being None
        data = self._get_seq_string(record).lower()
        seq_len = len(data)

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if isinstance(a, Alphabet.DNAAlphabet):
            # TODO - What if we have RNA?
            # data was lower-cased above, so only lower-case letters need
            # counting; upper-case counts would always be zero.
            a_count = data.count("a")
            c_count = data.count("c")
            g_count = data.count("g")
            t_count = data.count("t")
            other = seq_len - (a_count + c_count + g_count + t_count)
            handle.write(
                "SQ   Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n" %
                (seq_len, a_count, c_count, g_count, t_count, other))
        else:
            handle.write("SQ   \n")

        # One pass covers both full lines and the final partial line:
        # padding each block to a fixed width is a no-op for full blocks
        # and fills the gaps on the last line, and the position written is
        # always the index of the last letter on that line.
        block_width = LETTERS_PER_BLOCK + 1  # leading space plus letters
        for line_start in range(0, seq_len, LETTERS_PER_LINE):
            line_end = min(line_start + LETTERS_PER_LINE, seq_len)
            blocks = [
                (" %s" % data[index:index + LETTERS_PER_BLOCK]).ljust(block_width)
                for index in range(line_start,
                                   line_start + LETTERS_PER_LINE,
                                   LETTERS_PER_BLOCK)
            ]
            handle.write(
                "    "  # Just four, not five
                + "".join(blocks)
                + str(line_end).rjust(POSITION_PADDING)
                + "\n")
コード例 #8
0
ファイル: InsdcIO.py プロジェクト: GJOHNSON2003/biopython
    def _write_sequence(self, record):
        """Write the SQ line and the sequence lines for ``record``.

        For an ``UnknownSeq`` the letters are not written: the contig is
        written via ``self._write_contig`` when a "contig" annotation is
        present, otherwise a bare ``SQ`` line is written.

        Otherwise the lower-cased sequence is written sixty letters per
        line as six space-separated blocks of ten, each line followed by
        the right-justified position of its last letter. Only DNA
        alphabets get the A/C/G/T/other counts on the SQ line.
        """
        LETTERS_PER_BLOCK = 10
        BLOCKS_PER_LINE = 6
        LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE
        POSITION_PADDING = 10
        handle = self.handle  # save looking up this multiple times

        if isinstance(record.seq, UnknownSeq):
            # We have already recorded the length, and there is no need
            # to record a long sequence of NNNNNNN...NNN or whatever.
            if "contig" in record.annotations:
                self._write_contig(record)
            else:
                # TODO - Can the sequence just be left out as in GenBank files?
                handle.write("SQ   \n")
            return

        # Catches sequence being None
        data = self._get_seq_string(record).lower()
        seq_len = len(data)

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if isinstance(a, Alphabet.DNAAlphabet):
            # TODO - What if we have RNA?
            # The data is already lower case, so counting the upper-case
            # letters as well would only add zeros.
            a_count, c_count, g_count, t_count = (
                data.count(letter) for letter in "acgt")
            other = seq_len - (a_count + c_count + g_count + t_count)
            handle.write("SQ   Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n"
                         % (seq_len, a_count, c_count, g_count, t_count, other))
        else:
            handle.write("SQ   \n")

        # Full lines and the final (partial) line share one loop: left
        # justifying every block to eleven characters changes nothing for
        # complete blocks and pads the missing ones, while the position
        # column is the end of the line or of the sequence.
        for line_start in range(0, seq_len, LETTERS_PER_LINE):
            pieces = ["    "]  # Just four, not five
            for block in range(BLOCKS_PER_LINE):
                index = line_start + LETTERS_PER_BLOCK * block
                pieces.append(
                    (" %s" % data[index:index + LETTERS_PER_BLOCK]).ljust(
                        LETTERS_PER_BLOCK + 1))
            last_position = min(line_start + LETTERS_PER_LINE, seq_len)
            pieces.append(str(last_position).rjust(POSITION_PADDING))
            pieces.append("\n")
            handle.write("".join(pieces))
コード例 #9
0
ファイル: AlignInfo.py プロジェクト: pvanheus/biopython
    def _guess_consensus_alphabet(self, ambiguous):
        """Pick an (ungapped) alphabet for an alignment consensus sequence.

        This looks at the sequences we have, checks their type, and
        returns an appropriate alphabet which makes sense with the
        sequences we've got. ``ambiguous`` is the character that will be
        used in the consensus; if the starting alphabet defines
        ``letters`` that do not include it, a more generic alphabet is
        returned instead.

        Raises:
            ValueError: if a record's base alphabet is not an instance of
                the alignment's base alphabet class.
        """
        # Start with the (un-gapped version of) the alignment alphabet
        a = Alphabet._get_base_alphabet(self.alignment._alphabet)

        # Now check it is compatible with all the rest of the sequences
        for record in self.alignment:
            # Get the (un-gapped version of) the sequence's alphabet
            alt = Alphabet._get_base_alphabet(record.seq.alphabet)
            if not isinstance(alt, a.__class__):
                # Built from adjacent literals: a backslash continuation
                # inside one literal would embed the next line's
                # indentation in the message.
                raise ValueError(
                    "Alignment contains a sequence with "
                    "an incompatible alphabet."
                )

        # Check the ambiguous character we are going to use in the consensus
        # is in the alphabet's list of valid letters (if defined).
        if (
            hasattr(a, "letters")
            and a.letters is not None
            and ambiguous not in a.letters
        ):
            # We'll need to pick a more generic alphabet...
            # NOTE(review): the DNA and RNA branches re-test the letters of
            # the same unambiguous alphabet class; an ambiguous IUPAC
            # alphabet may have been intended - confirm before changing.
            if isinstance(a, IUPAC.IUPACUnambiguousDNA):
                if ambiguous in IUPAC.IUPACUnambiguousDNA().letters:
                    a = IUPAC.IUPACUnambiguousDNA()
                else:
                    a = Alphabet.generic_dna
            elif isinstance(a, IUPAC.IUPACUnambiguousRNA):
                if ambiguous in IUPAC.IUPACUnambiguousRNA().letters:
                    a = IUPAC.IUPACUnambiguousRNA()
                else:
                    a = Alphabet.generic_rna
            elif isinstance(a, IUPAC.IUPACProtein):
                if ambiguous in IUPAC.ExtendedIUPACProtein().letters:
                    a = IUPAC.ExtendedIUPACProtein()
                else:
                    a = Alphabet.generic_protein
            else:
                a = Alphabet.single_letter_alphabet
        return a
コード例 #10
0
ファイル: InsdcIO.py プロジェクト: apierleoni/biopython
    def _write_the_first_lines(self, record):
        """Write the ID and AC lines."""
        if "." in record.id and record.id.rsplit(".",1)[1].isdigit():
            version = "SV " + record.id.rsplit(".",1)[1]
            accession = self._get_annotation_str(record, "accession",
                                                 record.id.rsplit(".",1)[0],
                                                 just_first=True)
        else :
            version = ""
            accession = self._get_annotation_str(record, "accession",
                                                 record.id,
                                                 just_first=True)
        
        if ";" in accession :
            raise ValueError("Cannot have semi-colon in EMBL accession, %s" \
                             % repr(accession))
        if " " in accession :
            #This is out of practicallity... might it be allowed?
            raise ValueError("Cannot have spaces in EMBL accession, %s" \
                             % repr(accession))

        #Get the molecule type
        #TODO - record this explicitly in the parser?
        #Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        elif not isinstance(a, Alphabet.NucleotideAlphabet):
            raise ValueError("Need a Nucleotide alphabet")
        elif isinstance(a, Alphabet.DNAAlphabet):
            mol_type = "DNA"
        elif isinstance(a, Alphabet.RNAAlphabet):
            mol_type = "RNA"
        else:
            #Must be something like NucleotideAlphabet
            raise ValueError("Need a DNA or RNA alphabet")

        #TODO - Full ID line
        handle = self.handle
        self._write_single_line("ID", "%s; %s; ; %s; ; ; %i BP." \
                                % (accession, version, mol_type, len(record)))
        handle.write("XX\n")
        self._write_single_line("AC", accession+";")
        handle.write("XX\n")
コード例 #11
0
    def _classify_alphabet_for_nexus(self, alphabet):
        """Return 'protein', 'dna' or 'rna' based on the alphabet (PRIVATE).

        Raises:
            TypeError: if the base alphabet is not an ``Alphabet``.
            ValueError: if the base alphabet is not a protein, DNA or
                RNA alphabet.
        """
        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        base = Alphabet._get_base_alphabet(alphabet)
        if not isinstance(base, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")

        # Tried in this order; the first matching class gives the label.
        labels = (
            (Alphabet.ProteinAlphabet, "protein"),
            (Alphabet.DNAAlphabet, "dna"),
            (Alphabet.RNAAlphabet, "rna"),
        )
        for alphabet_class, label in labels:
            if isinstance(base, alphabet_class):
                return label

        # Must be something like NucleotideAlphabet or
        # just the generic Alphabet (default for fasta files)
        raise ValueError("Need a DNA, RNA or Protein alphabet")
コード例 #12
0
ファイル: NexusIO.py プロジェクト: DunbrackLab/biopython
    def _classify_alphabet_for_nexus(self, alphabet):
        """Classify the alphabet as 'protein', 'dna' or 'rna' (PRIVATE).

        Raises:
            TypeError: when the base alphabet is not an ``Alphabet``.
            ValueError: when the base alphabet is none of protein, DNA
                or RNA.
        """
        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        underlying = Alphabet._get_base_alphabet(alphabet)

        if not isinstance(underlying, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        if isinstance(underlying, Alphabet.ProteinAlphabet):
            return "protein"
        if isinstance(underlying, Alphabet.DNAAlphabet):
            return "dna"
        if isinstance(underlying, Alphabet.RNAAlphabet):
            return "rna"

        # Must be something like NucleotideAlphabet or
        # just the generic Alphabet (default for fasta files)
        raise ValueError("Need a DNA, RNA or Protein alphabet")
コード例 #13
0
ファイル: InsdcIO.py プロジェクト: GJOHNSON2003/biopython
 def _write_references(self, record):
     number = 0
     for ref in record.annotations["references"]:
         if not isinstance(ref, SeqFeature.Reference):
             continue
         number += 1
         data = str(number)
         # TODO - support more complex record reference locations?
         if ref.location and len(ref.location) == 1:
             a = Alphabet._get_base_alphabet(record.seq.alphabet)
             if isinstance(a, Alphabet.ProteinAlphabet):
                 units = "residues"
             else:
                 units = "bases"
             data += "  (%s %i to %i)" % (units,
                                          ref.location[0].nofuzzy_start + 1,
                                          ref.location[0].nofuzzy_end)
         self._write_single_line("REFERENCE", data)
         if ref.authors:
             # We store the AUTHORS data as a single string
             self._write_multi_line("  AUTHORS", ref.authors)
         if ref.consrtm:
             # We store the consortium as a single string
             self._write_multi_line("  CONSRTM", ref.consrtm)
         if ref.title:
             # We store the title as a single string
             self._write_multi_line("  TITLE", ref.title)
         if ref.journal:
             # We store this as a single string - holds the journal name,
             # volume, year, and page numbers of the citation
             self._write_multi_line("  JOURNAL", ref.journal)
         if ref.medline_id:
             # This line type is obsolete and was removed from the GenBank
             # flatfile format in April 2005. Should we write it?
             # Note this has a two space indent:
             self._write_multi_line("  MEDLINE", ref.medline_id)
         if ref.pubmed_id:
             # Note this has a THREE space indent:
             self._write_multi_line("   PUBMED", ref.pubmed_id)
         if ref.comment:
             self._write_multi_line("  REMARK", ref.comment)
コード例 #14
0
ファイル: InsdcIO.py プロジェクト: wxb263stu/parliament2
 def _write_references(self, record):
     number = 0
     for ref in record.annotations["references"]:
         if not isinstance(ref, SeqFeature.Reference):
             continue
         number += 1
         data = str(number)
         # TODO - support more complex record reference locations?
         if ref.location and len(ref.location) == 1:
             a = Alphabet._get_base_alphabet(record.seq.alphabet)
             if isinstance(a, Alphabet.ProteinAlphabet):
                 units = "residues"
             else:
                 units = "bases"
             data += "  (%s %i to %i)" % (units,
                                          ref.location[0].nofuzzy_start + 1,
                                          ref.location[0].nofuzzy_end)
         self._write_single_line("REFERENCE", data)
         if ref.authors:
             # We store the AUTHORS data as a single string
             self._write_multi_line("  AUTHORS", ref.authors)
         if ref.consrtm:
             # We store the consortium as a single string
             self._write_multi_line("  CONSRTM", ref.consrtm)
         if ref.title:
             # We store the title as a single string
             self._write_multi_line("  TITLE", ref.title)
         if ref.journal:
             # We store this as a single string - holds the journal name,
             # volume, year, and page numbers of the citation
             self._write_multi_line("  JOURNAL", ref.journal)
         if ref.medline_id:
             # This line type is obsolete and was removed from the GenBank
             # flatfile format in April 2005. Should we write it?
             # Note this has a two space indent:
             self._write_multi_line("  MEDLINE", ref.medline_id)
         if ref.pubmed_id:
             # Note this has a THREE space indent:
             self._write_multi_line("   PUBMED", ref.pubmed_id)
         if ref.comment:
             self._write_multi_line("  REMARK", ref.comment)
コード例 #15
0
ファイル: NexusIO.py プロジェクト: Ambuj-UF/ConCat-1.0
    def _classify_alphabet_for_nexus(self, alphabet):
        """Return 'protein', 'dna' or 'rna' based on the alphabet (PRIVATE).

        The base alphabet is classified by looking for substrings in
        ``str(type(...))`` instead of using ``isinstance``.

        Raises:
            TypeError: if the type's text contains none of the accepted
                alphabet class names.
            ValueError: if the type is accepted but its text contains
                none of 'Protein', 'DNA' or 'RNA'.
        """
        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        base_alphabet = Alphabet._get_base_alphabet(alphabet)

        # The condition below was edited by Ambuj Kumar in order to make
        # it align with ConCat.
        # NOTE(review): only the concrete type's text is examined, so a
        # subclass whose qualified name lacks these fragments raises
        # TypeError - confirm this is intended.
        described = str(type(base_alphabet))

        recognised = False
        for fragment in ('Alphabet.Alphabet',
                         'Alphabet.ProteinAlphabet',
                         'Alphabet.DNAAlphabet',
                         'Alphabet.RNAAlphabet',
                         'Alphabet.Gapped'):
            if fragment in described:
                recognised = True
                break
        if not recognised:
            raise TypeError("Invalid alphabet")

        if 'Protein' in described:
            return "protein"
        if 'DNA' in described:
            return "dna"
        if 'RNA' in described:
            return "rna"

        # Must be something like NucleotideAlphabet or
        # just the generic Alphabet (default for fasta files)
        raise ValueError("Need a DNA, RNA or Protein alphabet")
コード例 #16
0
    def information_content(self, start=0,
                            end=None,
                            e_freq_table=None, log_base=2,
                            chars_to_ignore=None, pseudo_count=0):
        """Calculate the information content for each residue along an alignment.

        Arguments:
         - start, end - The starting and ending points to calculate the
           information content. These points should be relative to the first
           sequence in the alignment, starting at zero (ie. even if the 'real'
           first position in the seq is 203 in the initial sequence, for
           the info content, we need to use zero). This defaults to the entire
           length of the first sequence.
         - e_freq_table - A FreqTable object specifying the expected frequencies
           for each letter in the alphabet we are using (e.g. {'G' : 0.4,
           'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
           included, since these should not have expected frequencies.
         - log_base - The base of the logarithm to use in calculating the
           information content. This defaults to 2 so the info is in bits.
         - chars_to_ignore - A listing of characters which should be ignored
           in calculating the info content. Defaults to none.
         - pseudo_count - Passed through unchanged to the per-column letter
           frequency calculation. Defaults to 0.

        Returns:
         - A number representing the info content for the specified region
           (the sum of the per-column scores).

        As a side effect ``self.ic_vector`` is replaced by a list holding
        the score of each column from ``start`` up to (not including)
        ``end``.

        Raises:
         - ValueError if ``start`` is negative or ``end`` is beyond the
           first sequence, if no expected frequencies are given and the
           alignment alphabet is neither nucleotide nor protein, or if
           ``e_freq_table`` is given but is not a FreqTable.

        Please see the Biopython manual for more information on how information
        content is calculated.

        """
        first_seq_length = len(self.alignment[0].seq)
        # if no end was specified, then we default to the end of the sequence
        if end is None:
            end = first_seq_length
        if chars_to_ignore is None:
            chars_to_ignore = []

        if start < 0 or end > first_seq_length:
            # Adjacent literals keep the message on one line; a backslash
            # continuation inside the literal would embed the indentation.
            raise ValueError(
                "Start (%s) and end (%s) are not in the "
                "range %s to %s"
                % (start, end, 0, first_seq_length))

        # determine random expected frequencies, if necessary
        random_expected = None
        if not e_freq_table:
            # TODO - What about ambiguous alphabets?
            base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
            if isinstance(base_alpha, Alphabet.ProteinAlphabet):
                random_expected = Protein20Random
            elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
                random_expected = Nucleotide4Random
            else:
                errstr = "Error in alphabet: not Nucleotide or Protein, "
                errstr += "supply expected frequencies"
                raise ValueError(errstr)
        elif not isinstance(e_freq_table, FreqTable.FreqTable):
            raise ValueError("e_freq_table should be a FreqTable object")

        # determine all of the letters we have to deal with
        all_letters = self._get_all_letters()
        for char in chars_to_ignore:
            all_letters = all_letters.replace(char, '')

        # one score per column, in column order
        column_scores = []
        for residue_num in range(start, end):
            freq_dict = self._get_letter_freqs(residue_num,
                                               self.alignment,
                                               all_letters,
                                               chars_to_ignore,
                                               pseudo_count,
                                               e_freq_table,
                                               random_expected)
            column_scores.append(
                self._get_column_info_content(freq_dict,
                                              e_freq_table,
                                              log_base,
                                              random_expected))

        # ic_vector is only replaced once every column has been scored,
        # so a failure part-way leaves the previous value in place.
        self.ic_vector = column_scores
        # sum up the score
        return sum(column_scores)
コード例 #17
0
ファイル: test_SeqIO.py プロジェクト: Pfiver/RNA-Seqlyze
    # Check Bio.SeqIO.read(...)
    if t_count == 1:
        record = SeqIO.read(handle=open(t_filename, mode), format=t_format)
        assert isinstance(record, SeqRecord)
    else:
        try:
            record = SeqIO.read(open(t_filename), t_format)
            assert False, "Bio.SeqIO.read(...) should have failed"
        except ValueError:
            #Expected to fail
            pass

    # Check alphabets
    for record in records:
        base_alpha = Alphabet._get_base_alphabet(record.seq.alphabet)
        if isinstance(base_alpha, Alphabet.SingleLetterAlphabet):
            if t_format in no_alpha_formats:
                assert base_alpha == Alphabet.single_letter_alphabet  # Too harsh?
        else:
            base_alpha = None
    if base_alpha is None:
        good = []
        bad = []
        given_alpha = None
    elif isinstance(base_alpha, Alphabet.ProteinAlphabet):
        good = protein_alphas
        bad = dna_alphas + rna_alphas + nucleotide_alphas
    elif isinstance(base_alpha, Alphabet.RNAAlphabet):
        good = nucleotide_alphas + rna_alphas
        bad = protein_alphas + dna_alphas
コード例 #18
0
ファイル: InsdcIO.py プロジェクト: NirBenTalLab/find_motif
    def _write_the_first_line(self, record):
        """Compose the LOCUS line for ``record`` and write it to the handle.

        The locus identifier comes from ``record.name``; if that is empty or
        the "<unknown name>" placeholder, ``record.id`` is used, and if that
        is empty or "<unknown id>", the value returned by
        ``self._get_annotation_str(record, "accession", just_first=True)``.
        The finished line is checked column by column with assertions before
        being written to ``self.handle``.

        Raises:
            ValueError: if the locus is longer than 16 characters, the record
                is too long for the length field, or the base alphabet is not
                a DNA, RNA or protein alphabet.
            TypeError: if the base alphabet is not an ``Alphabet`` instance.
        """
        locus = record.name
        if not locus or locus == "<unknown name>":
            locus = record.id
        if not locus or locus == "<unknown id>":
            locus = self._get_annotation_str(
                record, "accession", just_first=True)
        if len(locus) > 16:
            raise ValueError("Locus identifier %s is too long" % repr(locus))

        seq_length = len(record)
        if seq_length > 99999999999:
            # GenBank officially supports only up to 350000, but the length
            # field of the LOCUS line has room for eleven digits.
            raise ValueError("Sequence too long!")

        # Look underneath any Gapped or StopCodon wrapper for the real type.
        base_alpha = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(base_alpha, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        if isinstance(base_alpha, Alphabet.ProteinAlphabet):
            units = "aa"
        elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
            units = "bp"
        else:
            # e.g. the generic Alphabet (the default for fasta files)
            raise ValueError("Need a Nucleotide or Protein alphabet")

        # Molecule type column; the first matching alphabet class wins.
        # TODO - record this explicitly in the parser?
        molecule_names = (
            (Alphabet.ProteinAlphabet, ""),
            (Alphabet.DNAAlphabet, "DNA"),
            (Alphabet.RNAAlphabet, "RNA"),
        )
        for alpha_class, mol_type in molecule_names:
            if isinstance(base_alpha, alpha_class):
                break
        else:
            # A generic NucleotideAlphabet is not specific enough here.
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        # Anything missing or outside this list is written as "UNK".
        known_divisions = (
            "PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", "VRL", "PHG",
            "SYN", "UNA", "EST", "PAT", "STS", "GSS", "HTG", "HTC", "ENV",
            "CON",
        )
        try:
            division = record.annotations["data_file_division"]
        except KeyError:
            division = "UNK"
        else:
            if division not in known_divisions:
                division = "UNK"

        assert len(units) == 2
        assert len(division) == 3
        # TODO - date
        # TODO - mol_type
        length_text = str(seq_length)
        line = "LOCUS       %s %s %s    %s           %s %s\n" % (
            locus.ljust(16),
            length_text.rjust(11),
            units,
            mol_type.ljust(6),
            division,
            self._get_date(record),
        )
        # 79 columns plus the trailing new line
        assert len(line) == 80, repr(line)

        assert line[12:28].rstrip() == locus, (
            'LOCUS line does not contain the locus at the expected position:\n' + line)
        assert line[28:29] == " "
        assert line[29:40].lstrip() == length_text, (
            'LOCUS line does not contain the length at the expected position:\n' + line)

        # Tests copied from Bio.GenBank.Scanner
        assert line[40:44] in [' bp ', ' aa '], (
            'LOCUS line does not contain size units at expected position:\n' + line)
        assert line[44:47] in ['   ', 'ss-', 'ds-', 'ms-'], (
            'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line)
        mol_field = line[47:54].strip()
        assert mol_field == "" or 'DNA' in mol_field or 'RNA' in mol_field, (
            'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line)
        assert line[54:55] == ' ', (
            'LOCUS line does not contain space at position 55:\n' + line)
        assert line[55:63].strip() in ['', 'linear', 'circular'], (
            'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line)
        assert line[63:64] == ' ', (
            'LOCUS line does not contain space at position 64:\n' + line)
        assert line[67:68] == ' ', (
            'LOCUS line does not contain space at position 68:\n' + line)
        assert line[70:71] == '-', (
            'LOCUS line does not contain - at position 71 in date:\n' + line)
        assert line[74:75] == '-', (
            'LOCUS line does not contain - at position 75 in date:\n' + line)

        self.handle.write(line)
コード例 #19
0
ファイル: parser.py プロジェクト: yech1990/cfutils
def abi_iterator(handle, alphabet=None):
    """Iterate over the record stored in an ABI trace file.

    Args:
        handle: File-like object supporting ``seek`` and ``read``. If it has
            a ``mode`` attribute, that mode must be made up of exactly the
            letters "r" and "b".
        alphabet: Optional alphabet for the sequence. Protein and RNA
            alphabets are rejected. When None, ``ambiguous_dna`` is used if
            the base calls contain any of "KYWMRS", else ``unambiguous_dna``.

    Yields:
        A single SeqRecord built from the tags found in the file. Nothing is
        yielded for an empty file.

    Raises:
        ValueError: For a protein/RNA alphabet or a handle not in 'rb' mode.
        IOError: If the file does not start with the ``ABIF`` marker.
    """
    # The error messages below state that ABI files hold neither proteins
    # nor RNA, so those alphabets are rejected up front.
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins."
            )
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, "mode"):
        if set("rb") != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # Empty file: end the generator cleanly. Raising StopIteration here
        # would be turned into RuntimeError by PEP 479 (Python 3.7+).
        return
    if marker != b"ABIF":
        raise IOError("File should start ABIF, not %r" % marker)

    # dirty hack for handling time information
    times = {"RUND1": "", "RUND2": "", "RUNT1": "", "RUNT2": ""}

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

    # The SMPL1 tag may be missing; fall back to a placeholder id instead of
    # failing later with an unbound local variable.
    sample_id = "<unknown id>"

    # TODO: stop iterating once every wanted tag has been extracted.
    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        key = tag_name + str(tag_number)

        # PBAS2 is base-called sequence
        if key == "PBAS2":
            seq = tag_data
            ambigs = "KYWMRS"
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == "PCON2":
            qual = [ord(val) for val in tag_data]
        # PLOC2 is the location of peaks
        elif key == "PLOC2":
            peakamps = [float(val) for val in tag_data]
            annot["peak positions"] = peakamps
        # DATA1-DATA4 is raw channel 1-4 output, DATA9-12 the analyzed one
        elif key in ["DATA9", "DATA10", "DATA11", "DATA12"]:
            rawch = [float(val) for val in tag_data]
            # DATA9..DATA12 are stored as "channel 1".."channel 4"
            annot["channel " + str(int(key[4:]) - 8)] = rawch
        # FWO_1 is the order of channels in bases
        elif key == "FWO_1":
            channelorders = tag_data
            annot["channels"] = channelorders
        # SMPL1 is sample id entered before sequencing run
        elif key == "SMPL1":
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            # extract sequence annotation as defined in _EXTRACT
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot["run_start"] = "%s %s" % (times["RUND1"], times["RUNT1"])
    annot["run_finish"] = "%s %s" % (times["RUND2"], times["RUNT2"])

    # use the file name as SeqRecord.name if available; in-memory handles
    # have no ``name`` attribute
    try:
        file_name = basename(handle.name).replace(".ab1", "")
    except AttributeError:
        file_name = ""

    # NOTE: seq and qual are only bound when the PBAS2 / PCON2 tags were seen.
    record = SeqRecord(
        Seq(seq, alphabet),
        id=sample_id,
        name=file_name,
        description="",
        annotations=annot,
        letter_annotations={"phred_quality": qual},
    )

    #  yield _abi_trim(record)
    yield record
コード例 #20
0
ファイル: AbiIO.py プロジェクト: tulw4r/biopython
    def iterate(self, handle):
        """Read one ABIF stream and yield the SeqRecord built from its tags.

        Uses ``self.alphabet`` (protein and RNA alphabets are rejected) and
        ``self.trim``. A stream with neither a PBAS1 nor a PBAS2 tag is
        treated as an .fsa file and produces a record with an empty sequence;
        otherwise the PBAS2 base calls and PCON2 qualities are used and the
        record goes through ``_abi_trim`` when ``self.trim`` is set.
        """
        alphabet = self.alphabet
        # Only DNA is acceptable: refuse protein and RNA alphabets.
        if alphabet is not None:
            base_alphabet = Alphabet._get_base_alphabet(alphabet)
            if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
                raise ValueError(
                    "Invalid alphabet, ABI files do not hold proteins.")
            if isinstance(base_alphabet, Alphabet.RNAAlphabet):
                raise ValueError(
                    "Invalid alphabet, ABI files do not hold RNA.")

        # Run date/time tags are collected here and merged into two
        # annotations once every tag has been read.
        times = dict.fromkeys(("RUND1", "RUND2", "RUNT1", "RUNT2"), "")

        # Every annotation named in _EXTRACT starts out as None.
        annot = dict.fromkeys(_EXTRACT.values())

        # Read and decode the fixed-size header.
        header_size = struct.calcsize(_HEADFMT)
        header = struct.unpack(_HEADFMT, handle.read(header_size))

        # SMPL1 normally supplies the sample id, but it may be missing.
        sample_id = "<unknown id>"

        raw = {}
        tags = _abi_parse_header(header, handle)
        for tag_name, tag_number, tag_data in tags:
            key = tag_name + str(tag_number)
            raw[key] = tag_data

            if key == "PBAS2":
                # base-called sequence, only available in 3530
                seq = tag_data.decode()
                if alphabet is None:
                    if set(seq).intersection("KYWMRS"):
                        alphabet = ambiguous_dna
                    else:
                        alphabet = unambiguous_dna
            elif key == "PCON2":
                # quality values of the base-called sequence
                qual = list(map(ord, tag_data.decode()))
            elif key == "SMPL1":
                # sample id entered before the sequencing run
                sample_id = _get_string_tag(tag_data)
            elif key in times:
                times[key] = tag_data
            elif key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

        annot["run_start"] = "%s %s" % (times["RUND1"], times["RUNT1"])
        annot["run_finish"] = "%s %s" % (times["RUND2"], times["RUNT2"])

        # Expose every raw tag for advanced users.
        annot["abif_raw"] = raw

        # No base calls at all means this is an .fsa file.
        is_fsa_file = "PBAS1" not in raw and "PBAS2" not in raw

        # The record name is the file name minus its extension, when the
        # handle has a name at all.
        extension = ".fsa" if is_fsa_file else ".ab1"
        try:
            file_name = basename(handle.name).replace(extension, "")
        except AttributeError:
            file_name = ""

        if is_fsa_file:
            sample_id = _get_string_tag(raw.get("LIMS1"), sample_id)
            description = _get_string_tag(raw.get("CTID1"),
                                          "<unknown description>")
            record = SeqRecord(
                Seq(""),
                id=sample_id,
                name=file_name,
                description=description,
                annotations=annot,
            )
        else:
            record = SeqRecord(
                Seq(seq, alphabet),
                id=sample_id,
                name=file_name,
                description="",
                annotations=annot,
                letter_annotations={"phred_quality": qual},
            )

        should_trim = self.trim and not is_fsa_file
        if should_trim:
            record = _abi_trim(record)

        record.annotations["molecule_type"] = "DNA"
        yield record
コード例 #21
0
ファイル: AbiIO.py プロジェクト: xirect/textgraver
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.

    Args:
        handle: File-like object supporting ``seek`` and ``read``. If it has
            a ``mode`` attribute, that mode must be made up of exactly the
            letters "r" and "b".
        alphabet: Optional alphabet for the sequence. Protein and RNA
            alphabets are rejected. When None, ``ambiguous_dna`` is used if
            the base calls contain any of "KYWMRS", else ``unambiguous_dna``.
        trim: If true, sequence records are passed through ``_abi_trim``
            before being yielded. Ignored for .fsa files.

    Yields:
        A single SeqRecord. Nothing is yielded for an empty file.

    Raises:
        ValueError: For a protein/RNA alphabet or a handle not in 'rb' mode.
        IOError: If the file does not start with the ``ABIF`` marker.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # Empty file: end the generator cleanly. Raising StopIteration here
        # would be turned into RuntimeError by PEP 479 (Python 3.7+).
        return
    if marker != b"ABIF":
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {
        'RUND1': '',
        'RUND2': '',
        'RUNT1': '',
        'RUNT2': '',
    }

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

    # The SMPL1 tag may be missing; fall back to a placeholder id instead of
    # failing later with an unbound local variable.
    sample_id = '<unknown id>'

    raw = {}
    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        key = tag_name + str(tag_number)

        raw[key] = tag_data

        # PBAS2 is base-called sequence, only available in 3530
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # raw data (for advanced end users benefit)
    annot['abif_raw'] = raw

    # fsa check: all three tags must be present
    is_fsa_file = {'SpNm1', 'LIMS1', 'CTID1'}.issubset(raw)

    if is_fsa_file:
        try:
            file_name = basename(handle.name).replace('.fsa', '')
        except AttributeError:
            file_name = ""
        sample_id = raw['LIMS1']
        description = raw['CTID1']
        record = SeqRecord(Seq(''),
                           id=sample_id,
                           name=file_name,
                           description=description,
                           annotations=annot)

    else:
        # use the file name as SeqRecord.name if available
        try:
            file_name = basename(handle.name).replace('.ab1', '')
        except AttributeError:
            file_name = ""
        # NOTE: seq and qual are only bound when PBAS2 / PCON2 were seen.
        record = SeqRecord(Seq(seq, alphabet),
                           id=sample_id,
                           name=file_name,
                           description='',
                           annotations=annot,
                           letter_annotations={'phred_quality': qual})

    if not trim or is_fsa_file:
        yield record
    else:
        yield _abi_trim(record)
コード例 #22
0
ファイル: AbiIO.py プロジェクト: ezequieljsosa/biopython
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.

    Args:
        handle: File-like object supporting ``seek`` and ``read``. If it has
            a ``mode`` attribute, that mode must be made up of exactly the
            letters "r" and "b".
        alphabet: Optional alphabet for the sequence. Protein and RNA
            alphabets are rejected. When None, ``ambiguous_dna`` is used if
            the base calls contain any of "KYWMRS", else ``unambiguous_dna``.
        trim: If true, sequence records are passed through ``_abi_trim``
            before being yielded. Ignored for .fsa files.

    Yields:
        A single SeqRecord. Nothing is yielded for an empty file.

    Raises:
        ValueError: For a protein/RNA alphabet or a handle not in 'rb' mode.
        IOError: If the file does not start with the ``ABIF`` marker.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # handle empty file gracefully
        return
    if marker != b"ABIF":
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': '', }

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT,
                           handle.read(struct.calcsize(_HEADFMT)))

    # The SMPL1 tag may be missing from a sequence file; fall back to a
    # placeholder id instead of failing later with an unbound local variable.
    sample_id = '<unknown id>'

    raw = {}
    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        key = tag_name + str(tag_number)

        raw[key] = tag_data

        # PBAS2 is base-called sequence, only available in 3530
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # raw data (for advanced end users benefit)
    annot['abif_raw'] = raw

    # fsa check: no base-call tag of either kind is present
    is_fsa_file = all(tn not in raw for tn in ('PBAS1', 'PBAS2'))

    if is_fsa_file:
        try:
            file_name = basename(handle.name).replace('.fsa', '')
        except AttributeError:
            file_name = ""
        sample_id = raw.get('LIMS1', '<unknown id>')
        description = raw.get('CTID1', '<unknown description>')
        record = SeqRecord(Seq(''),
                           id=sample_id,
                           name=file_name,
                           description=description,
                           annotations=annot)

    else:
        # use the file name as SeqRecord.name if available
        try:
            file_name = basename(handle.name).replace('.ab1', '')
        except AttributeError:
            file_name = ""
        # NOTE: seq and qual are only bound when PBAS2 / PCON2 were seen.
        record = SeqRecord(Seq(seq, alphabet),
                           id=sample_id, name=file_name,
                           description='',
                           annotations=annot,
                           letter_annotations={'phred_quality': qual})

    if not trim or is_fsa_file:
        yield record
    else:
        yield _abi_trim(record)
コード例 #23
0
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.

    Besides the sequence and qualities, this variant passes the DATA9-DATA12
    trace data sets and the PLOC1 base-call positions to ``SeqRecord``.

    Args:
        handle: File-like object supporting ``seek`` and ``read``. If it has
            a ``mode`` attribute, that mode must be made up of exactly the
            letters "r" and "b".
        alphabet: Optional alphabet for the sequence. Protein and RNA
            alphabets are rejected. When None, ``ambiguous_dna`` is used if
            the base calls contain any of "KYWMRS", else ``unambiguous_dna``.
        trim: If true, the record is passed through ``_abi_trim`` before
            being yielded.

    Yields:
        A single SeqRecord. Nothing is yielded for an empty file.

    Raises:
        ValueError: For a protein/RNA alphabet or a handle not in 'rb' mode.
        IOError: If the file does not start with the ``ABIF`` marker.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # Empty file: end the generator cleanly. Raising StopIteration here
        # would be turned into RuntimeError by PEP 479 (Python 3.7+).
        return
    if marker != _as_bytes('ABIF'):
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {
        'RUND1': '',
        'RUND2': '',
        'RUNT1': '',
        'RUNT2': '',
    }

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

    # The SMPL1 tag may be missing; fall back to a placeholder id instead of
    # failing later with an unbound local variable.
    sample_id = '<unknown id>'

    # TODO: stop iterating once every wanted tag has been extracted.
    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        key = tag_name + str(tag_number)

        # PBAS2 is base-called sequence
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # DATA9 is first processed trace data set
        elif key == 'DATA9':
            data9 = tag_data
        # DATA10 is second processed trace data set
        elif key == 'DATA10':
            data10 = tag_data
        # DATA11 is third processed trace data set
        elif key == 'DATA11':
            data11 = tag_data
        # DATA12 is fourth processed trace data set
        elif key == 'DATA12':
            data12 = tag_data
        # PLOC1 is location for each base call position
        elif key == 'PLOC1':
            ploc1 = tag_data
        # SMPL1 is sample id entered before sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            # extract sequence annotation as defined in _EXTRACT
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # use the file name as SeqRecord.name if available; a handle may have no
    # ``name`` (AttributeError) or a non-string one (TypeError)
    try:
        file_name = basename(handle.name).replace('.ab1', '')
    except (AttributeError, TypeError):
        file_name = ""

    # NOTE: seq, qual, data9-data12 and ploc1 are only bound when their tags
    # were present in the file.
    record = SeqRecord(Seq(seq, alphabet),
                       id=sample_id,
                       name=file_name,
                       description='',
                       annotations=annot,
                       data1=data9,
                       data2=data10,
                       data3=data11,
                       data4=data12,
                       pos=ploc1,
                       letter_annotations={'phred_quality': qual})

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
コード例 #24
0
ファイル: XdnaIO.py プロジェクト: wenh06/biopython
    def write_file(self, records):
        """Write exactly one record to the handle in Xdna format.

        ``records`` is any iterable of records (as the SequenceWriter
        interface requires), but Xdna is a mono-record format, so it must
        contain one record and no more; otherwise ValueError is raised.

        Features whose start or end is not an ExactPosition are dropped with
        a warning, as are any features beyond the first 255.

        Returns 1.
        """
        records = iter(records)

        try:
            record = next(records)
        except StopIteration:
            raise ValueError("Must have one sequence")

        try:
            next(records)
        except StopIteration:
            pass
        else:
            raise ValueError("More than one sequence found")

        self._has_truncated_strings = False

        # Map the base alphabet to the sequence type byte; 0 if none match.
        base_alphabet = Alphabet._get_base_alphabet(record.seq.alphabet)
        seqtype = 0
        type_codes = (
            (Alphabet.DNAAlphabet, 1),
            (Alphabet.RNAAlphabet, 3),
            (Alphabet.ProteinAlphabet, 4),
        )
        for alphabet_class, code in type_codes:
            if isinstance(base_alphabet, alphabet_class):
                seqtype = code
                break

        is_circular = record.annotations.get("topology", "linear") == "circular"
        topology = 1 if is_circular else 0

        # The comment field carries both id and description; do not repeat
        # the id when the description already begins with it.
        comment = (
            record.description
            if record.description.startswith(record.id)
            else f"{record.id} {record.description}"
        )

        # Header
        header = pack(
            ">BBB25xII60xI11xB",
            0,  # version
            seqtype,
            topology,
            len(record),
            0,  # negative length
            len(comment),
            255,  # end of header
        )
        self.handle.write(header)

        # Sequence followed by the comment
        self.handle.write(str(record.seq).encode("ASCII"))
        self.handle.write(comment.encode("ASCII"))

        self.handle.write(pack(">B", 0))  # Annotation section marker
        self._write_pstring("0")  # right-side overhang
        self._write_pstring("0")  # left-side overhang

        # Fuzzy locations cannot be represented in the Xdna format, so only
        # features with exact start and end positions are kept.
        features = []
        for candidate in record.features:
            if (type(candidate.location.start) == ExactPosition
                    and type(candidate.location.end) == ExactPosition):
                features.append(candidate)
        drop = len(record.features) - len(features)
        if drop > 0:
            warnings.warn(f"Dropping {drop} features with fuzzy locations",
                          BiopythonWarning)

        # The feature count is stored on a single byte, hence the 255 cap.
        if len(features) > 255:
            drop = len(features) - 255
            warnings.warn(f"Too many features, dropping the last {drop}",
                          BiopythonWarning)
            features = features[:255]

        self.handle.write(pack(">B", len(features)))
        for feature in features:
            self._write_pstring(feature.qualifiers.get("label", [""])[0])

            # One name="value" entry per qualifier value, joined by carriage
            # returns; label and translation are left out.
            entries = []
            for qname in feature.qualifiers:
                if qname in ("label", "translation"):
                    continue
                for val in feature.qualifiers[qname]:
                    entries.append('%s="%s"' % (qname, val))
            self._write_pstring("\x0D".join(entries))

            self._write_pstring(feature.type)

            location = feature.location
            first = location.start.position + 1  # 1-based coordinates
            last = location.end.position
            if location.strand == -1:
                # Reverse-strand features are written end first.
                first, last = last, first
                strand_flag = 0
            else:
                strand_flag = 1
            self._write_pstring(str(first))
            self._write_pstring(str(last))

            self.handle.write(pack(">BBBB", strand_flag, 1, 0, 1))
            self._write_pstring("127,127,127")

        if self._has_truncated_strings:
            warnings.warn("Some annotations were truncated to 255 characters",
                          BiopythonWarning)

        return 1
コード例 #25
0
ファイル: AbiIO.py プロジェクト: g-zvi/biopython
def AbiIterator(handle, alphabet=None, trim=False):
    """Return an iterator over the record stored in an ABI file.

    Protein and RNA alphabets are rejected, and a handle exposing a ``mode``
    attribute must be opened in 'rb' mode. An empty file yields nothing; a
    file that does not start with ``ABIF`` raises IOError. A file with
    neither a PBAS1 nor a PBAS2 tag is treated as an .fsa file and yields a
    record with an empty sequence; otherwise the record is built from the
    PBAS2 base calls and PCON2 qualities, and passed through ``_abi_trim``
    when ``trim`` is true.
    """
    # Only DNA is acceptable: refuse protein and RNA alphabets.
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # The mode, when known, must consist of exactly the letters r and b.
    if hasattr(handle, "mode") and set(handle.mode.lower()) != {"r", "b"}:
        raise ValueError("ABI files has to be opened in 'rb' mode.")

    # Validate the four-byte file marker.
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # An empty file simply produces no records.
        return
    if marker != b"ABIF":
        raise IOError("File should start ABIF, not %r" % marker)

    # Run date/time tags are collected here and merged into two annotations
    # once every tag has been read.
    times = dict.fromkeys(("RUND1", "RUND2", "RUNT1", "RUNT2"), "")

    # Every annotation named in _EXTRACT starts out as None.
    annot = dict.fromkeys(_EXTRACT.values())

    # Read and decode the fixed-size header.
    header_size = struct.calcsize(_HEADFMT)
    header = struct.unpack(_HEADFMT, handle.read(header_size))

    # SMPL1 normally supplies the sample id, but it may be missing.
    sample_id = "<unknown id>"

    raw = {}
    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        key = tag_name + str(tag_number)
        raw[key] = tag_data

        if key == "PBAS2":
            # base-called sequence, only available in 3530
            seq = _bytes_to_string(tag_data)
            if alphabet is None:
                if set(seq).intersection("KYWMRS"):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        elif key == "PCON2":
            # quality values of the base-called sequence
            qual = list(map(ord, _bytes_to_string(tag_data)))
        elif key == "SMPL1":
            # sample id entered before the sequencing run
            sample_id = _get_string_tag(tag_data)
        elif key in times:
            times[key] = tag_data
        elif key in _EXTRACT:
            annot[_EXTRACT[key]] = tag_data

    annot["run_start"] = "%s %s" % (times["RUND1"], times["RUNT1"])
    annot["run_finish"] = "%s %s" % (times["RUND2"], times["RUNT2"])

    # Expose every raw tag for advanced users.
    annot["abif_raw"] = raw

    # No base calls at all means this is an .fsa file.
    is_fsa_file = "PBAS1" not in raw and "PBAS2" not in raw

    # The record name is the file name minus its extension, when the handle
    # has a name at all.
    extension = ".fsa" if is_fsa_file else ".ab1"
    try:
        file_name = basename(handle.name).replace(extension, "")
    except AttributeError:
        file_name = ""

    if is_fsa_file:
        sample_id = _get_string_tag(raw.get("LIMS1"), sample_id)
        description = _get_string_tag(raw.get("CTID1"), "<unknown description>")
        record = SeqRecord(
            Seq(""),
            id=sample_id,
            name=file_name,
            description=description,
            annotations=annot,
        )
    else:
        record = SeqRecord(
            Seq(seq, alphabet),
            id=sample_id,
            name=file_name,
            description="",
            annotations=annot,
            letter_annotations={"phred_quality": qual},
        )

    # Trimming only applies to sequence records, never to .fsa data.
    if trim and not is_fsa_file:
        record = _abi_trim(record)
    yield record
コード例 #26
0
def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False,
                     monoisotopic=False):
    """Calculate the molecular weight of a DNA, RNA or protein sequence.

    Only unambiguous letters are allowed. Nucleotide sequences are assumed to
    have a 5' phosphate.

        - seq: String or Biopython sequence object.
        - seq_type: The default (None) is to take the alphabet from the seq argument,
          or assume DNA if the seq argument is a string. Override this with
          a string 'DNA', 'RNA', or 'protein'.
        - double_stranded: Calculate the mass for the double stranded molecule?
        - circular: Is the molecule circular (has no ends)?
        - monoisotopic: Use the monoisotopic mass tables?

    Returns the weight as a number. Raises TypeError if seq is neither a
    string nor a Seq/MutableSeq (or its alphabet is not an Alphabet object),
    and ValueError for an unknown or contradictory seq_type, for letters
    missing from the weight table, or for a double stranded protein.

    Note that for backwards compatibility, if the seq argument is a string,
    or Seq object with a generic alphabet, and no seq_type is specified
    (i.e. left as None), then DNA is assumed. An explicit seq_type is
    honoured for strings and for Seq objects with a generic alphabet.

    >>> print("%0.2f" % molecular_weight("AGC"))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
    949.61

    However, it is better to be explicit - for example with strings:

    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
    949.61
    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
    997.61
    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
    249.29

    Or, with the sequence alphabet:

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna)))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna)))
    997.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein)))
    249.29

    Also note that contradictory sequence alphabets and seq_type will also
    give an exception:

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import generic_dna
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA"))
    Traceback (most recent call last):
      ...
    ValueError: seq_type='RNA' contradicts DNA from seq alphabet

    """
    # Rewritten by Markus Piotrowski, 2014

    # Find the alphabet type
    if isinstance(seq, (Seq, MutableSeq)):
        base_alphabet = Alphabet._get_base_alphabet(seq.alphabet)
        if isinstance(base_alphabet, Alphabet.DNAAlphabet):
            tmp_type = 'DNA'
        elif isinstance(base_alphabet, Alphabet.RNAAlphabet):
            tmp_type = 'RNA'
        elif isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            tmp_type = 'protein'
        elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein):
            tmp_type = 'protein'
            # Convert to one-letter sequence. Have to use a string for seq1
            seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet())
        elif not isinstance(base_alphabet, Alphabet.Alphabet):
            raise TypeError("%s is not a valid alphabet for mass calculations"
                            % base_alphabet)
        else:
            # A generic alphabet says nothing about the molecule, so it
            # cannot contradict an explicit seq_type; without one, fall
            # back to DNA for backward compatibility.
            tmp_type = seq_type or "DNA"
        if seq_type and tmp_type != seq_type:
            raise ValueError("seq_type=%r contradicts %s from seq alphabet"
                             % (seq_type, tmp_type))
        seq_type = tmp_type
    elif isinstance(seq, str):
        if seq_type is None:
            seq_type = "DNA"  # backward compatibility
    else:
        raise TypeError("Expected a string or Seq object, not seq=%r" % seq)

    seq = ''.join(str(seq).split()).upper()  # Do the minimum formatting

    if seq_type == 'DNA':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_unambiguous_dna_weights
        else:
            weight_table = IUPACData.unambiguous_dna_weights
    elif seq_type == 'RNA':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_unambiguous_rna_weights
        else:
            weight_table = IUPACData.unambiguous_rna_weights
    elif seq_type == 'protein':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_protein_weights
        else:
            weight_table = IUPACData.protein_weights
    else:
        raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r"
                         % seq_type)

    if monoisotopic:
        water = 18.010565
    else:
        water = 18.0153

    def _strand_weight(strand):
        """Sum the residue weights of one strand, minus one water per bond."""
        total = sum(weight_table[x] for x in strand) \
            - (len(strand) - 1) * water
        if circular:
            # Closing the ring forms one more bond, releasing one more water.
            total -= water
        return total

    try:
        weight = _strand_weight(seq)
    except KeyError as e:
        raise ValueError('%s is not a valid unambiguous letter for %s'
                         % (e, seq_type))

    if double_stranded:
        if seq_type == 'protein':
            raise ValueError('double-stranded proteins await their discovery')
        # Give the temporary Seq an explicit alphabet: with a generic one,
        # an RNA sequence without any U would be complemented as DNA and
        # yield T, which is missing from the RNA weight table.
        if seq_type == 'RNA':
            strand_alphabet = Alphabet.generic_rna
        else:
            strand_alphabet = Alphabet.generic_dna
        weight += _strand_weight(str(Seq(seq, strand_alphabet).complement()))

    return weight
コード例 #27
0
    def _write_the_first_line(self, record):
        """Write the LOCUS line."""

        locus = record.name
        if not locus or locus == "<unknown name>":
            locus = record.id
        if not locus or locus == "<unknown id>":
            locus = self._get_annotation_str(record,
                                             "accession",
                                             just_first=True)
        if len(locus) > 16:
            if len(locus) + 1 + len(str(len(record))) > 28:
                # Locus name and record length to long to squeeze in.
                raise ValueError("Locus identifier %r is too long" % locus)
            else:
                warnings.warn(
                    "Stealing space from length field to allow long name in LOCUS line",
                    BiopythonWarning)
        if len(locus.split()) > 1:
            # locus could be unicode, and u'with space' versus 'with space'
            # causes trouble with doctest or print-and-compare tests, so
            tmp = repr(locus)
            if tmp.startswith("u'") and tmp.endswith("'"):
                tmp = tmp[1:]
            raise ValueError("Invalid whitespace in %s for LOCUS line" % tmp)
        if len(record) > 99999999999:
            # Currently GenBank only officially support up to 350000, but
            # the length field can take eleven digits
            raise ValueError("Sequence too long!")

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        elif isinstance(a, Alphabet.ProteinAlphabet):
            units = "aa"
        elif isinstance(a, Alphabet.NucleotideAlphabet):
            units = "bp"
        else:
            # Must be something like NucleotideAlphabet or
            # just the generic Alphabet (default for fasta files)
            raise ValueError("Need a Nucleotide or Protein alphabet")

        # Get the molecule type
        # TODO - record this explicitly in the parser?
        if isinstance(a, Alphabet.ProteinAlphabet):
            mol_type = ""
        elif isinstance(a, Alphabet.DNAAlphabet):
            mol_type = "DNA"
        elif isinstance(a, Alphabet.RNAAlphabet):
            mol_type = "RNA"
        else:
            # Must be something like NucleotideAlphabet or
            # just the generic Alphabet (default for fasta files)
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        topology = self._get_topology(record)

        division = self._get_data_division(record)

        name_length = str(len(record)).rjust(28)
        name_length = locus + name_length[len(locus):]
        assert len(name_length) == 28, name_length
        assert " " in name_length, name_length

        assert len(units) == 2
        assert len(division) == 3
        line = "LOCUS       %s %s    %s %s %s %s\n" \
            % (name_length,
               units,
               mol_type.ljust(7),
               topology,
               division,
               self._get_date(record))
        assert len(line) == 79 + 1, repr(line)  # plus one for new line

        # We're bending the rules to allow an identifier over 16 characters
        # if we can steal spaces from the length field:
        # assert line[12:28].rstrip() == locus, \
        #     'LOCUS line does not contain the locus at the expected position:\n' + line
        # assert line[28:29] == " "
        # assert line[29:40].lstrip() == str(len(record)), \
        #     'LOCUS line does not contain the length at the expected position:\n' + line
        assert line[12:40].split() == [locus, str(len(record))], line

        # Tests copied from Bio.GenBank.Scanner
        assert line[40:44] in [' bp ', ' aa '], \
            'LOCUS line does not contain size units at expected position:\n' + \
            line
        assert line[44:47] in ['   ', 'ss-', 'ds-', 'ms-'], \
            'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
        assert line[47:54].strip() == "" \
            or 'DNA' in line[47:54].strip() \
            or 'RNA' in line[47:54].strip(), \
               'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
        assert line[54:55] == ' ', \
            'LOCUS line does not contain space at position 55:\n' + line
        assert line[55:63].strip() in ['', 'linear', 'circular'], \
            'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
        assert line[63:64] == ' ', \
            'LOCUS line does not contain space at position 64:\n' + line
        assert line[67:68] == ' ', \
            'LOCUS line does not contain space at position 68:\n' + line
        assert line[70:71] == '-', \
            'LOCUS line does not contain - at position 71 in date:\n' + line
        assert line[74:75] == '-', \
            'LOCUS line does not contain - at position 75 in date:\n' + line

        self.handle.write(line)
コード例 #28
0
ファイル: AlignInfo.py プロジェクト: JulianNymark/blendergame
    def information_content(self, start=0,
                            end=None,
                            e_freq_table=None, log_base=2,
                            chars_to_ignore=None):
        """Calculate the information content for each residue along an alignment.

        Arguments:
            - start, end - The starting and ending points to calculate the
              information content. These points should be relative to the first
              sequence in the alignment, starting at zero (ie. even if the 'real'
              first position in the seq is 203 in the initial sequence, for
              the info content, we need to use zero). This defaults to the entire
              length of the first sequence.
            - e_freq_table - A FreqTable object specifying the expected frequencies
              for each letter in the alphabet we are using (e.g. {'G' : 0.4,
              'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
              included, since these should not have expected frequencies.
            - log_base - The base of the logarithm to use in calculating the
              information content. This defaults to 2 so the info is in bits.
            - chars_to_ignore - A listing of characters which should be ignored
              in calculating the info content. Defaults to no characters.

        Returns:
            - A number representing the info content for the specified region.

        Raises ValueError if start/end fall outside the first sequence, if
        e_freq_table is given but is not a FreqTable, or if no e_freq_table
        is given and the alignment alphabet is neither nucleotide nor protein.

        As a side effect, the per-column values are stored in self.ic_vector.

        Please see the Biopython manual for more information on how information
        content is calculated.
        """
        if chars_to_ignore is None:
            chars_to_ignore = []

        seq_length = len(self.alignment._records[0].seq)

        # if no end was specified, then we default to the end of the sequence
        if end is None:
            end = seq_length

        if start < 0 or end > seq_length:
            raise ValueError("Start (%s) and end (%s) are not in the "
                             "range %s to %s"
                             % (start, end, 0, seq_length))

        # determine random expected frequencies, if necessary
        random_expected = None
        if not e_freq_table:
            # TODO - What about ambiguous alphabets?
            base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
            if isinstance(base_alpha, Alphabet.ProteinAlphabet):
                random_expected = Protein20Random
            elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
                random_expected = Nucleotide4Random
            else:
                errstr = "Error in alphabet: not Nucleotide or Protein, "
                errstr += "supply expected frequencies"
                raise ValueError(errstr)
        elif not isinstance(e_freq_table, FreqTable.FreqTable):
            raise ValueError("e_freq_table should be a FreqTable object")

        # determine all of the letters we have to deal with
        all_letters = self._get_all_letters()
        for char in chars_to_ignore:
            all_letters = all_letters.replace(char, '')

        # Score every column first; self.ic_vector is only touched once all
        # columns have been computed successfully.
        info_content = {}
        for residue_num in range(start, end):
            freq_dict = self._get_letter_freqs(residue_num,
                                               self.alignment._records,
                                               all_letters, chars_to_ignore)
            column_score = self._get_column_info_content(freq_dict,
                                                         e_freq_table,
                                                         log_base,
                                                         random_expected)
            info_content[residue_num] = column_score

        # sum up the score
        total_info = sum(info_content.values())
        # fill in the ic_vector member: holds IC for each column
        for i in info_content:
            self.ic_vector[i] = info_content[i]
        return total_info
コード例 #29
0
ファイル: InsdcIO.py プロジェクト: GJOHNSON2003/biopython
    def _write_the_first_line(self, record):
        """Write the LOCUS line."""

        locus = record.name
        if not locus or locus == "<unknown name>":
            locus = record.id
        if not locus or locus == "<unknown id>":
            locus = self._get_annotation_str(
                record, "accession", just_first=True)
        if len(locus) > 16:
            raise ValueError("Locus identifier %r is too long" % str(locus))

        if len(record) > 99999999999:
            # Currently GenBank only officially support up to 350000, but
            # the length field can take eleven digits
            raise ValueError("Sequence too long!")

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        elif isinstance(a, Alphabet.ProteinAlphabet):
            units = "aa"
        elif isinstance(a, Alphabet.NucleotideAlphabet):
            units = "bp"
        else:
            # Must be something like NucleotideAlphabet or
            # just the generic Alphabet (default for fasta files)
            raise ValueError("Need a Nucleotide or Protein alphabet")

        # Get the molecule type
        # TODO - record this explicitly in the parser?
        if isinstance(a, Alphabet.ProteinAlphabet):
            mol_type = ""
        elif isinstance(a, Alphabet.DNAAlphabet):
            mol_type = "DNA"
        elif isinstance(a, Alphabet.RNAAlphabet):
            mol_type = "RNA"
        else:
            # Must be something like NucleotideAlphabet or
            # just the generic Alphabet (default for fasta files)
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        division = self._get_data_division(record)

        assert len(units) == 2
        assert len(division) == 3
        # TODO - date
        # TODO - mol_type
        line = "LOCUS       %s %s %s    %s           %s %s\n" \
            % (locus.ljust(16),
               str(len(record)).rjust(11),
               units,
               mol_type.ljust(6),
               division,
               self._get_date(record))
        assert len(line) == 79 + 1, repr(line)  # plus one for new line

        assert line[12:28].rstrip() == locus, \
            'LOCUS line does not contain the locus at the expected position:\n' + line
        assert line[28:29] == " "
        assert line[29:40].lstrip() == str(len(record)), \
            'LOCUS line does not contain the length at the expected position:\n' + line

        # Tests copied from Bio.GenBank.Scanner
        assert line[40:44] in [' bp ', ' aa '], \
            'LOCUS line does not contain size units at expected position:\n' + \
            line
        assert line[44:47] in ['   ', 'ss-', 'ds-', 'ms-'], \
            'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
        assert line[47:54].strip() == "" \
            or 'DNA' in line[47:54].strip() \
            or 'RNA' in line[47:54].strip(), \
               'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
        assert line[54:55] == ' ', \
            'LOCUS line does not contain space at position 55:\n' + line
        assert line[55:63].strip() in ['', 'linear', 'circular'], \
            'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
        assert line[63:64] == ' ', \
            'LOCUS line does not contain space at position 64:\n' + line
        assert line[67:68] == ' ', \
            'LOCUS line does not contain space at position 68:\n' + line
        assert line[70:71] == '-', \
            'LOCUS line does not contain - at position 71 in date:\n' + line
        assert line[74:75] == '-', \
            'LOCUS line does not contain - at position 75 in date:\n' + line

        self.handle.write(line)
コード例 #30
0
    def _write_the_first_lines(self, record):
        """Write the ID and AC lines."""
        if "." in record.id and record.id.rsplit(".", 1)[1].isdigit():
            version = "SV " + record.id.rsplit(".", 1)[1]
            accession = self._get_annotation_str(record, "accession",
                                                 record.id.rsplit(".", 1)[0],
                                                 just_first=True)
        else :
            version = "XXX"
            accession = self._get_annotation_str(record, "accession",
                                                 record.id,
                                                 just_first=True)
        
        if ";" in accession :
            raise ValueError("Cannot have semi-colon in EMBL accession, %s" \
                             % repr(accession))
        if " " in accession :
            #This is out of practicallity... might it be allowed?
            raise ValueError("Cannot have spaces in EMBL accession, %s" \
                             % repr(accession))

        #Get the molecule type
        #TODO - record this explicitly in the parser?
        #Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        elif not isinstance(a, Alphabet.NucleotideAlphabet):
            raise ValueError("Need a Nucleotide alphabet")
        elif isinstance(a, Alphabet.DNAAlphabet):
            mol_type = "DNA"
        elif isinstance(a, Alphabet.RNAAlphabet):
            mol_type = "RNA"
        else:
            #Must be something like NucleotideAlphabet
            raise ValueError("Need a DNA or RNA alphabet")

        #Get the topology -- circular or linear
        if 'topology' in record.annotations:
            topology = record.annotations['topology']
            if topology not in ['linear', 'circular']:
                raise ValueError("Cannot have '%s' for topology in EMBL ID line, must be 'circular' or 'linear'" % topology)
        else:
            topology = 'linear' # default topology

        #Get the taxonomy division
        division = self._get_data_division(record)
        
        #Get Data class
        data_class = self._get_data_class(record)

        #Full ID line
        #ID   <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.
        #1. Primary accession number
        #2. Sequence version number
        #3. Topology: 'circular' or 'linear'
        #4. Molecule type (see note 1 below)
        #5. Data class (see section 3.1)
        #6. Taxonomic division (see section 3.2)
        #7. Sequence length (see note 2 below)
        #All tokens that are non-mandatory can be represented by a universal placeholder "XXX", 
        #so in the ID line in the new submission can look as follows:
        #ID   XXX; XXX; linear; XXX; XXX; XXX; 500 BP.
        handle = self.handle
        self._write_single_line("ID", "%s; %s; %s; %s; %s; %s; %i BP." \
                                % (accession, version, topology, mol_type, data_class, division, len(record)))
        handle.write("XX\n")
        self._write_single_line("AC", accession+";")
        handle.write("XX\n")
コード例 #31
0
ファイル: test_seq.py プロジェクト: frankkl/biopython
            assert (isinstance(a,str) or isinstance(b,str)), \
                   "Nucleotide+Protein addition should fail!"
        except TypeError :
            pass

###########################################################################
print
print "Testing Seq string methods"
print "=========================="
for a in dna + rna + nuc + protein :
    if not isinstance(a, Seq.Seq) : continue
    assert a.strip().tostring() == a.tostring().strip()
    assert a.lstrip().tostring() == a.tostring().lstrip()
    assert a.rstrip().tostring() == a.tostring().rstrip()
    test_chars = ["-", Seq.Seq("-"), Seq.Seq("*"), "-X@"]
    alpha = Alphabet._get_base_alphabet(a.alphabet)
    if isinstance(alpha, Alphabet.DNAAlphabet) :
        test_chars.append(Seq.Seq("A", IUPAC.ambiguous_dna))
    if isinstance(alpha, Alphabet.RNAAlphabet) :
        test_chars.append(Seq.Seq("A", IUPAC.ambiguous_rna))
    if isinstance(alpha, Alphabet.NucleotideAlphabet) :
        test_chars.append(Seq.Seq("A", Alphabet.generic_nucleotide))
    if isinstance(alpha, Alphabet.ProteinAlphabet) :
        test_chars.append(Seq.Seq("K", Alphabet.generic_protein))
        test_chars.append(Seq.Seq("K-", Alphabet.Gapped(Alphabet.generic_protein,"-")))
        test_chars.append(Seq.Seq("K@", Alphabet.Gapped(IUPAC.protein,"@")))
        #Setup a clashing alphabet sequence
        b = Seq.Seq("-", Alphabet.generic_nucleotide)
    else :
        b = Seq.Seq("-", Alphabet.generic_protein)
    try :
コード例 #32
0
ファイル: AbiIO.py プロジェクト: DunbrackLab/biopython
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterate over an ABI file, yielding its record as a SeqRecord.

    Arguments:
        - handle - the ABI file; it is rewound with seek(0) before reading,
          and if it has a ``mode`` attribute that must be 'rb'.
        - alphabet - optional alphabet for the sequence. If None, an
          ambiguous or unambiguous DNA alphabet is picked from the
          base-called sequence. Protein and RNA alphabets are rejected.
        - trim - if true, the record is passed through _abi_trim before
          being yielded.

    An empty file yields nothing. Raises ValueError for a protein/RNA
    alphabet or a wrong handle mode, and IOError if the file does not start
    with the ABIF marker.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # Handle an empty file gracefully. This must be a plain return:
        # under PEP 479 a StopIteration raised inside a generator is turned
        # into a RuntimeError instead of ending the iteration.
        return
    if marker != _as_bytes('ABIF'):
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': '', }

    # initialize annotations: every extracted annotation starts out as None
    annot = dict.fromkeys(_EXTRACT.values())

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT,
                           handle.read(struct.calcsize(_HEADFMT)))

    # NOTE: seq, qual and sample_id are only bound when the PBAS2, PCON2
    # and SMPL1 tags are present in the file.
    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        key = tag_name + str(tag_number)

        # PBAS2 is base-called sequence
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        elif key in _EXTRACT:
            # extract sequence annotation as defined in _EXTRACT
            annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # use the file name as SeqRecord.name if available; handles without a
    # usable string ``name`` simply get an empty name
    try:
        file_name = basename(handle.name).replace('.ab1', '')
    except (AttributeError, TypeError):
        file_name = ""

    record = SeqRecord(Seq(seq, alphabet),
                       id=sample_id, name=file_name,
                       description='',
                       annotations=annot,
                       letter_annotations={'phred_quality': qual})

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
コード例 #33
0
ファイル: test_SeqIO.py プロジェクト: BIGLabHYU/biopython
    # Check Bio.SeqIO.read(...)
    if t_count == 1:
        record = SeqIO.read(t_filename, format=t_format)
        assert isinstance(record, SeqRecord)
    else:
        try:
            record = SeqIO.read(t_filename, t_format)
            assert False, "Bio.SeqIO.read(...) should have failed"
        except ValueError:
            # Expected to fail
            pass

    # Check alphabets
    for record in records:
        base_alpha = Alphabet._get_base_alphabet(record.seq.alphabet)
        if isinstance(base_alpha, Alphabet.SingleLetterAlphabet):
            if t_format in no_alpha_formats:
                # Too harsh?
                assert base_alpha == Alphabet.single_letter_alphabet
        else:
            base_alpha = None
    if base_alpha is None:
        good = []
        bad = []
        given_alpha = None
    elif isinstance(base_alpha, Alphabet.ProteinAlphabet):
        good = protein_alphas
        bad = dna_alphas + rna_alphas + nucleotide_alphas
    elif isinstance(base_alpha, Alphabet.RNAAlphabet):
        good = nucleotide_alphas + rna_alphas
コード例 #34
0
    def _write_the_first_line(self, record):
        """Write the GenBank LOCUS line for ``record``.

        The locus identifier comes from ``record.name``, then ``record.id``,
        then the first "accession" annotation, and must be at most 16
        characters long. The data file division is read from the
        "data_file_division" annotation, with "UNK" used when it is missing
        or not one of the recognised codes. The date is currently fixed to
        01-JAN-1980.

        Raises ValueError for an over-long locus, a too long sequence, or an
        alphabet that is not DNA, RNA or protein; raises TypeError if the
        base alphabet is not an Alphabet instance.
        """

        locus = record.name
        if not locus or locus == "<unknown name>":
            locus = record.id
        if not locus or locus == "<unknown id>":
            locus = self._get_annotation_str(record, "accession", just_first=True)
        if len(locus) > 16:
            raise ValueError("Locus identifier %s is too long" % repr(locus))

        if len(record) > 99999999999:
            # Currently GenBank only officially support up to 350000, but
            # the length field can take eleven digits
            raise ValueError("Sequence too long!")

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        elif isinstance(a, Alphabet.ProteinAlphabet):
            # Protein lengths are counted in amino acids ...
            units = "aa"
        elif isinstance(a, Alphabet.NucleotideAlphabet):
            # ... and nucleotide lengths in base pairs.
            units = "bp"
        else:
            # Neither protein nor nucleotide, e.g. just the generic
            # Alphabet (default for fasta files)
            raise ValueError("Need a Nucleotide or Protein alphabet")

        # Get the molecule type
        # TODO - record this explicitly in the parser?
        if isinstance(a, Alphabet.ProteinAlphabet):
            mol_type = ""
        elif isinstance(a, Alphabet.DNAAlphabet):
            mol_type = "DNA"
        elif isinstance(a, Alphabet.RNAAlphabet):
            mol_type = "RNA"
        else:
            # A nucleotide alphabet that is neither DNA nor RNA
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        try:
            division = record.annotations["data_file_division"]
        except KeyError:
            division = "UNK"
        if division not in [
            "PRI",
            "ROD",
            "MAM",
            "VRT",
            "INV",
            "PLN",
            "BCT",
            "VRL",
            "PHG",
            "SYN",
            "UNA",
            "EST",
            "PAT",
            "STS",
            "GSS",
            "HTG",
            "HTC",
            "ENV",
        ]:
            division = "UNK"

        assert len(units) == 2
        assert len(division) == 3
        # TODO - date
        # TODO - mol_type
        line = "LOCUS       %s %s %s    %s           %s 01-JAN-1980\n" % (
            locus.ljust(16),
            str(len(record)).rjust(11),
            units,
            mol_type.ljust(6),
            division,
        )
        assert len(line) == 79 + 1, repr(line)  # plus one for new line

        assert line[12:28].rstrip() == locus, "LOCUS line does not contain the locus at the expected position:\n" + line
        assert line[28:29] == " "
        assert line[29:40].lstrip() == str(len(record)), (
            "LOCUS line does not contain the length at the expected position:\n" + line
        )

        # Tests copied from Bio.GenBank.Scanner
        mol_field = line[47:54].strip()
        assert line[40:44] in [" bp ", " aa "], "LOCUS line does not contain size units at expected position:\n" + line
        assert line[44:47] in ["   ", "ss-", "ds-", "ms-"], (
            "LOCUS line does not have valid strand type (Single stranded, ...):\n" + line
        )
        assert mol_field == "" or "DNA" in mol_field or "RNA" in mol_field, (
            "LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n" + line
        )
        assert line[54:55] == " ", "LOCUS line does not contain space at position 55:\n" + line
        assert line[55:63].strip() in ["", "linear", "circular"], (
            "LOCUS line does not contain valid entry (linear, circular, ...):\n" + line
        )
        assert line[63:64] == " ", "LOCUS line does not contain space at position 64:\n" + line
        assert line[67:68] == " ", "LOCUS line does not contain space at position 68:\n" + line
        assert line[70:71] == "-", "LOCUS line does not contain - at position 71 in date:\n" + line
        assert line[74:75] == "-", "LOCUS line does not contain - at position 75 in date:\n" + line

        self.handle.write(line)
コード例 #35
0
ファイル: test_seq.py プロジェクト: wl2wl2/biopython
            pass

###########################################################################
# Python 2 print statements: emit a blank line followed by a section banner.
print
print "Testing Seq string methods"
print "=========================="
# Iterate over the concatenation of the dna, rna, nuc and protein collections,
# which are defined outside this excerpt.
for a in dna + rna + nuc + protein:
    # Only Seq.Seq instances are checked; any other entry is skipped.
    if not isinstance(a, Seq.Seq):
        continue
    # Each string-style method on the sequence must give the same text as
    # the matching str method applied to the sequence converted to a string.
    assert str(a.strip()) == str(a).strip()
    assert str(a.lstrip()) == str(a).lstrip()
    assert str(a.rstrip()) == str(a).rstrip()
    assert str(a.lower()) == str(a).lower()
    assert str(a.upper()) == str(a).upper()
    # Start with arguments that do not depend on the alphabet: plain strings
    # and Seq objects created without an explicit alphabet.
    # NOTE(review): test_chars is not consumed within this excerpt;
    # presumably later checks use it - confirm against the full script.
    test_chars = ["-", Seq.Seq("-"), Seq.Seq("*"), "-X@"]
    # Look through any wrapper (e.g. Gapped) to the underlying base alphabet.
    alpha = Alphabet._get_base_alphabet(a.alphabet)
    # These are independent "if" tests rather than an elif chain, so more
    # than one of them can add an entry for the same sequence.
    if isinstance(alpha, Alphabet.DNAAlphabet):
        test_chars.append(Seq.Seq("A", IUPAC.ambiguous_dna))
    if isinstance(alpha, Alphabet.RNAAlphabet):
        test_chars.append(Seq.Seq("A", IUPAC.ambiguous_rna))
    if isinstance(alpha, Alphabet.NucleotideAlphabet):
        test_chars.append(Seq.Seq("A", Alphabet.generic_nucleotide))
    if isinstance(alpha, Alphabet.ProteinAlphabet):
        test_chars.append(Seq.Seq("K", Alphabet.generic_protein))
        test_chars.append(
            Seq.Seq("K-", Alphabet.Gapped(Alphabet.generic_protein, "-")))
        test_chars.append(Seq.Seq("K@", Alphabet.Gapped(IUPAC.protein, "@")))
        # Set up a clashing alphabet sequence: a protein sequence is paired
        # with a nucleotide "-" here, anything else with a protein "-" below.
        b = Seq.Seq("-", Alphabet.generic_nucleotide)
    else:
        b = Seq.Seq("-", Alphabet.generic_protein)
コード例 #36
0
ファイル: InsdcIO.py プロジェクト: Oli4/biopython
    def _write_the_first_line(self, record):
        """Write the LOCUS line."""

        locus = record.name
        if not locus or locus == "<unknown name>":
            locus = record.id
        if not locus or locus == "<unknown id>":
            locus = self._get_annotation_str(
                record, "accession", just_first=True)
        if len(locus) > 16:
            if len(locus) + 1 + len(str(len(record))) > 28:
                # Locus name and record length to long to squeeze in.
                raise ValueError("Locus identifier %r is too long" % locus)
            else:
                warnings.warn("Stealing space from length field to allow long name in LOCUS line", BiopythonWarning)
        if len(locus.split()) > 1:
            # locus could be unicode, and u'with space' versus 'with space'
            # causes trouble with doctest or print-and-compare tests, so
            tmp = repr(locus)
            if tmp.startswith("u'") and tmp.endswith("'"):
                tmp = tmp[1:]
            raise ValueError("Invalid whitespace in %s for LOCUS line" % tmp)
        if len(record) > 99999999999:
            # Currently GenBank only officially support up to 350000, but
            # the length field can take eleven digits
            raise ValueError("Sequence too long!")

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        elif isinstance(a, Alphabet.ProteinAlphabet):
            units = "aa"
        elif isinstance(a, Alphabet.NucleotideAlphabet):
            units = "bp"
        else:
            # Must be something like NucleotideAlphabet or
            # just the generic Alphabet (default for fasta files)
            raise ValueError("Need a Nucleotide or Protein alphabet")

        # Get the molecule type
        # TODO - record this explicitly in the parser?
        if isinstance(a, Alphabet.ProteinAlphabet):
            mol_type = ""
        elif isinstance(a, Alphabet.DNAAlphabet):
            mol_type = "DNA"
        elif isinstance(a, Alphabet.RNAAlphabet):
            mol_type = "RNA"
        else:
            # Must be something like NucleotideAlphabet or
            # just the generic Alphabet (default for fasta files)
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        topology = self._get_topology(record)

        division = self._get_data_division(record)

        name_length = str(len(record)).rjust(28)
        name_length = locus + name_length[len(locus):]
        assert len(name_length) == 28, name_length
        assert " " in name_length, name_length

        assert len(units) == 2
        assert len(division) == 3
        line = "LOCUS       %s %s    %s %s %s %s\n" \
            % (name_length,
               units,
               mol_type.ljust(7),
               topology,
               division,
               self._get_date(record))
        assert len(line) == 79 + 1, repr(line)  # plus one for new line

        # We're bending the rules to allow an identifier over 16 characters
        # if we can steal spaces from the length field:
        # assert line[12:28].rstrip() == locus, \
        #     'LOCUS line does not contain the locus at the expected position:\n' + line
        # assert line[28:29] == " "
        # assert line[29:40].lstrip() == str(len(record)), \
        #     'LOCUS line does not contain the length at the expected position:\n' + line
        assert line[12:40].split() == [locus, str(len(record))], line

        # Tests copied from Bio.GenBank.Scanner
        assert line[40:44] in [' bp ', ' aa '], \
            'LOCUS line does not contain size units at expected position:\n' + \
            line
        assert line[44:47] in ['   ', 'ss-', 'ds-', 'ms-'], \
            'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
        assert line[47:54].strip() == "" \
            or 'DNA' in line[47:54].strip() \
            or 'RNA' in line[47:54].strip(), \
               'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
        assert line[54:55] == ' ', \
            'LOCUS line does not contain space at position 55:\n' + line
        assert line[55:63].strip() in ['', 'linear', 'circular'], \
            'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
        assert line[63:64] == ' ', \
            'LOCUS line does not contain space at position 64:\n' + line
        assert line[67:68] == ' ', \
            'LOCUS line does not contain space at position 68:\n' + line
        assert line[70:71] == '-', \
            'LOCUS line does not contain - at position 71 in date:\n' + line
        assert line[74:75] == '-', \
            'LOCUS line does not contain - at position 75 in date:\n' + line

        self.handle.write(line)