Ejemplo n.º 1
0
    def test_read_fasta(self):
        """Read the example FASTA file as a gapped DNA alignment and check it.

        Checks the number of rows, every record's description and sequence,
        the alignment length, the dumb consensus computed with a 0.6
        threshold, and the text produced by ``str(alignment)``.
        """
        fasta_path = os.path.join(os.curdir, "Quality", "example.fasta")
        gapped_dna = Alphabet.Gapped(IUPAC.ambiguous_dna)
        alignment = AlignIO.read(fasta_path, "fasta", alphabet=gapped_dna)

        # (description, sequence) for each row, in file order.
        expected_rows = (
            ("EAS54_6_R1_2_1_413_324", "CCCTTCTTGTCTTCAGCGTTTCTCC"),
            ("EAS54_6_R1_2_1_540_792", "TTGGCAGGCCAAGGCCGATGGATCA"),
            ("EAS54_6_R1_2_1_443_348", "GTTGCTTCTGGCGTGGGTGGGGGGG"),
        )
        self.assertEqual(len(alignment), 3)
        for row, (description, letters) in enumerate(expected_rows):
            seq_record = alignment[row]
            self.assertEqual(seq_record.description, description)
            self.assertEqual(seq_record.seq, letters)
        self.assertEqual(alignment.get_alignment_length(), 25)

        summary = AlignInfo.SummaryInfo(alignment)
        consensus = summary.dumb_consensus(ambiguous="N", threshold=0.6)
        self.assertIsInstance(consensus, Seq)
        self.assertEqual(consensus, "NTNGCNTNNNNNGNNGGNTGGNTCN")

        expected_text = (
            "Alignment with 3 rows and 25 columns\n"
            "CCCTTCTTGTCTTCAGCGTTTCTCC EAS54_6_R1_2_1_413_324\n"
            "TTGGCAGGCCAAGGCCGATGGATCA EAS54_6_R1_2_1_540_792\n"
            "GTTGCTTCTGGCGTGGGTGGGGGGG EAS54_6_R1_2_1_443_348"
        )
        self.assertEqual(str(alignment), expected_text)
Ejemplo n.º 2
0
def remove_gapped_positions_codon(aln_file, output=None, in_format="fasta"):
    """
    Remove codon positions (blocks of 3 columns) that are gaps in every row.

    The alignment is walked three columns at a time; a block is dropped only
    when every character of every sequence in that block is "-".  The result
    is always written in FASTA format.

    :param aln_file: input alignment file path
    :param output: output file path (default: None - overwrite the input file)
    :param in_format: input format (default: fasta)
    :return: output file path
    :raises ValueError: if every codon position is fully gapped, so nothing
        would be left to write (raised before the output file is touched)
    """
    aln_file = check_filename(aln_file)
    if output is None:
        output = aln_file
    else:
        output = check_filename(output, Truefile=False)
    aln = AlignIO.read(aln_file, in_format,
                       alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))

    new_aln = None
    for start in range(0, len(aln[0]), 3):
        codon_block = aln[:, start:start + 3]
        # Inspect every row of the block.  Indexing the block with a plain
        # integer yields a *row* (one sequence), not a column, so all rows
        # must be scanned to decide whether the block is entirely gaps.
        fully_gapped = all(letter == "-"
                           for row in codon_block
                           for letter in str(row.seq))
        if fully_gapped:
            continue
        new_aln = codon_block if new_aln is None else new_aln + codon_block

    if new_aln is None:
        # Fail before opening the output, which may be the input file itself.
        raise ValueError(
            "All codon positions in %s are fully gapped" % aln_file)

    AlignIO.write(new_aln, output, "fasta")
    return output
Ejemplo n.º 3
0
def replace_stop_codons_with_gapps(aln_file, in_format="fasta", output=None):
    """
    Replace every ungapped stop codon (TAA, TAG, TGA) in an alignment by "---".

    Sequences are scanned in steps of three letters.  Codons containing a gap
    are copied unchanged.  The alignment is written in FASTA format and the
    number of replaced stop codons is printed; a stop codon occupying the last
    three letters of a sequence is replaced but not counted.

    :param aln_file: input alignment file path
    :param in_format: input format (default: fasta)
    :param output: output file path (default: None - overwrite the input file)
    """
    aln_file = check_filename(aln_file)
    output = aln_file if output is None else check_filename(output, Truefile=False)

    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    aln = AlignIO.read(aln_file, in_format, alphabet=gapped_dna)

    stop_codons = ["TAA", "TAG", "TGA"]
    stop_codon_count = 0
    for seq in aln:
        total_length = len(seq.seq)
        new_seq = ""
        for i in range(0, total_length, 3):
            codon = seq.seq[i:i + 3]
            if "-" in codon or codon not in stop_codons:
                # Gapped codons and ordinary codons are kept as they are.
                new_seq += codon
                continue
            new_seq += "---"
            if total_length - i != 3:
                # Only internal stop codons are counted, not the final one.
                stop_codon_count += 1
        seq.seq = new_seq

    SeqIO.write(aln, output, "fasta")
    print("%i replacments of stop codons to ---" % stop_codon_count)
Ejemplo n.º 4
0
    def _write_seq(self, record):
        """Emit the record's sequence as a single SeqXML sequence element.

        The element name is chosen from the base alphabet of the sequence:
        "RNAseq", "DNAseq" or "AAseq".

        Raises TypeError for an UnknownSeq, and ValueError for an empty
        sequence or an alphabet that is not DNA, RNA or protein.
        """
        if isinstance(record.seq, UnknownSeq):
            raise TypeError(
                "Sequence type is UnknownSeq but SeqXML requires sequence")

        text = str(record.seq)
        if not text:
            raise ValueError("The sequence length should be greater than 0")

        # Look underneath any Gapped or StopCodon encoding for the real type.
        base = Alphabet._get_base_alphabet(record.seq.alphabet)
        element_names = (
            (Alphabet.RNAAlphabet, "RNAseq"),
            (Alphabet.DNAAlphabet, "DNAseq"),
            (Alphabet.ProteinAlphabet, "AAseq"),
        )
        for alphabet_class, seqElem in element_names:
            if isinstance(base, alphabet_class):
                break
        else:
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        writer = self.xml_generator
        writer.startElement(seqElem, AttributesImpl({}))
        writer.characters(text)
        writer.endElement(seqElem)
Ejemplo n.º 5
0
 def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(),
              return_raw_comments=False):
     """Store the entry element and the parsing options on the instance.

     ``elem`` is kept as ``self.entry``; ``alphabet`` and
     ``return_raw_comments`` are stored under their own names.
     """
     self.return_raw_comments = return_raw_comments
     self.alphabet = alphabet
     self.entry = elem
Ejemplo n.º 6
0
def test_generate():
    """Check the 'BB1' entry produced by ``ffparsergmx.generate``.

    A protein alphabet with letters 'BB1' and 'BB2' is passed in, angle
    inference is switched on, and the vertices plus the bond, angle and
    improper edge values of 'BB1' are compared with reference numbers.
    """
    from Bio import Alphabet

    alphabet = Alphabet.ProteinAlphabet()
    alphabet.size = 3
    alphabet.letters = ['BB1', 'BB2']

    result = ffparsergmx.generate('test', [alphabet], True,
                                  topPath=testFilePath)

    bb1 = result['BB1']
    assert bb1['vertices'] == [('A1', 'A'), ('A2', 'A'), ('A3', 'A'),
                               ('A4', 'A')]

    bonds = bb1['bondEdges']
    assert bonds[('A1', 'A2')] == approx(1.2)
    assert bonds[('A2', 'A3')] == approx(1.0)
    assert bonds[('A3', 'A4')] == approx(1.1)

    angles = bb1['angleEdges']
    assert angles[('A1', 'A3')] == approx(1.90787884028338913, rel=1e-5)
    assert angles[('A2', 'A4')] == approx(1.7719368430701863, rel=1e-5)

    assert bb1['improperEdges']['A1', 'A4'] == approx(2.065313144262336)
Ejemplo n.º 7
0
    def _write_seq(self, record):
        """Write the sequence as an RNAseq, DNAseq or AAseq element.

        SeqXML needs a concrete sequence with a DNA, RNA or protein alphabet:
        an UnknownSeq raises TypeError, while an empty sequence or any other
        alphabet raises ValueError.
        """
        if isinstance(record.seq, UnknownSeq):
            raise TypeError("Sequence type is UnknownSeq but SeqXML requires sequence")

        letters = str(record.seq)
        if len(letters) <= 0:
            raise ValueError("The sequence length should be greater than 0")

        # Strip any Gapped or StopCodon wrapper to reach the base alphabet.
        alpha = Alphabet._get_base_alphabet(record.seq.alphabet)
        seqElem = None
        for alphabet_class, name in ((Alphabet.RNAAlphabet, "RNAseq"),
                                     (Alphabet.DNAAlphabet, "DNAseq"),
                                     (Alphabet.ProteinAlphabet, "AAseq")):
            if isinstance(alpha, alphabet_class):
                seqElem = name
                break
        if seqElem is None:
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        generator = self.xml_generator
        generator.startElement(seqElem, AttributesImpl({}))
        generator.characters(letters)
        generator.endElement(seqElem)
Ejemplo n.º 8
0
def main():
    """Split a FASTA file into forward-oriented and unknown sequences.

    Reads a tab-separated vsearch alignment file and keeps, for every row
    whose second column is not '*', a mapping from the first column (matched
    against sequence ids) to the third column (compared with '-' to decide on
    reverse-complementing; presumably the strand - confirm against the
    vsearch output format in use).

    Sequences that have an entry in that mapping and pass the ambiguous-DNA
    alphabet check are written to ``--out`` (standard output by default),
    reverse-complemented when the mapped value is '-'.  All other sequences
    are written to ``--unknowns``.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    p.add_argument('vsearch', help='vsearch alignments')
    p.add_argument('fasta', help='vsearch fasta file')
    p.add_argument(
        '--unknowns',
        required=True,  # mimics ``taxit update_taxids``
        metavar='fasta',
        help=('fasta format output of sequences not '
              'aligned or with invalid sequence characters'))
    p.add_argument('--out',
                   default=sys.stdout,
                   metavar='fasta',
                   help='fasta output of sequences in forward orientation')

    args = p.parse_args()

    # Close the alignment file deterministically and strip line endings so
    # that a value in the last column compares equal to '-' as intended.
    with open(args.vsearch) as alignments:
        rows = (line.rstrip('\r\n').split('\t')
                for line in alignments if line.strip())
        vsearch = {row[0]: row[2] for row in rows if row[1] != '*'}

    # The default for --out is the sys.stdout object itself, which cannot be
    # passed to open(); only open (and later close) a real path.
    to_stdout = args.out is sys.stdout
    out = sys.stdout if to_stdout else open(args.out, 'w')
    try:
        with open(args.unknowns, 'w') as unknowns:
            seqs = SeqIO.parse(args.fasta, 'fasta',
                               Alphabet.IUPAC.ambiguous_dna)
            for s in seqs:
                if s.id in vsearch and Alphabet._verify_alphabet(s.seq):
                    if vsearch[s.id] == '-':
                        s.seq = s.seq.reverse_complement()
                    out.write('>{}\n{}\n'.format(s.description, s.seq))
                else:
                    unknowns.write('>{}\n{}\n'.format(s.description, s.seq))
    finally:
        if not to_stdout:
            out.close()
Ejemplo n.º 9
0
def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
    """Generator function to parse UniProt XML as SeqRecord objects.

    Walks the XML incrementally and, each time an ``entry`` element ends,
    yields the result of parsing that element; the element is cleared
    afterwards.

    ``handle`` may be an object with a ``read`` attribute or a string holding
    the XML itself.  ``return_raw_comments`` is passed on to the parser;
    when True, comment fields are returned as complete XML to allow further
    processing.

    Raises ValueError for a nucleotide alphabet (also when wrapped in a
    Gapped alphabet), Exception for an unusable handle, and
    MissingExternalDependencyError when no ElementTree module is available.
    """
    nucleotide = Alphabet.NucleotideAlphabet
    if isinstance(alphabet, nucleotide) or (
            isinstance(alphabet, Alphabet.Gapped)
            and isinstance(alphabet.alphabet, nucleotide)):
        raise ValueError("Wrong alphabet %r" % alphabet)

    if not hasattr(handle, "read"):
        if not isinstance(handle, str):
            raise Exception('An XML-containing handler or an XML string must be passed')
        # A plain string is treated as the XML document itself.
        handle = StringIO(handle)

    if ElementTree is None:
        from Bio import MissingExternalDependencyError
        raise MissingExternalDependencyError(
                "No ElementTree module was found. "
                "Use Python 2.5+, lxml or elementtree if you "
                "want to use Bio.SeqIO.UniprotIO.")

    entry_tag = NS + "entry"
    for event, elem in ElementTree.iterparse(handle, events=("start", "end")):
        if event != "end" or elem.tag != entry_tag:
            continue
        entry_parser = Parser(elem, alphabet=alphabet,
                              return_raw_comments=return_raw_comments)
        yield entry_parser.parse()
        elem.clear()
def printMSA(MSA):
    """Pretty-print an MSA on the terminal, one colour style per symbol.

    Every supported symbol gets its own style: the first eight use a plain
    foreground colour, the remaining ones a distinct foreground/background
    pair.  Each sequence is printed on its own line.

    Args:
        MSA: an iterable of aligned string sequences

    Raises:
        ValueError: if a sequence contains a character that is not one of
            the supported symbols.
    """
    symbols = "-ARNDCQEGHILKMFPSTWYVBZX12345678*"
    base_colors = ["grey", "red", "green", "yellow",
                   "blue", "magenta", "cyan", "white"]

    # Build the palette in a fresh list.  Appending to the list that is being
    # iterated would feed already-generated entries back into the loops and
    # produce duplicate styles, making different symbols look identical.
    palette = [(foreground, None) for foreground in base_colors]
    for foreground in base_colors:
        for background in base_colors:
            if len(palette) >= len(symbols):
                break
            if foreground != background:
                palette.append((foreground, "on_" + background))
    style_of = dict(zip(symbols, palette))

    for sequence in MSA:
        pieces = []
        for c in sequence:
            if c not in style_of:
                raise ValueError("%r is not a supported MSA symbol" % (c,))
            foreground, background = style_of[c]
            if background is None:
                pieces.append(colored(c, foreground))
            else:
                pieces.append(colored(c, foreground, background))
        print("".join(pieces))
Ejemplo n.º 11
0
    def _write_the_first_lines(self, record):
        """Write the ID and AC lines, each followed by an XX spacer line."""
        # An id of the form "<accession>.<digits>" carries a sequence version;
        # otherwise the whole id is the fallback accession.
        version = ""
        default_accession = record.id
        if "." in record.id:
            stem, suffix = record.id.rsplit(".", 1)
            if suffix.isdigit():
                version = "SV " + suffix
                default_accession = stem
        accession = self._get_annotation_str(record, "accession",
                                             default_accession,
                                             just_first=True)

        # Spaces are rejected out of practicality... might they be allowed?
        for forbidden, message in (
                (";", "Cannot have semi-colon in EMBL accession, %s"),
                (" ", "Cannot have spaces in EMBL accession, %s")):
            if forbidden in accession:
                raise ValueError(message % repr(str(accession)))

        # Get the molecule type from the base alphabet (underneath any
        # Gapped or StopCodon encoding).
        # TODO - record this explicitly in the parser?
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        molecule_kinds = (
            (Alphabet.DNAAlphabet, "DNA", "BP"),
            (Alphabet.RNAAlphabet, "RNA", "BP"),
            (Alphabet.ProteinAlphabet, "PROTEIN", "AA"),
        )
        for alphabet_class, mol_type, units in molecule_kinds:
            if isinstance(a, alphabet_class):
                break
        else:
            # Must be something like NucleotideAlphabet
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        # Get the taxonomy division
        division = self._get_data_division(record)

        # TODO - Full ID line
        handle = self.handle
        # ID   <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.
        # 1. Primary accession number
        # 2. Sequence version number
        # 3. Topology: 'circular' or 'linear'
        # 4. Molecule type
        # 5. Data class
        # 6. Taxonomic division
        # 7. Sequence length
        id_fields = (accession, version, mol_type,
                     division, len(record), units)
        self._write_single_line("ID", "%s; %s; ; %s; ; %s; %i %s." % id_fields)
        handle.write("XX\n")
        self._write_single_line("AC", accession + ";")
        handle.write("XX\n")
Ejemplo n.º 12
0
    def _write_the_first_lines(self, record):
        """Write the ID and AC lines, each followed by an XX spacer line."""
        # An id of the form "<accession>.<digits>" carries a sequence version;
        # otherwise the whole id is the fallback accession.
        version = ""
        fallback_accession = record.id
        if "." in record.id:
            head, tail = record.id.rsplit(".", 1)
            if tail.isdigit():
                version = "SV " + tail
                fallback_accession = head
        accession = self._get_annotation_str(record, "accession",
                                             fallback_accession,
                                             just_first=True)

        # Spaces are rejected out of practicality... might they be allowed?
        for forbidden, message in (
                (";", "Cannot have semi-colon in EMBL accession, %s"),
                (" ", "Cannot have spaces in EMBL accession, %s")):
            if forbidden in accession:
                raise ValueError(message % repr(str(accession)))

        # Get the molecule type from the base alphabet (underneath any
        # Gapped or StopCodon encoding).
        # TODO - record this explicitly in the parser?
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        molecule_kinds = (
            (Alphabet.DNAAlphabet, "DNA", "BP"),
            (Alphabet.RNAAlphabet, "RNA", "BP"),
            (Alphabet.ProteinAlphabet, "PROTEIN", "AA"),
        )
        for alphabet_class, mol_type, units in molecule_kinds:
            if isinstance(a, alphabet_class):
                break
        else:
            # Must be something like NucleotideAlphabet
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        # Get the taxonomy division
        division = self._get_data_division(record)

        # TODO - Full ID line
        handle = self.handle
        # ID   <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.
        # 1. Primary accession number
        # 2. Sequence version number
        # 3. Topology: 'circular' or 'linear'
        # 4. Molecule type
        # 5. Data class
        # 6. Taxonomic division
        # 7. Sequence length
        id_fields = (accession, version, mol_type,
                     division, len(record), units)
        self._write_single_line("ID", "%s; %s; ; %s; ; %s; %i %s." % id_fields)
        handle.write("XX\n")
        self._write_single_line("AC", accession + ";")
        handle.write("XX\n")
Ejemplo n.º 13
0
    def __init__(self,
                 data=None,
                 alphabet=None,
                 mat_type=NOTYPE,
                 mat_name='',
                 build_later=0):
        """Set up the matrix from optional data, alphabet, type and name.

        data: None (nothing is loaded) or a dict, whose items are copied
            into this object via ``update``.
        alphabet: an alphabet instance; when omitted a default
            ``Alphabet.Alphabet()`` is used.  If the alphabet has no letters
            they are derived from the matrix itself.
        mat_type: integer type code (ACCREP, OBSFREQ, SUBS, EXPFREQ, LO).
        mat_name: a string such as "BLOSUM62" or "PAM250".
        build_later: when true, the size assertion is skipped and a zeroed
            matrix is initialised so the caller can fill it in afterwards.
        """
        assert type(mat_type) is int
        assert type(mat_name) is str

        # The default for data is None rather than a dict because default
        # values are created once, when the function is defined.
        assert data is None or isinstance(data, dict)
        if data is not None:
            self.update(data)

        if alphabet is None:
            alphabet = Alphabet.Alphabet()
        assert Alphabet.generic_alphabet.contains(alphabet)
        self.alphabet = alphabet

        # If passed alphabet is empty, use the letters in the matrix itself
        if not self.alphabet.letters:
            self._alphabet_from_matrix()

        # Assert matrix size: half or full
        if not build_later:
            N = len(self.alphabet.letters)
            assert len(self) in (N**2, N * (N + 1) / 2)

        self.ab_list = sorted(self.alphabet.letters)
        self.mat_type = mat_type
        self.mat_name = mat_name

        if not build_later:
            # Convert full to half if matrix is not already a log-odds matrix
            if self.mat_type != LO:
                self._full_to_half()
            self._correct_matrix()
        else:
            self._init_zero()

        self.sum_letters = {}
        self.relative_entropy = 0
Ejemplo n.º 14
0
def is_valid_sequence(s):
    """Build an RNA SeqRecord from ``s`` and verify its letters.

    The input is upper-cased and every 'T' is replaced by 'U' before the
    sequence is checked against the unambiguous RNA alphabet.

    Returns the SeqRecord (with id "RNA") when the check passes; raises
    RuntimeError otherwise.
    """
    rna_text = s.upper().replace('T', 'U')
    rec = SeqRecord(Seq(rna_text, IUPAC.unambiguous_rna), id="RNA")
    if Alphabet._verify_alphabet(rec.seq):
        return rec
    raise RuntimeError(
        "Invalid nucleotide sequence, unknown characters in input string {}"
        .format(s))
Ejemplo n.º 15
0
 def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(),
              return_raw_comments=False):
     """Initialize the class.

     Keeps ``elem`` as ``self.entry`` and stores ``alphabet`` and
     ``return_raw_comments`` under their own names.
     """
     self.return_raw_comments = return_raw_comments
     self.alphabet = alphabet
     self.entry = elem
Ejemplo n.º 16
0
    def _guess_consensus_alphabet(self, ambiguous):
        """Pick an (ungapped) alphabet for an alignment consensus sequence (PRIVATE).

        Starts from the un-gapped alignment alphabet, checks that every
        record's un-gapped alphabet is an instance of the same class, and
        falls back to a more generic alphabet when the ``ambiguous``
        character is not among the chosen alphabet's letters.

        Raises ValueError if a record has an incompatible alphabet.
        """
        # Start with the (un-gapped version of) the alignment alphabet
        a = Alphabet._get_base_alphabet(self.alignment._alphabet)

        # Every record's (un-gapped) alphabet must be compatible with it
        expected_class = a.__class__
        for record in self.alignment:
            record_alphabet = Alphabet._get_base_alphabet(record.seq.alphabet)
            if not isinstance(record_alphabet, expected_class):
                raise ValueError(
                    "Alignment contains a sequence with an incompatible alphabet."
                )

        # Nothing to widen when the alphabet defines no letters, or already
        # contains the ambiguous character.
        letters = getattr(a, "letters", None)
        if letters is None or ambiguous in letters:
            return a

        # (current class, candidate replacement class, generic fallback)
        # NOTE(review): the DNA and RNA candidates are the *unambiguous*
        # alphabets again, while the protein row widens to the extended
        # alphabet - possibly the ambiguous DNA/RNA alphabets were intended;
        # confirm before changing.
        widenings = (
            (IUPAC.IUPACUnambiguousDNA, IUPAC.IUPACUnambiguousDNA,
             Alphabet.generic_dna),
            (IUPAC.IUPACUnambiguousRNA, IUPAC.IUPACUnambiguousRNA,
             Alphabet.generic_rna),
            (IUPAC.IUPACProtein, IUPAC.ExtendedIUPACProtein,
             Alphabet.generic_protein),
        )
        for current_class, candidate_class, generic in widenings:
            if isinstance(a, current_class):
                candidate = candidate_class()
                if ambiguous in candidate.letters:
                    return candidate
                return generic
        return Alphabet.single_letter_alphabet
Ejemplo n.º 17
0
def action(arguments):
    """
    Trim the alignment as specified

    The source file is parsed twice: once to locate the primers, and again
    (after rewinding) to apply the selected prune action between the
    computed start and end indexes.  The result is written to the output
    file.
    """
    # Determine file format for input and output
    source_format = arguments.source_format
    if not source_format:
        source_format = fileformat.from_handle(arguments.source_file)
    output_format = arguments.output_format
    if not output_format:
        output_format = fileformat.from_handle(arguments.output_file)

    def read_sequences():
        """Parse the source file from its current position."""
        gapped = Alphabet.Gapped(Alphabet.single_letter_alphabet)
        return SeqIO.parse(arguments.source_file, source_format,
                           alphabet=gapped)

    with arguments.source_file:
        # Locate primers
        primer_spans = locate_primers(read_sequences(),
                                      arguments.forward_primer,
                                      arguments.reverse_primer,
                                      arguments.reverse_complement,
                                      arguments.max_hamming_distance)
        (forward_start, forward_end), (reverse_start, reverse_end) = primer_spans

        # Generate slice indexes
        if arguments.include_primers:
            start, end = forward_start, reverse_end + 1
        else:
            start, end = forward_end + 1, reverse_start

        # Rewind the input file and read it again
        arguments.source_file.seek(0)
        sequences = read_sequences()

        # Apply the transformation
        prune_action = _ACTIONS[arguments.prune_action]
        transformed_sequences = prune_action(sequences, start, end)

        with arguments.output_file:
            SeqIO.write(transformed_sequences, arguments.output_file,
                        output_format)
Ejemplo n.º 18
0
    def _write_sequence(self, record):
        """Write the SQ header line followed by the formatted sequence lines.

        For an UnknownSeq only the contig (if annotated) or a bare SQ line is
        written.  Otherwise the lower-cased sequence is written in lines of
        six blocks of ten letters, each line ending with the right-aligned
        position of its last letter.
        """
        LETTERS_PER_BLOCK = 10
        BLOCKS_PER_LINE = 6
        LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE
        POSITION_PADDING = 10
        handle = self.handle  # save looking up this multiple times

        if isinstance(record.seq, UnknownSeq):
            # We have already recorded the length, and there is no need
            # to record a long sequence of NNNNNNN...NNN or whatever.
            if "contig" in record.annotations:
                self._write_contig(record)
            else:
                # TODO - Can the sequence just be left out as in GenBank files?
                handle.write("SQ   \n")
            return

        # Catches sequence being None
        data = self._get_seq_string(record).lower()
        seq_len = len(data)

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.DNAAlphabet):
            handle.write("SQ   \n")
        else:
            # TODO - What if we have RNA?
            base_counts = tuple(data.count(base.upper()) + data.count(base)
                                for base in "acgt")
            other = seq_len - sum(base_counts)
            handle.write(
                "SQ   Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n"
                % ((seq_len,) + base_counts + (other,)))

        # One pass covers both the full lines and the final partial line:
        # padding a full block to its own width changes nothing, and the
        # position is capped at the sequence length on the last line.
        block_width = LETTERS_PER_BLOCK + 1
        for line_start in range(0, seq_len, LETTERS_PER_LINE):
            line_end = line_start + LETTERS_PER_LINE
            handle.write("    ")  # Just four, not five
            for index in range(line_start, line_end, LETTERS_PER_BLOCK):
                block = " %s" % data[index:index + LETTERS_PER_BLOCK]
                handle.write(block.ljust(block_width))
            handle.write(str(min(line_end, seq_len)).rjust(POSITION_PADDING))
            handle.write("\n")
Ejemplo n.º 19
0
    def append(self, record):
        """Add one more SeqRecord object to the alignment as a new row.

        Unless the alignment is still empty, the record must be as long as
        the alignment.  Its alphabet must be type-compatible with the
        alignment's alphabet.

        >>> from Bio import AlignIO
        >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal")
        >>> print align
        SingleLetterAlphabet() alignment with 7 rows and 156 columns
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
        TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
        >>> len(align)
        7

        We'll now construct a dummy record to append as an example:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> dummy = SeqRecord(Seq("N"*156), id="dummy")

        Now append this to the alignment,

        >>> align.append(dummy)
        >>> print align
        SingleLetterAlphabet() alignment with 8 rows and 156 columns
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
        TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
        TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
        TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
        NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN dummy
        >>> len(align)
        8

        """
        if not isinstance(record, SeqRecord):
            raise TypeError("New sequence is not a SeqRecord object")

        # The length only has to match once there is at least one row.
        if self._records:
            if len(record) != self.get_alignment_length():
                # TODO - Use a more helpful error that names the expected
                # length, but update the unit tests accordingly.
                raise ValueError("Sequences must all be the same length")

        # Using not self.alphabet.contains(record.seq.alphabet) needs fixing
        # for AlphabetEncoders (e.g. gapped versus ungapped).
        alphabets = [self._alphabet, record.seq.alphabet]
        compatible = Alphabet._check_type_compatible(alphabets)
        if not compatible:
            raise ValueError("New sequence's alphabet is incompatible")

        self._records.append(record)
Ejemplo n.º 20
0
    def __init__(self, records, alphabet=None):
        """Initialize a new MultipleSeqAlignment object.

        Arguments:
        records - A list (or iterator) of SeqRecord objects, whose sequences
                  are all the same length.  This may be an empty list.
        alphabet - The alphabet for the whole alignment, typically a gapped
                  alphabet, which should be a super-set of the individual
                  record alphabets.  If omitted, a consensus alphabet is used.

        You would normally load a MSA from a file using Bio.AlignIO, but you
        can do this from a list of SeqRecord objects too:

        >>> from Bio.Alphabet import generic_dna
        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
        >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
        >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
        >>> align = MultipleSeqAlignment([a, b, c])
        >>> print align
        DNAAlphabet() alignment with 3 rows and 7 columns
        AAAACGT Alpha
        AAA-CGT Beta
        AAAAGGT Gamma

        NOTE - For backwards compatibility an alphabet may be passed as the
        only argument (in place of the records), which creates an empty
        alignment with that alphabet.  This mode is expected to be
        deprecated in future.
        """
        alphabet_types = (Alphabet.Alphabet, Alphabet.AlphabetEncoder)

        if isinstance(records, alphabet_types):
            if alphabet is not None:
                raise ValueError("Invalid records argument")
            # TODO - Deprecate this backwards compatible mode!
            alphabet, records = records, []

        if alphabet is None:
            # Default while we add sequences, will take a consensus later
            self._alphabet = Alphabet.single_letter_alphabet
        elif isinstance(alphabet, alphabet_types):
            self._alphabet = alphabet
        else:
            raise ValueError("Invalid alphabet argument")

        self._records = []
        if records:
            self.extend(records)
            if alphabet is None:
                # No alphabet was given, take a consensus alphabet
                record_alphabets = (rec.seq.alphabet
                                    for rec in self._records
                                    if rec.seq is not None)
                self._alphabet = Alphabet._consensus_alphabet(record_alphabets)
Ejemplo n.º 21
0
def seqcategory(oneseq):
    """Classify a sequence string as 'DNA', 'RNA', 'protein' or 'noseq'.

    The string is wrapped with the ambiguous DNA, ambiguous RNA and extended
    protein alphabets; the label of the first alphabet that verifies is
    returned, or 'noseq' when none does.
    """
    # Build every candidate up front, in the order they are tested.
    candidates = [
        ('DNA', Seq(oneseq, IUPACAmbiguousDNA())),
        ('RNA', Seq(oneseq, IUPACAmbiguousRNA())),
        ('protein', Seq(oneseq, ExtendedIUPACProtein())),
    ]
    for label, candidate in candidates:
        if Alphabet._verify_alphabet(candidate):
            return label
    return 'noseq'  # No alphabet matched: not a valid sequence.
Ejemplo n.º 22
0
def read_fasta(filename):
    """Read a FASTA alignment into a 2-D character array.

    Input: filename - name of the FASTA file holding a single alignment
    Output: ndarray with one row per record and one letter per cell

    The alignment is parsed with a gapped IUPAC protein alphabet.
    """
    gapped_protein = Alphabet.Gapped(Alphabet.IUPAC.protein)
    alignment = AlignIO.read(filename, 'fasta', alphabet=gapped_protein)
    rows = []
    for record in alignment:
        # Iterating a record yields its letters one at a time.
        rows.append(list(record))
    return np.array(rows, np.character)
Ejemplo n.º 23
0
    def _write_sequence(self, record):
        """Write the SQ header line and the formatted sequence lines.

        An ``UnknownSeq`` is not written out letter by letter: the contig
        information is written instead when the record has a "contig"
        annotation, otherwise only a bare SQ line.

        Other sequences are lower-cased and written in lines of six blocks
        of ten letters, each line ending with the right-justified position
        of its last letter.
        """
        LETTERS_PER_BLOCK = 10
        BLOCKS_PER_LINE = 6
        LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE
        POSITION_PADDING = 10
        out = self.handle  # local alias, looked up once

        if isinstance(record.seq, UnknownSeq):
            # The length is already recorded, so there is no need to write
            # a long run of NNNNNNN...NNN or whatever.
            if "contig" not in record.annotations:
                # TODO - Can the sequence just be left out as in GenBank files?
                out.write("SQ   \n")
            else:
                self._write_contig(record)
            return

        # Catches sequence being None
        data = self._get_seq_string(record).lower()
        seq_len = len(data)

        # Base alphabet underneath any Gapped or StopCodon encoding
        base = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(base, Alphabet.DNAAlphabet):
            out.write("SQ   \n")
        else:
            # TODO - What if we have RNA?
            # data is already lower case, so lower-case counts are complete.
            counts = [data.count(letter) for letter in "acgt"]
            other = seq_len - sum(counts)
            out.write("SQ   Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n"
                      % tuple([seq_len] + counts + [other]))

        # A block is one separating space plus up to LETTERS_PER_BLOCK
        # letters; padding only has an effect on the final, partial line.
        block_width = LETTERS_PER_BLOCK + 1
        for line_start in range(0, seq_len, LETTERS_PER_LINE):
            line_end = line_start + LETTERS_PER_LINE
            out.write("    ")  # Just four, not five
            for block_start in range(line_start, line_end, LETTERS_PER_BLOCK):
                chunk = data[block_start:block_start + LETTERS_PER_BLOCK]
                out.write((" %s" % chunk).ljust(block_width))
            # Position of the last letter on this line
            out.write(str(min(line_end, seq_len)).rjust(POSITION_PADDING))
            out.write("\n")
Ejemplo n.º 24
0
    def _guess_consensus_alphabet(self, ambiguous):
        """Pick an (ungapped) alphabet for an alignment consensus sequence.

        Starts from the base (un-gapped) alphabet of the alignment, checks
        every record's base alphabet is an instance of the same class, and
        widens the alphabet when the ``ambiguous`` letter is not one of its
        declared letters.

        Arguments:
         - ambiguous - the letter that will mark ambiguous consensus columns.

        Returns the alphabet to use for the consensus sequence.

        Raises ValueError if a record's alphabet is incompatible with the
        alignment's alphabet.
        """
        # Start with the (un-gapped version of) the alignment alphabet
        a = Alphabet._get_base_alphabet(self.alignment._alphabet)

        # Now check it is compatible with all the rest of the sequences
        for record in self.alignment:
            # Get the (un-gapped version of) the sequence's alphabet
            alt = Alphabet._get_base_alphabet(record.seq.alphabet)
            if not isinstance(alt, a.__class__):
                raise ValueError(
                    "Alignment contains a sequence with "
                    "an incompatible alphabet."
                )

        # Check the ambiguous character we are going to use in the consensus
        # is in the alphabet's list of valid letters (if defined).
        letters = getattr(a, "letters", None)
        if letters is None or ambiguous in letters:
            return a

        # We'll need to pick a more generic alphabet.  Each unambiguous
        # IUPAC alphabet is first widened to its ambiguous counterpart
        # (re-testing the unambiguous letters could never succeed here),
        # and only then to the generic alphabet of the same molecule type.
        if isinstance(a, IUPAC.IUPACUnambiguousDNA):
            if ambiguous in IUPAC.IUPACAmbiguousDNA().letters:
                return IUPAC.IUPACAmbiguousDNA()
            return Alphabet.generic_dna
        if isinstance(a, IUPAC.IUPACUnambiguousRNA):
            if ambiguous in IUPAC.IUPACAmbiguousRNA().letters:
                return IUPAC.IUPACAmbiguousRNA()
            return Alphabet.generic_rna
        if isinstance(a, IUPAC.IUPACProtein):
            if ambiguous in IUPAC.ExtendedIUPACProtein().letters:
                return IUPAC.ExtendedIUPACProtein()
            return Alphabet.generic_protein
        return Alphabet.single_letter_alphabet
Ejemplo n.º 25
0
 def extract(self, start_pos, end_pos, make_file=False):
     """Return a new record covering ``self.gb`` from start_pos to end_pos.

     The sub-sequence ``self.gb.seq[start_pos:end_pos]`` is copied into a
     fresh SeqRecord with a DNA alphabet.  Every feature of ``self.gb``
     whose start..end range shares at least one position with
     start_pos..end_pos is appended to the new record unchanged (feature
     coordinates are not shifted).

     When ``make_file`` equals True the record is also written in GenBank
     format to a file named "<record id>_<start_pos>_<end_pos>" in the
     current directory.

     Returns the new SeqRecord.
     """
     range_set = set(range(start_pos, end_pos))
     partial_gb = SeqRecord(
         Seq(str(self.gb.seq[start_pos:end_pos]), Alphabet.DNAAlphabet()))
     for afeat in self.gb.features:
         afeat_range = range(afeat.location.start, afeat.location.end)
         # Keep the feature if it overlaps the requested window at all.
         if not range_set.isdisjoint(afeat_range):
             partial_gb.features.append(afeat)
     # Deliberately an equality test: only True (or 1) enables writing.
     if make_file == True:
         out_name = partial_gb.id + "_" + str(start_pos) + "_" + str(end_pos)
         # The context manager closes (and flushes) the file even when
         # SeqIO.write raises.
         with open(out_name, "w") as record_handle:
             SeqIO.write(partial_gb, record_handle, "genbank")
     return partial_gb
Ejemplo n.º 26
0
 def __init__(self, in_dict, dict_type, alphabet=None):
     """Build the table from a dictionary of counts or of frequencies.

     - in_dict: the input dictionary.
     - dict_type: COUNT when in_dict holds counts, FREQ when it holds
       frequencies; anything else raises ValueError.
     - alphabet: optional alphabet; when missing (or falsy) a generic
       alphabet is created and its letters are taken from the input.
     """
     self.alphabet = alphabet
     if dict_type == COUNT:
         # Counts supplied: keep them and derive the frequencies.
         self.count = in_dict
         self._freq_from_count()
     else:
         if not dict_type == FREQ:
             raise ValueError("bad dict_type")
         # Frequencies supplied: no counts are available.
         self.count = {}
         self.update(in_dict)
     if alphabet:
         return
     # No usable alphabet was given, so derive the letters from the input.
     self.alphabet = Alphabet.Alphabet()
     self.alphabet.letters = self._alphabet_from_input()
Ejemplo n.º 27
0
    def __init__(self, data=None, alphabet=None, mat_name="", build_later=0):
        """Initialize.

        User may supply:

        - data: the matrix itself; when truthy its items are stored in
          this object via ``update``.
        - mat_name: its name, a string like "BLOSUM62" or "PAM250".
        - alphabet: an instance of Bio.Alphabet, or a subclass. If not
          supplied, a generic alphabet is created and its letters are
          taken from the matrix.
        - build_later: skip the matrix size assertion. User will build the
          matrix after creating the instance. Constructor builds a half
          matrix filled with zeroes.

        Raises ValueError if ``data`` cannot be stored.
        """
        assert isinstance(mat_name, str)
        # The defaults are None rather than mutable objects because Python
        # creates default values once, when the function is defined, not
        # each time it is called.
        if data:
            try:
                self.update(data)
            except ValueError:
                raise ValueError("Failed to store data in a dictionary")
        chosen = Alphabet.Alphabet() if alphabet is None else alphabet
        assert Alphabet.generic_alphabet.contains(chosen)
        self.alphabet = chosen

        # An alphabet without letters takes them from the matrix itself
        if not self.alphabet.letters:
            self._alphabet_from_matrix()
        # Unless building is deferred, the matrix must be full or half size
        if not build_later:
            n_letters = len(self.alphabet.letters)
            n_entries = len(self)
            assert (n_entries == n_letters**2
                    or n_entries == n_letters * (n_letters + 1) / 2)
        self.ab_list = sorted(self.alphabet.letters)
        self.mat_name = mat_name
        if build_later:
            self._init_zero()
        else:
            # Convert full to half
            self._full_to_half()
            self._correct_matrix()
        self.sum_letters = {}
        self.relative_entropy = 0
Ejemplo n.º 28
0
def conservation(msa_path):
    import numpy as np
    import scipy.stats as sc
    from Bio import AlignIO
    from Bio.Align import AlignInfo
    from Bio.Alphabet import IUPAC
    from Bio.SubsMat import FreqTable
    import Bio.Alphabet as Alphabet
    from Bio import motifs
    for filename in os.listdir(msa_path):
        if filename.endswith(".cluster"):
            alignment = AlignIO.read(msa_path + filename,
                                     "fasta",
                                     alphabet=Alphabet.ProteinAlphabet())
            columns_quantity = []
            columns_frequency = []
            #summary_align = AlignInfo.SummaryInfo(alignment)
            #pssm = summary_align.pos_specific_score_matrix()
            #print pssm
            for x in range(0, len(alignment[0].seq) - 1):
                column = alignment[:, x]
                quantity = letters
                for f in column:
                    print(f)
                    quantity[f] += 1
                double = 20 / len(alignment)
                print len(alignment)
                print(quantity)
                #frequency=list(map(lambda x: x/len(alignment), quantity))
                frequency = dict(
                    map(lambda (k, v): (k, v / len(alignment)),
                        quantity.iteritems()))
                print frequency
                columns_quantity.append(quantity)
                columns_frequency.append(frequency)
            print(columns_quantity)
            '''
            m = motifs.create(alignment,alphabet=Alphabet.ProteinAlphabet())
            print (m)
            
            alfa = summary_align.alignment._alphabet
            base_alpha = Alphabet._get_base_alphabet(alfa) 
            print(summary_align)
            print(alfa)
            print(base_alpha)
            data=summary_align.information_content(5,30)
            print(data)'''

    #n is the number of data points
    ''''n=10
Ejemplo n.º 29
0
    def create_db_from_input(self, input_dir, log_fh=sys.stderr):
        """Populate the database from a directory of per-species FASTA files.

        Every immediate sub-directory of ``input_dir`` is treated as one
        species, and every ``*.fa`` / ``*.fasta`` file inside it as one
        ortholog whose id is the file name up to the first dot.  Files
        containing a record that fails DNA alphabet verification are
        skipped.  Duplicate sequences within a file are stored once.

        Arguments:
         - input_dir - directory holding one sub-directory per species.
         - log_fh - file-like object receiving progress messages.

        All additions are committed in a single transaction at the end.
        """
        session = self.session

        print("\nLoading data from directory '%s' ..." % input_dir, file=log_fh)
        #species = sorted(next(os.walk(input_dir))[1])
        species = next(os.walk(input_dir))[1]
        print("\nFound %d species:\n\t%s\n" % (len(species), '\n\t'.join(species)), file=log_fh)

        # traverse species folders
        for sp_name in species:
            db_species = Species(name=sp_name)
            session.add(db_species)

            sp_dir = os.path.join(input_dir, sp_name)
            sp_files = glob(os.path.join(sp_dir, '*.fa')) + glob(os.path.join(sp_dir, '*.fasta'))

            # loop through FASTA files
            for fn in sp_files:
                # read sequences
                recs = list(SeqIO.parse(fn, 'fasta', alphabet=Alphabet.Gapped(Alphabet.IUPAC.ambiguous_dna)))
                # make sure sequences are DNA; skip the whole file otherwise
                if not all(Alphabet._verify_alphabet(r.seq.upper()) for r in recs):
                    continue

                #oid = re.findall("^\d+", os.path.split(fn)[1])[0]
                oid = os.path.split(fn)[1].split('.')[0]

                db_ortho = self.get_or_create(Ortholog, id=str(oid))
                db_file = File(path=fn)
                db_file.ortholog = db_ortho
                session.add(db_file)

                # make sure sequences are unique; reuse the records parsed
                # above instead of re-opening the file (the extra handle
                # was never closed)
                sequences = {DnaSeq(r.id, str(r.seq)) for r in recs}
                for seq in sequences:
                    db_seq = Sequence(fasta_id=seq.id, description='', residues=str(seq.dna.upper()))
                    db_seq.species = db_species
                    db_seq.ortholog = db_ortho
                    session.add(db_seq)
                    # flush so the database-assigned ids exist for the description
                    session.flush()
                    db_seq.description = "id=%d,id_species=%d" % (db_seq.id, db_species.id)

        # save data to database
        session.commit()
Ejemplo n.º 30
0
def complicateSeq(obj):
    """Rebuild a Bio.Seq object from its dictionary form.

    ``obj`` must contain the marker key '__Seq__', the alphabet class name
    under 'alphabet' and the sequence letters under 'seq'.

    The alphabet class is looked up by name in ``Alphabet`` first and, if
    it is not found there, in ``Alphabet.IUPAC``.

    Raises ValueError if the marker key is missing, and AttributeError if
    neither module defines the named alphabet.
    """
    if '__Seq__' not in obj:
        raise ValueError("object must be converable to Bio.Seq")

    # Figure out which alphabet to use.  The second lookup is only a
    # fallback, so a name found in the first module is no longer discarded.
    name = obj['alphabet']
    try:
        alphabet_class = getattr(Alphabet, name)
    except AttributeError:
        alphabet_class = getattr(Alphabet.IUPAC, name)

    seq = Seq(obj['seq'], alphabet=alphabet_class())
    return seq
Ejemplo n.º 31
0
def complicateSeq(obj):
    """Rebuild a Bio.Seq object from its dictionary form.

    ``obj`` must contain the marker key '__Seq__', the alphabet class name
    under 'alphabet' and the sequence letters under 'seq'.

    The alphabet class is looked up by name in ``Alphabet`` first and, if
    it is not found there, in ``Alphabet.IUPAC``.

    Raises ValueError if the marker key is missing, and AttributeError if
    neither module defines the named alphabet.
    """
    if '__Seq__' not in obj:
        raise ValueError("object must be converable to Bio.Seq")

    # Figure out which alphabet to use.  The second lookup is only a
    # fallback, so a name found in the first module is no longer discarded.
    name = obj['alphabet']
    try:
        alphabet_class = getattr(Alphabet, name)
    except AttributeError:
        alphabet_class = getattr(Alphabet.IUPAC, name)

    seq = Seq(obj['seq'], alphabet=alphabet_class())
    return seq
Ejemplo n.º 32
0
 def test_to_alignment(self):
     """Check to_alignment() before and after attaching tip sequences."""
     tree = self.phyloxml.phylogenies[0]
     # Before any sequences are attached the alignment is empty
     empty_aln = tree.to_alignment()
     self.assertTrue(isinstance(empty_aln, MultipleSeqAlignment))
     self.assertEqual(len(empty_aln), 0)
     # Add one gapped DNA sequence to each terminal
     gapped_dna = Alphabet.Gapped(Alphabet.generic_dna)
     tip_sequences = ('AA--TTA', 'AA--TTG', 'AACCTTC')
     for tip, seqstr in zip(tree.get_terminals(), tip_sequences):
         record = SeqRecord(Seq(seqstr, gapped_dna), id=str(tip))
         tip.sequences.append(PX.Sequence.from_seqrecord(record))
     # Now the alignment has three rows of seven columns
     filled_aln = tree.to_alignment()
     self.assertTrue(isinstance(filled_aln, MultipleSeqAlignment))
     self.assertEqual(len(filled_aln), 3)
     self.assertEqual(filled_aln.get_alignment_length(), 7)
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment)

    One row is built per key of the first position's ``pos_align_dict``:
    the ``aa`` values found under that key are concatenated over positions
    1..len(align_dict).  Each row is labelled with ``pdb2 + chain2`` of the
    matching ``sum_dict`` entry, and rows are added in sorted key order.
    """
    # Start every row as an empty string
    rows = dict.fromkeys(align_dict.abs(1).pos_align_dict, '')

    for position in range(1, len(align_dict) + 1):
        # loop on positions
        column = align_dict.abs(position).pos_align_dict
        for key in column:
            # loop within a position
            rows[key] += column[key].aa

    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for key in sorted(rows):
        entry = sum_dict[key]
        fssp_align.append(SeqRecord(Seq(rows[key], alpha),
                                    entry.pdb2 + entry.chain2))
    return fssp_align
Ejemplo n.º 34
0
def process_upload(sequences, format, request):
    """Classify the first sequence of an uploaded FASTA and record it.

    - sequences: a file-like object (format "file") or the raw FASTA
      content (format "text").
    - format: "file" or "text".
    - request: object whose ``session`` mapping receives the
      "uploaded_sequences" entry.

    Returns a list: [sequence id, first classification, second
    classification (None when it is "Unknown"), the rows from
    upload_hmmer, the first element of the upload_blastp result, and the
    "id" and "variant" of the first hit in that element].

    Raises InvalidFASTA for an unknown format, when nothing can be parsed,
    or when the sequence fails protein alphabet verification.
    """
    if not (format == "file" or format == "text"):
        raise InvalidFASTA(
            "Invalid format: {}. Must be either 'file' or 'text'.".format(
                format))

    if format == "text":
        # Wrap the pasted content in an in-memory handle rewound to 0
        buffer = io.BytesIO()
        buffer.write(sequences)
        buffer.seek(0)
        sequences = buffer

    parsed = SeqIO.parse(sequences, "fasta", IUPAC.ExtendedIUPACProtein())

    # Only the first record is processed
    try:
        sequence = next(parsed)
    except StopIteration:
        raise InvalidFASTA("No sequences parsed.")

    if not Alphabet._verify_alphabet(sequence.seq):
        raise InvalidFASTA("Sequence {} is not a protein.".format(sequence.id))

    classifications, ids, rows = upload_hmmer(sequence)
    best = classifications[0]
    variant = best[1]
    secondary = None if best[2] == "Unknown" else best[2]

    blast_hits = upload_blastp(sequence)[0]
    top_hit = blast_hits[0]

    result = [
        str(sequence.id),
        variant,
        secondary,
        rows,
        blast_hits,
        top_hit["id"],
        top_hit["variant"],
    ]

    request.session["uploaded_sequences"] = [{
        "id": "QUERY",  # sequence.id,
        "variant": variant,
        "sequence": str(sequence.seq),
        "taxonomy": top_hit["taxonomy"],
    }]

    return result
Ejemplo n.º 35
0
def count_gaps_and_characters(aln_file, file_format = "fasta"):
    """
    count how many gap and non-gap characters there are in an alignment
    :param aln_file: input alignment file
    :param file_format: input file format (default: fasta)
    :return: len(aln) (the number of records in the alignment),
             number of gap chars, number of non-gap chars
    """
    aln = AlignIO.read(check_filename(aln_file), file_format,
                       alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    # One (gap count, sequence length) pair per record
    per_record = [(record.seq.count("-"), len(record.seq)) for record in aln]
    total_gaps = sum(gaps for gaps, _ in per_record)
    total_not_gaps = sum(length - gaps for gaps, length in per_record)
    return len(aln), total_gaps, total_not_gaps
Ejemplo n.º 36
0
def get_major_and_minor_consensus(aln_file, in_format="fasta"):
    """
    calculates major and minor consensus and each position's frequency
    - major consensus - the most prominent base (including "-")
    - minor consensus - the most prominent base (not including "-")
    Both frequencies are the winning count divided by the number of
    non-gap characters in the column, rounded to 2 decimals.
    A column made only of gaps has no non-gap characters to divide by: it
    contributes "-" to both consensus strings and 0.0 to both frequency
    lists, so every returned sequence keeps one entry per column.
    :param aln_file: alignment file path
    :param in_format: input alignment format (default: fasta)
    :return: major_consensus, major_freqs, minor_consensus, minor_freqs
    """
    aln_file = check_filename(aln_file)
    aln = AlignIO.read(aln_file, in_format, alphabet=Alphabet.Gapped(IUPAC.unambiguous_dna))
    len_aln = len(aln[0])
    num_of_seq = len(aln)
    # Collected per column and joined once at the end
    major_chars = []
    major_freqs = []
    minor_chars = []
    minor_freqs = []
    for i in range(len_aln):
        counter = collections.Counter(aln[:, i])
        major_count = 0
        minor_count = 0
        major_char = ""
        minor_char = ""
        for j, count in counter.items():
            if count > major_count:
                major_count = count
                major_char = j
                if j != "-":
                    # A non-gap leader is also the best non-gap so far
                    minor_count = count
                    minor_char = j
            if count > minor_count and j != "-":
                # New best non-gap character that is not the overall
                # leader; anything other than A/C/G/T is reported as "N"
                minor_count = count
                minor_char = j if j in ["A", "C", "G", "T"] else "N"

        non_gap_count = num_of_seq - counter["-"]
        if non_gap_count:
            major_freq = round(major_count / non_gap_count, 2)
            minor_freq = round(minor_count / non_gap_count, 2)
        else:
            # All-gap column: avoid dividing by zero
            major_freq = 0.0
            minor_freq = 0.0
            minor_char = "-"

        major_chars.append(major_char)
        major_freqs.append(major_freq)
        minor_chars.append(minor_char)
        minor_freqs.append(minor_freq)

    return "".join(major_chars), major_freqs, "".join(minor_chars), minor_freqs
Ejemplo n.º 37
0
def format_changer(filename, out_format, outfile=None, in_format="fasta"):
    """
    convert an alignment file from one format to another
    :param filename: input sequence filename
    :param out_format: output format
    :param outfile: output file (default: None - the input path with its
                    extension replaced by out_format)
    :param in_format: input format (default: fasta)
    :return: out file path in out format
    """
    filename = check_filename(filename)
    if outfile is None:
        stem, _extension = path.splitext(filename)
        outfile = stem + "." + out_format
    else:
        outfile = check_filename(outfile, Truefile=False)
    gapped_dna = Alphabet.Gapped(IUPAC.unambiguous_dna)
    alignment = AlignIO.read(filename, in_format, alphabet=gapped_dna)
    AlignIO.write(alignment, outfile, out_format)
    print("saved %s in format %s" % (outfile, out_format))
    return outfile
Ejemplo n.º 38
0
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (Bio.Align.Generic)

    One row is built per key of the first position's ``pos_align_dict``:
    the ``aa`` values found under that key are concatenated over positions
    1..len(align_dict).  Each row is labelled with ``pdb2 + chain2`` of the
    matching ``sum_dict`` entry, and rows are added in sorted key order.
    """
    # Start every row as an empty string
    rows = dict.fromkeys(align_dict.abs(1).pos_align_dict, '')

    for position in range(1, len(align_dict) + 1):
        # loop on positions
        column = align_dict.abs(position).pos_align_dict
        for key in column:
            # loop within a position
            rows[key] += column[key].aa

    gapped_protein = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = Generic.Alignment(gapped_protein)
    for key in sorted(rows):
        entry = sum_dict[key]
        fssp_align.add_sequence(entry.pdb2 + entry.chain2, rows[key])
    return fssp_align
Ejemplo n.º 39
0
    def _write_the_first_lines(self, record):
        """Write the ID and AC lines.

        A record id of the form "name.digits" is split into a default
        accession ("name") and a version ("SV digits"); otherwise the whole
        id is the default accession and the version is empty.

        Raises ValueError if the accession contains a semi-colon or a
        space, or if the alphabet is not DNA or RNA, and TypeError if the
        base alphabet is not an Alphabet instance.
        """
        record_id = record.id
        version = ""
        default_accession = record_id
        if "." in record_id:
            stem, suffix = record_id.rsplit(".", 1)
            if suffix.isdigit():
                version = "SV " + suffix
                default_accession = stem
        accession = self._get_annotation_str(record, "accession",
                                             default_accession,
                                             just_first=True)

        # The space check is out of practicality... might it be allowed?
        forbidden = (
            (";", "Cannot have semi-colon in EMBL accession, %s"),
            (" ", "Cannot have spaces in EMBL accession, %s"),
        )
        for character, complaint in forbidden:
            if character in accession:
                raise ValueError(complaint % repr(accession))

        # Get the molecule type
        # TODO - record this explicitly in the parser?
        # Base alphabet underneath any Gapped or StopCodon encoding
        base = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(base, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        if not isinstance(base, Alphabet.NucleotideAlphabet):
            raise ValueError("Need a Nucleotide alphabet")
        if isinstance(base, Alphabet.DNAAlphabet):
            mol_type = "DNA"
        elif isinstance(base, Alphabet.RNAAlphabet):
            mol_type = "RNA"
        else:
            # Must be something like NucleotideAlphabet
            raise ValueError("Need a DNA or RNA alphabet")

        # TODO - Full ID line
        handle = self.handle
        id_fields = (accession, version, mol_type, len(record))
        self._write_single_line("ID", "%s; %s; ; %s; ; ; %i BP." % id_fields)
        handle.write("XX\n")
        self._write_single_line("AC", accession + ";")
        handle.write("XX\n")
Ejemplo n.º 40
0
    def _append(self, record, expected_length=None):
        """Validate and append a record (PRIVATE).

        Raises TypeError if ``record`` is not a SeqRecord, and ValueError
        if its length differs from ``expected_length`` (when given) or if
        its alphabet is incompatible with the alignment's alphabet.
        """
        if not isinstance(record, SeqRecord):
            raise TypeError("New sequence is not a SeqRecord object")

        # get_alignment_length() is currently expensive, so __init__ and
        # extend pass the expected length in rather than have it
        # recomputed here for every record.
        length_is_known = expected_length is not None
        if length_is_known and len(record) != expected_length:
            # TODO - Use the following more helpful error, but update unit tests
            # raise ValueError("New sequence is not of length %i" \
            #                 % self.get_alignment_length())
            raise ValueError("Sequences must all be the same length")

        # Using not self.alphabet.contains(record.seq.alphabet) needs fixing
        # for AlphabetEncoders (e.g. gapped versus ungapped).
        alphabets = [self._alphabet, record.seq.alphabet]
        if Alphabet._check_type_compatible(alphabets):
            self._records.append(record)
            return
        raise ValueError("New sequence's alphabet is incompatible")
Ejemplo n.º 41
0
    def _classify_alphabet_for_nexus(self, alphabet):
        """Returns 'protein', 'dna', 'rna' based on the alphabet (PRIVATE).

        Raises TypeError if the base alphabet is not an Alphabet instance,
        and ValueError if it is none of protein, DNA or RNA.
        """
        # Base alphabet underneath any Gapped or StopCodon encoding
        base = Alphabet._get_base_alphabet(alphabet)

        if not isinstance(base, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")

        # Checked in this order; the first matching class wins
        known_types = (
            (Alphabet.ProteinAlphabet, "protein"),
            (Alphabet.DNAAlphabet, "dna"),
            (Alphabet.RNAAlphabet, "rna"),
        )
        for alphabet_class, label in known_types:
            if isinstance(base, alphabet_class):
                return label

        # Must be something like NucleotideAlphabet or
        # just the generic Alphabet (default for fasta files)
        raise ValueError("Need a DNA, RNA or Protein alphabet")
Ejemplo n.º 42
0
 def _write_references(self, record):
     """Write one REFERENCE section per Reference annotation of the record.

     Entries of ``record.annotations["references"]`` that are not
     ``SeqFeature.Reference`` instances are skipped and not numbered.
     Each optional field is written only when it is truthy.
     """
     # (line header, Reference attribute) in output order.  Each value is
     # stored as a single string.  MEDLINE is obsolete (removed from the
     # GenBank flatfile format in April 2005) but still written.  Note the
     # THREE space indent of PUBMED against two for the others.
     optional_fields = (
         ("  AUTHORS", "authors"),
         ("  CONSRTM", "consrtm"),
         ("  TITLE", "title"),
         ("  JOURNAL", "journal"),
         ("  MEDLINE", "medline_id"),
         ("   PUBMED", "pubmed_id"),
         ("  REMARK", "comment"),
     )
     number = 0
     for ref in record.annotations["references"]:
         if not isinstance(ref, SeqFeature.Reference):
             continue
         number += 1
         data = str(number)
         # TODO - support more complex record reference locations?
         if ref.location and len(ref.location) == 1:
             base = Alphabet._get_base_alphabet(record.seq.alphabet)
             is_protein = isinstance(base, Alphabet.ProteinAlphabet)
             units = "residues" if is_protein else "bases"
             span = ref.location[0]
             data += "  (%s %i to %i)" % (units,
                                          span.nofuzzy_start + 1,
                                          span.nofuzzy_end)
         self._write_single_line("REFERENCE", data)
         for header, attribute in optional_fields:
             value = getattr(ref, attribute)
             if value:
                 self._write_multi_line(header, value)
Ejemplo n.º 43
0
def process_upload(sequences, format, request):
    """Classify the first sequence of an uploaded FASTA and record it.

    - sequences: a file-like object (format "file") or the raw FASTA
      text (format "text").
    - format: "file" or "text".
    - request: object whose ``session`` mapping receives the
      "uploaded_sequences" entry.

    Returns a list: [sequence id, first classification, second
    classification (None when it is "Unknown"), the rows from
    upload_hmmer, the first element of the upload_blastp result, and the
    "id" and "variant" of the first hit in that element].

    Raises InvalidFASTA for an unknown format, when nothing can be parsed,
    or when the sequence fails protein alphabet verification.
    """
    if format not in ["file", "text"]:
        raise InvalidFASTA("Invalid format: {}. Must be either 'file' or 'text'.".format(format))

    if format == "text":
        # Wrap the pasted text in an in-memory handle rewound to 0
        seq_file = StringIO.StringIO()
        seq_file.write(sequences)
        seq_file.seek(0)
        sequences = seq_file

    sequences = SeqIO.parse(sequences, "fasta", IUPAC.ExtendedIUPACProtein())

    # Only the first record is processed.  The builtin next() works on
    # iterators in both Python 2.6+ and Python 3, unlike the .next() method.
    try:
        sequence = next(sequences)
    except StopIteration:
        raise InvalidFASTA("No sequences parsed.")

    if not Alphabet._verify_alphabet(sequence.seq):
        raise InvalidFASTA("Sequence {} is not a protein.".format(sequence.id))

    result = [str(sequence.id)]

    classifications, ids, rows = upload_hmmer(sequence)
    result.append(classifications[0][1])
    secondary_classification = classifications[0][2]
    result.append(secondary_classification if secondary_classification != "Unknown" else None)
    result.append(rows)
    # From here on the BLAST hit list sits at a fixed offset from the end
    # of result, which is what the negative indices below rely on.
    result.append(upload_blastp(sequence)[0])
    result.append(result[-1][0]["id"])
    result.append(result[-2][0]["variant"])

    request.session["uploaded_sequences"] = [{
        "id":"QUERY", #sequence.id,
        "variant":classifications[0][1],
        "sequence":str(sequence.seq),
        "taxonomy":result[-3][0]["taxonomy"]
    }]

    return result
Ejemplo n.º 44
0
    def from_seqfile(cls, seqfile, fileformat):
        """
        Create a BioSeqs object retrieving all the information stored at the
        sequence file provided. If 'seqfile' contains a relative path, the
        current working directory will be used to get the absolute path.

        Arguments :
            seqfile  ( string )
                Input sequences file.
            fileformat  ( string )
                Input file format.

        Raises :
            IOError
                If the path or the file provided doesn't exist.

        * The file format must be supported by Bio.SeqIO.
        * If the file format provided doesn't correspond to the actual file
        format, an empty sequence dictionary will be created.
        """
        filepath = get_abspath(seqfile)
        records_by_id = {}
        for record in SeqIO.parse(filepath, fileformat):
            # Some formats (e.g. FASTA) give the Seq a default alphabet that
            # raises an error when the record is written to a GENBANK file,
            # so it is replaced by a specific one: DNA if the letters
            # verify as extended IUPAC DNA, protein otherwise.
            # NOTE(review): this isinstance test presumably also matches
            # more specific single-letter alphabets - confirm intended.
            needs_specific = isinstance(record.seq.alphabet,
                                        Alphabet.SingleLetterAlphabet)
            if needs_specific:
                record.seq.alphabet = Alphabet.IUPAC.ExtendedIUPACDNA()
                is_dna = Alphabet._verify_alphabet(record.seq)
                if not is_dna:
                    record.seq.alphabet = Alphabet.IUPAC.ExtendedIUPACProtein()
            records_by_id[record.id] = record
        # Start a new report list with a single "local" entry for this file
        stamp = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
        return cls(records_by_id, [(stamp, "local", filepath, fileformat)])
Ejemplo n.º 45
0
    def _classify_alphabet_for_nexus(self, alphabet):
        """Returns 'protein', 'dna', 'rna' based on the alphabet (PRIVATE).

        The decision is made on the text of ``str(type(...))`` of the base
        alphabet rather than with isinstance (edited by Ambuj Kumar in
        order to make it align with ConCat).

        Raises TypeError if the type name contains none of the accepted
        fragments, and ValueError if it is none of protein, DNA or RNA.
        """
        # Base alphabet underneath any Gapped or StopCodon encoding
        base = Alphabet._get_base_alphabet(alphabet)
        type_name = str(type(base))

        # NOTE(review): a class whose printed name contains none of these
        # fragments is rejected even if it subclasses one of them -
        # confirm this is intended.
        accepted_fragments = (
            'Alphabet.Alphabet',
            'Alphabet.ProteinAlphabet',
            'Alphabet.DNAAlphabet',
            'Alphabet.RNAAlphabet',
            'Alphabet.Gapped',
        )
        if not any(fragment in type_name for fragment in accepted_fragments):
            raise TypeError("Invalid alphabet")

        # Checked in this order; the first fragment found wins
        for fragment, label in (('Protein', "protein"),
                                ('DNA', "dna"),
                                ('RNA', "rna")):
            if fragment in type_name:
                return label

        # Must be something like NucleotideAlphabet or
        # just the generic Alphabet (default for fasta files)
        raise ValueError("Need a DNA, RNA or Protein alphabet")
Ejemplo n.º 46
0
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.

    Arguments:
     - handle - file handle opened in 'rb' mode.  It is rewound to the
       start before parsing.
     - alphabet - optional alphabet for the sequence.  Protein and RNA
       alphabets are rejected.  If omitted, ambiguous_dna is used when
       the base calls contain any of the letters KYWMRS, otherwise
       unambiguous_dna.
     - trim - if true, the record is passed through _abi_trim before it
       is yielded.  Files holding neither PBAS1 nor PBAS2 (treated here
       as FSA files) are never trimmed.

    Yields at most one SeqRecord; an empty file yields nothing.  All tags
    read from the file are exposed under annotations['abif_raw'].

    Raises ValueError for a protein/RNA alphabet, for a handle whose mode
    is not 'rb', or when a sequence file lacks the PBAS2 or PCON2 tag.
    Raises IOError if the file does not start with the ABIF marker.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # handle empty file gracefully
        return
    if marker != b"ABIF":
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': '', }

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT,
                           handle.read(struct.calcsize(_HEADFMT)))

    # Defaults for values that are only present when the matching tag
    # occurs in the file; without them a missing tag surfaced later as an
    # UnboundLocalError.
    seq = None
    qual = None
    sample_id = '<unknown id>'

    raw = dict()
    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        key = tag_name + str(tag_number)

        raw[key] = tag_data

        # PBAS2 is base-called sequence, only available in 3530
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # raw data (for advanced end users benefit)
    annot['abif_raw'] = raw

    # fsa check
    is_fsa_file = all(tn not in raw for tn in ('PBAS1', 'PBAS2'))

    if is_fsa_file:
        try:
            file_name = basename(handle.name).replace('.fsa', '')
        except AttributeError:
            file_name = ""
        sample_id = raw.get('LIMS1', '<unknown id>')
        description = raw.get('CTID1', '<unknown description>')
        record = SeqRecord(Seq(''),
                           id=sample_id,
                           name=file_name,
                           description=description,
                           annotations=annot)

    else:
        # A sequence file must provide both the base calls and their
        # qualities; report which tag is absent instead of crashing.
        missing = [tag for tag, value in (('PBAS2', seq), ('PCON2', qual))
                   if value is None]
        if missing:
            raise ValueError("ABI file lacks required tag(s): %s"
                             % ", ".join(missing))
        # use the file name as SeqRecord.name if available
        try:
            file_name = basename(handle.name).replace('.ab1', '')
        except AttributeError:
            file_name = ""
        record = SeqRecord(Seq(seq, alphabet),
                           id=sample_id, name=file_name,
                           description='',
                           annotations=annot,
                           letter_annotations={'phred_quality': qual})

    if not trim or is_fsa_file:
        yield record
    else:
        yield _abi_trim(record)
Ejemplo n.º 47
0
    def __init__(self, records, alphabet=None,
                 annotations=None, column_annotations=None):
        """Initialize a new MultipleSeqAlignment object.

        Arguments:
         - records - A list (or iterator) of SeqRecord objects, whose
                     sequences are all the same length.  This may be an
                     empty list.
         - alphabet - The alphabet for the whole alignment, typically a gapped
                      alphabet, which should be a super-set of the individual
                      record alphabets.  If omitted, a consensus alphabet is
                      used.
         - annotations - Information about the whole alignment (dictionary).
         - column_annotations - Per column annotation (restricted dictionary).
                      This holds Python sequences (lists, strings, tuples)
                      whose length matches the number of columns. A typical
                      use would be a secondary structure consensus string.

        Raises ValueError if alphabet is given but is neither an Alphabet
        nor an AlphabetEncoder, and TypeError if annotations is given but
        is not a dict.

        You would normally load a MSA from a file using Bio.AlignIO, but you
        can do this from a list of SeqRecord objects too:

        >>> from Bio.Alphabet import generic_dna
        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
        >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
        >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
        >>> align = MultipleSeqAlignment([a, b, c],
        ...                              annotations={"tool": "demo"},
        ...                              column_annotations={"stats": "CCCXCCC"})
        >>> print(align)
        DNAAlphabet() alignment with 3 rows and 7 columns
        AAAACGT Alpha
        AAA-CGT Beta
        AAAAGGT Gamma
        >>> align.annotations
        {'tool': 'demo'}
        >>> align.column_annotations
        {'stats': 'CCCXCCC'}
        """
        alphabet_omitted = alphabet is None
        if alphabet_omitted:
            # Placeholder while rows are added; replaced by a consensus below
            self._alphabet = Alphabet.single_letter_alphabet
        else:
            valid_types = (Alphabet.Alphabet, Alphabet.AlphabetEncoder)
            if not isinstance(alphabet, valid_types):
                raise ValueError("Invalid alphabet argument")
            self._alphabet = alphabet

        self._records = []
        if records:
            self.extend(records)
            if alphabet_omitted:
                # Derive the alignment alphabet from the rows just added
                self._alphabet = Alphabet._consensus_alphabet(
                    row.seq.alphabet
                    for row in self._records
                    if row.seq is not None)

        # Alignment-wide annotations
        if annotations is None:
            annotations = {}
        elif not isinstance(annotations, dict):
            raise TypeError("annotations argument should be a dict")
        self.annotations = annotations

        # Per-column annotations; assignment goes through the property
        # setter, which validates the value
        self.column_annotations = (
            {} if column_annotations is None else column_annotations)
Ejemplo n.º 48
0
    def _write_the_first_line(self, record):
        """Write the LOCUS line.

        The locus name is the record's name, falling back to its id and
        then to its first "accession" annotation.  A name longer than 16
        characters is accepted (with a BiopythonWarning) as long as the
        name, one space and the sequence length together fit in the 28
        characters shared by the name and length fields.

        Raises ValueError if the locus is too long or contains whitespace,
        if the sequence is too long for the length field, or if the
        alphabet is not a DNA, RNA or protein alphabet.  Raises TypeError
        if the base alphabet is not an Alphabet instance.
        """
        locus = record.name
        if not locus or locus == "<unknown name>":
            locus = record.id
        if not locus or locus == "<unknown id>":
            locus = self._get_annotation_str(
                record, "accession", just_first=True)

        if len(locus) > 16:
            if len(locus) + 1 + len(str(len(record))) > 28:
                # Locus name and record length too long to squeeze in.
                raise ValueError("Locus identifier %r is too long" % locus)
            warnings.warn("Stealing space from length field to allow long name in LOCUS line", BiopythonWarning)

        if len(locus.split()) > 1:
            # locus could be unicode, and u'with space' versus 'with space'
            # causes trouble with doctest or print-and-compare tests, so
            # drop a leading u from the repr before reporting it.
            shown = repr(locus)
            if shown.startswith("u'") and shown.endswith("'"):
                shown = shown[1:]
            raise ValueError("Invalid whitespace in %s for LOCUS line" % shown)

        seq_length = len(record)
        if seq_length > 99999999999:
            # Currently GenBank only officially support up to 350000, but
            # the length field can take eleven digits
            raise ValueError("Sequence too long!")
        length_text = str(seq_length)

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        base = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(base, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        is_protein = isinstance(base, Alphabet.ProteinAlphabet)

        # Size units: amino acids for proteins, base pairs for nucleotides
        if is_protein:
            units = "aa"
        elif isinstance(base, Alphabet.NucleotideAlphabet):
            units = "bp"
        else:
            # Just the generic Alphabet (default for fasta files)
            raise ValueError("Need a Nucleotide or Protein alphabet")

        # Molecule type (left blank for proteins)
        # TODO - record this explicitly in the parser?
        if is_protein:
            mol_type = ""
        elif isinstance(base, Alphabet.DNAAlphabet):
            mol_type = "DNA"
        elif isinstance(base, Alphabet.RNAAlphabet):
            mol_type = "RNA"
        else:
            # Must be something like the generic NucleotideAlphabet
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        topology = self._get_topology(record)

        division = self._get_data_division(record)

        # Right-justify the length in 28 columns, then overlay the locus on
        # the left so a long name eats into the length field's padding.
        padded_length = length_text.rjust(28)
        name_length = locus + padded_length[len(locus):]
        assert len(name_length) == 28, name_length
        assert " " in name_length, name_length

        assert len(units) == 2
        assert len(division) == 3
        fields = (name_length,
                  units,
                  mol_type.ljust(7),
                  topology,
                  division,
                  self._get_date(record))
        line = "LOCUS       %s %s    %s %s %s %s\n" % fields
        assert len(line) == 79 + 1, repr(line)  # plus one for new line

        # The identifier may exceed 16 characters by stealing spaces from
        # the length field, so only check the two tokens, not their columns.
        assert line[12:40].split() == [locus, length_text], line

        # Tests copied from Bio.GenBank.Scanner
        assert line[40:44] in [' bp ', ' aa '], (
            'LOCUS line does not contain size units at expected position:\n'
            + line)
        assert line[44:47] in ['   ', 'ss-', 'ds-', 'ms-'], (
            'LOCUS line does not have valid strand type (Single stranded, ...):\n'
            + line)
        mol_field = line[47:54].strip()
        assert mol_field == "" or 'DNA' in mol_field or 'RNA' in mol_field, (
            'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n'
            + line)
        assert line[54:55] == ' ', (
            'LOCUS line does not contain space at position 55:\n' + line)
        assert line[55:63].strip() in ['', 'linear', 'circular'], (
            'LOCUS line does not contain valid entry (linear, circular, ...):\n'
            + line)
        assert line[63:64] == ' ', (
            'LOCUS line does not contain space at position 64:\n' + line)
        assert line[67:68] == ' ', (
            'LOCUS line does not contain space at position 68:\n' + line)
        assert line[70:71] == '-', (
            'LOCUS line does not contain - at position 71 in date:\n' + line)
        assert line[74:75] == '-', (
            'LOCUS line does not contain - at position 75 in date:\n' + line)

        self.handle.write(line)
Ejemplo n.º 49
0
    # Check Bio.SeqIO.read(...)
    if t_count == 1:
        record = SeqIO.read(t_filename, format=t_format)
        assert isinstance(record, SeqRecord)
    else:
        try:
            record = SeqIO.read(t_filename, t_format)
            assert False, "Bio.SeqIO.read(...) should have failed"
        except ValueError:
            # Expected to fail
            pass

    # Check alphabets
    for record in records:
        base_alpha = Alphabet._get_base_alphabet(record.seq.alphabet)
        if isinstance(base_alpha, Alphabet.SingleLetterAlphabet):
            if t_format in no_alpha_formats:
                # Too harsh?
                assert base_alpha == Alphabet.single_letter_alphabet
        else:
            base_alpha = None
    if base_alpha is None:
        good = []
        bad = []
        given_alpha = None
    elif isinstance(base_alpha, Alphabet.ProteinAlphabet):
        good = protein_alphas
        bad = dna_alphas + rna_alphas + nucleotide_alphas
    elif isinstance(base_alpha, Alphabet.RNAAlphabet):
        good = nucleotide_alphas + rna_alphas
Ejemplo n.º 50
0
def concatenate(alignments, padding_length=0, partitions=None):

    '''
    Concatenate alignments based on the Seq ids; row order does not
    matter. If one alignment contains a Seq id that another one does
    not, gaps will be introduced in place of the missing Seq.

    Args:
        alignments: (tuple, list) Alignments to be concatenated. The
            sequence passed in is never modified.

        padding_length: Insert this many 'N' characters between
            concatenated alignments.

        partitions: (list) Optional list of (start, end) column tuples
            describing alignments that were concatenated earlier. A
            non-empty list is extended in place with the new partition.

    Returns:
        A tuple (alignment, partitions) holding the sorted concatenated
        MultipleSeqAlignment and the list of (start, end) column ranges.
        When `alignments` holds exactly one item, that item is returned
        unchanged (not wrapped in a tuple).

    Raises:
        ValueError: If `alignments` is not a list or tuple, or if the
            items to join are not MultipleSeqAlignment instances (this
            includes an empty `alignments`).
    '''

    # Validate before importing so bad input fails fast.
    if not isinstance(alignments, (list, tuple)):
        raise ValueError('Argument must be a list or a tuple.')
    if len(alignments) == 1:
        return alignments[0]

    from Bio import Alphabet
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Align import MultipleSeqAlignment

    # Work on a private copy: pop() below must not eat the caller's list.
    alignments = list(alignments)
    aln1 = None
    aln2 = None
    if len(alignments) > 2:
        # Fold from the right: join everything but the last alignment
        # first, then append the last one to that result.
        aln2 = alignments.pop()
        aln1, partitions = concatenate(alignments=alignments,
                                       padding_length=padding_length,
                                       partitions=partitions)
    elif len(alignments) == 2:
        aln1, aln2 = alignments
    if (not isinstance(aln1, MultipleSeqAlignment) or
            not isinstance(aln2, MultipleSeqAlignment)):
        raise ValueError(
            'Argument must inherit from Bio.Align.MultipleSeqAlignment.')

    alphabet = Alphabet._consensus_alphabet([aln1._alphabet, aln2._alphabet])
    aln1_dict = {rec.id: rec for rec in aln1}
    aln2_dict = {rec.id: rec for rec in aln2}
    aln1_length = aln1.get_alignment_length()
    aln2_length = aln2.get_alignment_length()
    # All-gap rows stand in for ids present in only one alignment.
    aln1_gaps = SeqRecord(Seq('-' * aln1_length, alphabet))
    aln2_gaps = SeqRecord(Seq('-' * aln2_length, alphabet))
    padding = SeqRecord(Seq('N' * padding_length, alphabet))

    if not partitions:
        partitions = [(1, aln1_length)]
    partitions.append((1 + aln1_length, padding_length + aln1_length + aln2_length))

    def _join(left, right, seq_id):
        # Join two rows around the padding and reset the descriptive fields.
        merged = left + padding + right
        merged.id = seq_id
        merged.name = ''
        merged.description = ''
        return merged

    result_seq_list = []
    for seq_id, left in aln1_dict.items():
        # Taking the partner out of aln2_dict leaves only unmatched rows.
        right = aln2_dict.pop(seq_id, aln2_gaps)
        result_seq_list.append(_join(left, right, seq_id))
    for right in aln2_dict.values():
        result_seq_list.append(_join(aln1_gaps, right, right.id))

    result_alignment = MultipleSeqAlignment(result_seq_list, alphabet)
    result_alignment.sort()
    return (result_alignment, partitions)
Ejemplo n.º 51
0
    def _write_the_first_lines(self, record):
        """Write the ID and AC lines.

        The accession comes from the record's "accession" annotation,
        defaulting to the record id (without a trailing ".<digits>"
        version, which is reported separately as "SV <digits>").

        Raises ValueError if the accession contains a semi-colon or a
        space, if the alphabet is not a DNA or RNA alphabet, or if the
        "topology" annotation is neither 'linear' nor 'circular'.  Raises
        TypeError if the base alphabet is not an Alphabet instance.
        """
        id_parts = record.id.rsplit(".", 1)
        if len(id_parts) == 2 and id_parts[1].isdigit():
            version = "SV " + id_parts[1]
            fallback_accession = id_parts[0]
        else:
            version = "XXX"
            fallback_accession = record.id
        accession = self._get_annotation_str(record, "accession",
                                             fallback_accession,
                                             just_first=True)

        if ";" in accession:
            raise ValueError("Cannot have semi-colon in EMBL accession, %s"
                             % repr(accession))
        if " " in accession:
            # This is out of practicality... might it be allowed?
            raise ValueError("Cannot have spaces in EMBL accession, %s"
                             % repr(accession))

        # Get the molecule type
        # TODO - record this explicitly in the parser?
        # Work on the base alphabet (underneath any Gapped or StopCodon
        # encoding)
        base = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(base, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        if not isinstance(base, Alphabet.NucleotideAlphabet):
            raise ValueError("Need a Nucleotide alphabet")
        if isinstance(base, Alphabet.DNAAlphabet):
            mol_type = "DNA"
        elif isinstance(base, Alphabet.RNAAlphabet):
            mol_type = "RNA"
        else:
            # Must be something like NucleotideAlphabet
            raise ValueError("Need a DNA or RNA alphabet")

        # Get the topology -- circular or linear (linear when not annotated)
        if 'topology' not in record.annotations:
            topology = 'linear'
        else:
            topology = record.annotations['topology']
            if topology not in ('linear', 'circular'):
                raise ValueError("Cannot have '%s' for topology in EMBL ID line, must be 'circular' or 'linear'" % topology)

        # Get the taxonomy division
        division = self._get_data_division(record)

        # Get the data class
        data_class = self._get_data_class(record)

        # Full ID line
        # ID   <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP.
        # 1. Primary accession number
        # 2. Sequence version number
        # 3. Topology: 'circular' or 'linear'
        # 4. Molecule type
        # 5. Data class
        # 6. Taxonomic division
        # 7. Sequence length
        # Non-mandatory tokens can be given as the placeholder "XXX", e.g.
        # ID   XXX; XXX; linear; XXX; XXX; XXX; 500 BP.
        handle = self.handle
        id_fields = (accession, version, topology, mol_type,
                     data_class, division, len(record))
        self._write_single_line("ID", "%s; %s; %s; %s; %s; %s; %i BP."
                                % id_fields)
        handle.write("XX\n")
        self._write_single_line("AC", accession + ";")
        handle.write("XX\n")
Ejemplo n.º 52
0
            assert (isinstance(a,str) or isinstance(b,str)), \
                   "Nucleotide+Protein addition should fail!"
        except TypeError :
            pass

###########################################################################
print
print "Testing Seq string methods"
print "=========================="
for a in dna + rna + nuc + protein :
    if not isinstance(a, Seq.Seq) : continue
    assert a.strip().tostring() == a.tostring().strip()
    assert a.lstrip().tostring() == a.tostring().lstrip()
    assert a.rstrip().tostring() == a.tostring().rstrip()
    test_chars = ["-", Seq.Seq("-"), Seq.Seq("*"), "-X@"]
    alpha = Alphabet._get_base_alphabet(a.alphabet)
    if isinstance(alpha, Alphabet.DNAAlphabet) :
        test_chars.append(Seq.Seq("A", IUPAC.ambiguous_dna))
    if isinstance(alpha, Alphabet.RNAAlphabet) :
        test_chars.append(Seq.Seq("A", IUPAC.ambiguous_rna))
    if isinstance(alpha, Alphabet.NucleotideAlphabet) :
        test_chars.append(Seq.Seq("A", Alphabet.generic_nucleotide))
    if isinstance(alpha, Alphabet.ProteinAlphabet) :
        test_chars.append(Seq.Seq("K", Alphabet.generic_protein))
        test_chars.append(Seq.Seq("K-", Alphabet.Gapped(Alphabet.generic_protein,"-")))
        test_chars.append(Seq.Seq("K@", Alphabet.Gapped(IUPAC.protein,"@")))
        #Setup a clashing alphabet sequence
        b = Seq.Seq("-", Alphabet.generic_nucleotide)
    else :
        b = Seq.Seq("-", Alphabet.generic_protein)
    try :
Ejemplo n.º 53
0
    def _write_the_first_line(self, record):
        """Write the LOCUS line.

        The locus name is the record's name, falling back to its id and
        then to its first "accession" annotation; it may be at most 16
        characters long.

        Raises ValueError if the locus or the sequence is too long, or if
        the alphabet is not a DNA, RNA or protein alphabet.  Raises
        TypeError if the base alphabet is not an Alphabet instance.
        """
        locus = record.name
        if not locus or locus == "<unknown name>":
            locus = record.id
        if not locus or locus == "<unknown id>":
            locus = self._get_annotation_str(
                record, "accession", just_first=True)
        if len(locus) > 16:
            raise ValueError("Locus identifier %r is too long" % str(locus))

        if len(record) > 99999999999:
            # Currently GenBank only officially support up to 350000, but
            # the length field can take eleven digits
            raise ValueError("Sequence too long!")

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        base = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(base, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        is_protein = isinstance(base, Alphabet.ProteinAlphabet)

        # Size units: amino acids for proteins, base pairs for nucleotides
        if is_protein:
            units = "aa"
        elif isinstance(base, Alphabet.NucleotideAlphabet):
            units = "bp"
        else:
            # Just the generic Alphabet (default for fasta files)
            raise ValueError("Need a Nucleotide or Protein alphabet")

        # Molecule type (left blank for proteins)
        # TODO - record this explicitly in the parser?
        if is_protein:
            mol_type = ""
        elif isinstance(base, Alphabet.DNAAlphabet):
            mol_type = "DNA"
        elif isinstance(base, Alphabet.RNAAlphabet):
            mol_type = "RNA"
        else:
            # Must be something like the generic NucleotideAlphabet
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        division = self._get_data_division(record)

        assert len(units) == 2
        assert len(division) == 3
        # TODO - date
        # TODO - mol_type
        length_text = str(len(record))
        fields = (locus.ljust(16),
                  length_text.rjust(11),
                  units,
                  mol_type.ljust(6),
                  division,
                  self._get_date(record))
        line = "LOCUS       %s %s %s    %s           %s %s\n" % fields
        assert len(line) == 79 + 1, repr(line)  # plus one for new line

        assert line[12:28].rstrip() == locus, (
            'LOCUS line does not contain the locus at the expected position:\n'
            + line)
        assert line[28:29] == " "
        assert line[29:40].lstrip() == length_text, (
            'LOCUS line does not contain the length at the expected position:\n'
            + line)

        # Tests copied from Bio.GenBank.Scanner
        assert line[40:44] in [' bp ', ' aa '], (
            'LOCUS line does not contain size units at expected position:\n'
            + line)
        assert line[44:47] in ['   ', 'ss-', 'ds-', 'ms-'], (
            'LOCUS line does not have valid strand type (Single stranded, ...):\n'
            + line)
        mol_field = line[47:54].strip()
        assert mol_field == "" or 'DNA' in mol_field or 'RNA' in mol_field, (
            'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n'
            + line)
        assert line[54:55] == ' ', (
            'LOCUS line does not contain space at position 55:\n' + line)
        assert line[55:63].strip() in ['', 'linear', 'circular'], (
            'LOCUS line does not contain valid entry (linear, circular, ...):\n'
            + line)
        assert line[63:64] == ' ', (
            'LOCUS line does not contain space at position 64:\n' + line)
        assert line[67:68] == ' ', (
            'LOCUS line does not contain space at position 68:\n' + line)
        assert line[70:71] == '-', (
            'LOCUS line does not contain - at position 71 in date:\n' + line)
        assert line[74:75] == '-', (
            'LOCUS line does not contain - at position 75 in date:\n' + line)

        self.handle.write(line)
Ejemplo n.º 54
0
    def _write_the_first_line(self, record):
        """Write the LOCUS line.

        The locus name is the record's name, falling back to its id and
        then to its first "accession" annotation; it may be at most 16
        characters long.  The division comes from the record's
        "data_file_division" annotation and becomes "UNK" when missing or
        not one of the recognised three-letter codes.  The date is not
        taken from the record yet and is always written as 01-JAN-1980.

        Raises ValueError if the locus or the sequence is too long, or if
        the alphabet is not a DNA, RNA or protein alphabet.  Raises
        TypeError if the base alphabet is not an Alphabet instance.
        """

        locus = record.name
        if not locus or locus == "<unknown name>":
            locus = record.id
        if not locus or locus == "<unknown id>":
            locus = self._get_annotation_str(record, "accession", just_first=True)
        if len(locus) > 16:
            raise ValueError("Locus identifier %s is too long" % repr(locus))

        if len(record) > 99999999999:
            # Currently GenBank only officially support up to 350000, but
            # the length field can take eleven digits
            raise ValueError("Sequence too long!")

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        a = Alphabet._get_base_alphabet(record.seq.alphabet)
        if not isinstance(a, Alphabet.Alphabet):
            raise TypeError("Invalid alphabet")
        elif isinstance(a, Alphabet.ProteinAlphabet):
            # Protein lengths are counted in amino acids ...
            units = "aa"
        elif isinstance(a, Alphabet.NucleotideAlphabet):
            # ... and nucleotide lengths in base pairs.
            units = "bp"
        else:
            # Just the generic Alphabet (default for fasta files)
            raise ValueError("Need a Nucleotide or Protein alphabet")

        # Get the molecule type (left blank for proteins)
        # TODO - record this explicitly in the parser?
        if isinstance(a, Alphabet.ProteinAlphabet):
            mol_type = ""
        elif isinstance(a, Alphabet.DNAAlphabet):
            mol_type = "DNA"
        elif isinstance(a, Alphabet.RNAAlphabet):
            mol_type = "RNA"
        else:
            # Must be something like the generic NucleotideAlphabet
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        known_divisions = (
            "PRI", "ROD", "MAM", "VRT", "INV", "PLN",
            "BCT", "VRL", "PHG", "SYN", "UNA", "EST",
            "PAT", "STS", "GSS", "HTG", "HTC", "ENV",
        )
        try:
            division = record.annotations["data_file_division"]
        except KeyError:
            division = "UNK"
        if division not in known_divisions:
            division = "UNK"

        assert len(units) == 2
        assert len(division) == 3
        # TODO - date
        # TODO - mol_type
        line = "LOCUS       %s %s %s    %s           %s 01-JAN-1980\n" % (
            locus.ljust(16),
            str(len(record)).rjust(11),
            units,
            mol_type.ljust(6),
            division,
        )
        assert len(line) == 79 + 1, repr(line)  # plus one for new line

        assert line[12:28].rstrip() == locus, "LOCUS line does not contain the locus at the expected position:\n" + line
        assert line[28:29] == " "
        assert line[29:40].lstrip() == str(len(record)), (
            "LOCUS line does not contain the length at the expected position:\n" + line
        )

        # Tests copied from Bio.GenBank.Scanner
        assert line[40:44] in [" bp ", " aa "], "LOCUS line does not contain size units at expected position:\n" + line
        assert line[44:47] in ["   ", "ss-", "ds-", "ms-"], (
            "LOCUS line does not have valid strand type (Single stranded, ...):\n" + line
        )
        mol_field = line[47:54].strip()
        assert mol_field == "" or "DNA" in mol_field or "RNA" in mol_field, (
            "LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n" + line
        )
        assert line[54:55] == " ", "LOCUS line does not contain space at position 55:\n" + line
        assert line[55:63].strip() in ["", "linear", "circular"], (
            "LOCUS line does not contain valid entry (linear, circular, ...):\n" + line
        )
        assert line[63:64] == " ", "LOCUS line does not contain space at position 64:\n" + line
        assert line[67:68] == " ", "LOCUS line does not contain space at position 68:\n" + line
        assert line[70:71] == "-", "LOCUS line does not contain - at position 71 in date:\n" + line
        assert line[74:75] == "-", "LOCUS line does not contain - at position 75 in date:\n" + line

        self.handle.write(line)
Ejemplo n.º 55
0
    def __add__(self, other):
        """Combine two alignments with the same number of rows by adding them.

        Two multiple sequence alignments (MSAs) can be joined either by row
        or by column. The extend method adds by row, while the addition
        operator implemented here adds by column. For example,

        >>> from Bio.Alphabet import generic_dna
        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a1 = SeqRecord(Seq("AAAAC", generic_dna), id="Alpha")
        >>> b1 = SeqRecord(Seq("AAA-C", generic_dna), id="Beta")
        >>> c1 = SeqRecord(Seq("AAAAG", generic_dna), id="Gamma")
        >>> a2 = SeqRecord(Seq("GT", generic_dna), id="Alpha")
        >>> b2 = SeqRecord(Seq("GT", generic_dna), id="Beta")
        >>> c2 = SeqRecord(Seq("GT", generic_dna), id="Gamma")
        >>> left = MultipleSeqAlignment([a1, b1, c1],
        ...                             annotations={"tool": "demo", "name": "start"})
        >>> right = MultipleSeqAlignment([a2, b2, c2],
        ...                             annotations={"tool": "demo", "name": "end"})

        Now, let's look at these two alignments:

        >>> print(left)
        DNAAlphabet() alignment with 3 rows and 5 columns
        AAAAC Alpha
        AAA-C Beta
        AAAAG Gamma
        >>> print(right)
        DNAAlphabet() alignment with 3 rows and 2 columns
        GT Alpha
        GT Beta
        GT Gamma

        And add them:

        >>> combined = left + right
        >>> print(combined)
        DNAAlphabet() alignment with 3 rows and 7 columns
        AAAACGT Alpha
        AAA-CGT Beta
        AAAAGGT Gamma

        For this to work, both alignments must have the same number of records (here
        they both have 3 rows):

        >>> len(left)
        3
        >>> len(right)
        3
        >>> len(combined)
        3

        Rows are paired by position and each pair of SeqRecord objects is added
        together; see the SeqRecord documentation for how per-record annotation
        is handled. In this example both alignments share the same row names, so
        the combined rows keep those names.

        Alignment annotations present in both operands with equal values are kept;
        anything that differs is dropped, to avoid accidentally propagating
        inappropriate values:

        >>> combined.annotations
        {'tool': 'demo'}

        Raises NotImplementedError if other is not a MultipleSeqAlignment and
        ValueError if the two alignments have different numbers of rows.
        """
        if not isinstance(other, MultipleSeqAlignment):
            raise NotImplementedError
        if len(self) != len(other):
            raise ValueError("When adding two alignments they must have the same length"
                             " (i.e. same number or rows)")
        consensus = Alphabet._consensus_alphabet([self._alphabet, other._alphabet])
        # Lazily join the rows pairwise, in order
        joined_rows = (mine + theirs for mine, theirs in zip(self, other))
        # Keep only annotation entries that both alignments agree on
        shared = {key: value
                  for key, value in self.annotations.items()
                  if key in other.annotations and other.annotations[key] == value}
        return MultipleSeqAlignment(joined_rows, consensus, shared)
Ejemplo n.º 56
0
    def __init__(self, records, alphabet=None,
                 annotations=None):
        """Initialize a new MultipleSeqAlignment object.

        Arguments:
         - records - A list (or iterator) of SeqRecord objects, whose
                     sequences are all the same length.  This may be an
                     empty list.
         - alphabet - The alphabet for the whole alignment, typically a gapped
                      alphabet, which should be a super-set of the individual
                      record alphabets.  If omitted, a consensus alphabet is
                      used.
         - annotations - Information about the whole alignment (dictionary).

        Raises ValueError if the alphabet argument is not an alphabet object
        (or if an alphabet is passed as records while alphabet is also given),
        and TypeError if annotations is neither None nor a dict.

        You would normally load a MSA from a file using Bio.AlignIO, but you
        can do this from a list of SeqRecord objects too:

        >>> from Bio.Alphabet import generic_dna
        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
        >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
        >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
        >>> align = MultipleSeqAlignment([a, b, c], annotations={"tool": "demo"})
        >>> print(align)
        DNAAlphabet() alignment with 3 rows and 7 columns
        AAAACGT Alpha
        AAA-CGT Beta
        AAAAGGT Gamma
        >>> align.annotations
        {'tool': 'demo'}

        NOTE - The older Bio.Align.Generic.Alignment class only accepted a
        single argument, an alphabet.  This is still supported via a backwards
        compatible "hack" so as not to disrupt existing scripts and users, but
        is deprecated and will be removed in a future release.
        """
        alphabet_types = (Alphabet.Alphabet, Alphabet.AlphabetEncoder)

        if isinstance(records, alphabet_types):
            # Legacy call style: the alphabet was passed as the sole argument.
            if alphabet is not None:
                raise ValueError("Invalid records argument")
            # TODO - Remove this backwards compatible mode!
            alphabet = records
            records = []
            import warnings
            from Bio import BiopythonDeprecationWarning
            warnings.warn("Invalid records argument: While the old "
                          "Bio.Align.Generic.Alignment class only "
                          "accepted a single argument (the alphabet), the "
                          "newer Bio.Align.MultipleSeqAlignment class "
                          "expects a list/iterator of SeqRecord objects "
                          "(which can be an empty list) and an optional "
                          "alphabet argument", BiopythonDeprecationWarning)

        if alphabet is None:
            # Placeholder while the records are added; a consensus alphabet
            # replaces it below once the records are known.
            self._alphabet = Alphabet.single_letter_alphabet
        elif isinstance(alphabet, alphabet_types):
            self._alphabet = alphabet
        else:
            raise ValueError("Invalid alphabet argument")

        self._records = []
        if records:
            self.extend(records)
            if alphabet is None:
                # No alphabet was given, so derive one from the stored rows
                record_alphabets = (row.seq.alphabet for row in self._records
                                    if row.seq is not None)
                self._alphabet = Alphabet._consensus_alphabet(record_alphabets)

        # Annotations about the whole alignment
        if annotations is not None and not isinstance(annotations, dict):
            raise TypeError("annotations argument should be a dict")
        self.annotations = {} if annotations is None else annotations
Ejemplo n.º 57
0
    def __add__(self, other):
        """Combine two alignments with the same number of rows by adding them.

        If you have two multiple sequence alignments (MSAs), there are two ways to think
        about adding them - by row or by column. Using the extend method adds by row.
        Using the addition operator adds by column. For example,

        >>> from Bio.Alphabet import generic_dna
        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from Bio.Align import MultipleSeqAlignment
        >>> a1 = SeqRecord(Seq("AAAAC", generic_dna), id="Alpha")
        >>> b1 = SeqRecord(Seq("AAA-C", generic_dna), id="Beta")
        >>> c1 = SeqRecord(Seq("AAAAG", generic_dna), id="Gamma")
        >>> a2 = SeqRecord(Seq("GT", generic_dna), id="Alpha")
        >>> b2 = SeqRecord(Seq("GT", generic_dna), id="Beta")
        >>> c2 = SeqRecord(Seq("GT", generic_dna), id="Gamma")
        >>> left = MultipleSeqAlignment([a1, b1, c1])
        >>> right = MultipleSeqAlignment([a2, b2, c2])

        Now, let's look at these two alignments:

        >>> print(left)
        DNAAlphabet() alignment with 3 rows and 5 columns
        AAAAC Alpha
        AAA-C Beta
        AAAAG Gamma
        >>> print(right)
        DNAAlphabet() alignment with 3 rows and 2 columns
        GT Alpha
        GT Beta
        GT Gamma

        And add them:

        >>> print(left + right)
        DNAAlphabet() alignment with 3 rows and 7 columns
        AAAACGT Alpha
        AAA-CGT Beta
        AAAAGGT Gamma

        For this to work, both alignments must have the same number of records (here
        they both have 3 rows):

        >>> len(left)
        3
        >>> len(right)
        3

        The individual rows are SeqRecord objects, and these can be added together. Refer
        to the SeqRecord documentation for details of how the annotation is handled. This
        example is a special case in that both original alignments shared the same names,
        meaning when the rows are added they also get the same name.

        Raises NotImplementedError if other is not a MultipleSeqAlignment, and
        ValueError if the two alignments have a different number of rows.
        """
        if not isinstance(other, MultipleSeqAlignment):
            raise NotImplementedError
        if len(self) != len(other):
            raise ValueError("When adding two alignments they must have the same length"
                             " (i.e. same number or rows)")
        # The result must be able to hold letters from both inputs
        alpha = Alphabet._consensus_alphabet([self._alphabet, other._alphabet])
        # Rows are paired by position and concatenated lazily; the new
        # alignment consumes the generator when it is constructed.
        merged = (left + right for left, right in zip(self, other))
        return MultipleSeqAlignment(merged, alpha)
Ejemplo n.º 58
0
def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False,
                     monoisotopic=False):
    """Calculate the molecular weight of a DNA, RNA or protein sequence.

    Only unambiguous letters are allowed. Nucleotide sequences are assumed to
    have a 5' phosphate.

        - seq: String or Biopython sequence object.
        - seq_type: The default (None) is to take the alphabet from the seq argument,
          or assume DNA if the seq argument is a string. Override this with
          a string 'DNA', 'RNA', or 'protein'.
        - double_stranded: Calculate the mass for the double stranded molecule?
        - circular: Is the molecule circular (has no ends)?
        - monoisotopic: Use the monoisotopic mass tables?

    Returns the weight as a number. Raises TypeError if seq is neither a
    string nor a Seq/MutableSeq object (or carries an invalid alphabet), and
    ValueError for an unknown or contradictory seq_type, for letters missing
    from the selected weight table, or for a double stranded protein.

    Note that for backwards compatibility, if the seq argument is a string,
    or Seq object with a generic alphabet, and no seq_type is specified
    (i.e. left as None), then DNA is assumed.

    >>> from Bio.Seq import Seq
    >>> print("%0.2f" % molecular_weight("AGC"))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
    949.61

    However, it is better to be explicit - for example with strings:

    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
    949.61
    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
    997.61
    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
    249.29

    An explicit seq_type is also honoured for a Seq object with a generic
    alphabet, since such an alphabet does not say what kind of molecule it is:

    >>> print("%0.2f" % molecular_weight(Seq("AGC"), "RNA"))
    997.61

    Or, with the sequence alphabet:

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna)))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna)))
    997.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein)))
    249.29

    Also note that contradictory sequence alphabets and seq_type will also
    give an exception:

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import generic_dna
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA"))
    Traceback (most recent call last):
      ...
    ValueError: seq_type='RNA' contradicts DNA from seq alphabet

    """
    # Rewritten by Markus Piotrowski, 2014

    # Find the alphabet type
    if isinstance(seq, (Seq, MutableSeq)):
        base_alphabet = Alphabet._get_base_alphabet(seq.alphabet)
        if isinstance(base_alphabet, Alphabet.DNAAlphabet):
            tmp_type = 'DNA'
        elif isinstance(base_alphabet, Alphabet.RNAAlphabet):
            tmp_type = 'RNA'
        elif isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            tmp_type = 'protein'
        elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein):
            tmp_type = 'protein'
            # Convert to one-letter sequence. Have to use a string for seq1
            seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet())
        elif not isinstance(base_alphabet, Alphabet.Alphabet):
            raise TypeError("%s is not a valid alphabet for mass calculations"
                            % base_alphabet)
        else:
            # A generic alphabet does not identify the molecule, so an
            # explicit seq_type wins; DNA is only the backward compatible
            # default when the caller gave no seq_type.
            tmp_type = seq_type or "DNA"
        # tmp_type is always set by this point, so only seq_type needs testing
        if seq_type and tmp_type != seq_type:
            raise ValueError("seq_type=%r contradicts %s from seq alphabet"
                             % (seq_type, tmp_type))
        seq_type = tmp_type
    elif isinstance(seq, str):
        if seq_type is None:
            seq_type = "DNA"  # backward compatibility
    else:
        raise TypeError("Expected a string or Seq object, not seq=%r" % seq)

    seq = ''.join(str(seq).split()).upper()  # Do the minimum formatting

    if seq_type == 'DNA':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_unambiguous_dna_weights
        else:
            weight_table = IUPACData.unambiguous_dna_weights
    elif seq_type == 'RNA':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_unambiguous_rna_weights
        else:
            weight_table = IUPACData.unambiguous_rna_weights
    elif seq_type == 'protein':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_protein_weights
        else:
            weight_table = IUPACData.protein_weights
    else:
        raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r"
                         % seq_type)

    if monoisotopic:
        water = 18.010565
    else:
        water = 18.0153

    # One water is subtracted per bond between neighbouring residues; a
    # circular molecule has one extra bond closing the ring.
    try:
        weight = sum(weight_table[x] for x in seq) - (len(seq) - 1) * water
    except KeyError as e:
        raise ValueError('%s is not a valid unambiguous letter for %s'
                         % (e, seq_type))
    if circular:
        weight -= water

    if seq_type in ('DNA', 'RNA') and double_stranded:
        # Add the mass of the complementary strand, computed the same way
        seq = str(Seq(seq).complement())
        weight += sum(weight_table[x] for x in seq) - (len(seq) - 1) * water
        if circular:
            weight -= water
    elif seq_type == 'protein' and double_stranded:
        raise ValueError('double-stranded proteins await their discovery')

    return weight
Ejemplo n.º 59
0
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.

    Arguments:
     - handle - File handle positioned anywhere; it is rewound to the start.
       If it has a ``mode`` attribute, the mode must be 'rb'.
     - alphabet - Optional alphabet for the sequence. Protein and RNA
       alphabets are rejected. If omitted, an ambiguous or unambiguous DNA
       alphabet is chosen depending on the letters in the base calls.
     - trim - If true, the record is passed through ``_abi_trim`` before
       being yielded.

    Yields a single SeqRecord, or nothing at all for an empty file.

    Raises ValueError for an unsuitable alphabet or handle mode, and IOError
    if the file does not start with the ABIF marker. As this is a generator,
    those errors surface when iteration starts, not when it is created.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # Handle an empty file gracefully. A plain return ends the generator;
        # raising StopIteration here would be turned into RuntimeError on
        # Python 3.7+ (PEP 479).
        return
    if marker != _as_bytes('ABIF'):
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': '', }

    # initialize annotations: every extracted field defaults to None
    annot = dict.fromkeys(_EXTRACT.values())

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT,
                           handle.read(struct.calcsize(_HEADFMT)))

    # NOTE(review): seq, qual and sample_id are only assigned when the
    # PBAS2, PCON2 and SMPL1 tags are seen; a file lacking one of them fails
    # with an unbound-name error when the record is built below.
    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        # TODO - stop iteration once all desired tags have been extracted
        # (the tags from _EXTRACT, the time tags, and seq, qual, id)

        key = tag_name + str(tag_number)

        # PBAS2 is base-called sequence
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        elif key in _EXTRACT:
            # extract sequence annotation as defined in _EXTRACT
            annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # use the file name as SeqRecord.name if available
    try:
        file_name = basename(handle.name).replace('.ab1', '')
    except Exception:
        # Best effort only: the handle may have no usable name.
        file_name = ""

    record = SeqRecord(Seq(seq, alphabet),
                       id=sample_id, name=file_name,
                       description='',
                       annotations=annot,
                       letter_annotations={'phred_quality': qual})

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
Ejemplo n.º 60
0
    def information_content(self, start=0,
                            end=None,
                            e_freq_table=None, log_base=2,
                            chars_to_ignore=None):
        """Calculate the information content for each residue along an alignment.

        Arguments:
            - start, end - The starting and ending points to calculate the
              information content. These points should be relative to the first
              sequence in the alignment, starting at zero (ie. even if the 'real'
              first position in the seq is 203 in the initial sequence, for
              the info content, we need to use zero). This defaults to the entire
              length of the first sequence.
            - e_freq_table - A FreqTable object specifying the expected frequencies
              for each letter in the alphabet we are using (e.g. {'G' : 0.4,
              'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
              included, since these should not have expected frequencies.
            - log_base - The base of the logarithm to use in calculating the
              information content. This defaults to 2 so the info is in bits.
            - chars_to_ignore - A listing of characters which should be ignored
              in calculating the info content. Defaults to ignoring nothing.

        Returns:
            - A number representing the info content for the specified region.

        As a side effect, the per-column scores are stored in self.ic_vector,
        indexed by column number.

        Raises ValueError if start or end fall outside the first sequence, if
        e_freq_table is not a FreqTable object, or if no e_freq_table is given
        and the alignment alphabet is neither nucleotide nor protein.

        Please see the Biopython manual for more information on how information
        content is calculated.
        """
        # A None default avoids sharing one mutable list between calls
        if chars_to_ignore is None:
            chars_to_ignore = []

        seq_length = len(self.alignment._records[0].seq)

        # if no end was specified, then we default to the end of the sequence
        if end is None:
            end = seq_length

        if start < 0 or end > seq_length:
            raise ValueError("Start (%s) and end (%s) are not in the "
                             "range %s to %s"
                             % (start, end, 0, seq_length))

        # determine random expected frequencies, if necessary
        random_expected = None
        if not e_freq_table:
            # TODO - What about ambiguous alphabets?
            base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
            if isinstance(base_alpha, Alphabet.ProteinAlphabet):
                random_expected = Protein20Random
            elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
                random_expected = Nucleotide4Random
            else:
                errstr = "Error in alphabet: not Nucleotide or Protein, "
                errstr += "supply expected frequencies"
                raise ValueError(errstr)
        elif not isinstance(e_freq_table, FreqTable.FreqTable):
            raise ValueError("e_freq_table should be a FreqTable object")

        # determine all of the letters we have to deal with
        all_letters = self._get_all_letters()
        for char in chars_to_ignore:
            all_letters = all_letters.replace(char, '')

        # score each column in the requested range
        info_content = {}
        for residue_num in range(start, end):
            freq_dict = self._get_letter_freqs(residue_num,
                                               self.alignment._records,
                                               all_letters, chars_to_ignore)
            column_score = self._get_column_info_content(freq_dict,
                                                         e_freq_table,
                                                         log_base,
                                                         random_expected)
            info_content[residue_num] = column_score

        # sum up the score
        total_info = sum(info_content.values())
        # fill in the ic_vector member: holds IC for each column
        for residue_num, column_score in info_content.items():
            self.ic_vector[residue_num] = column_score
        return total_info