def __format__(self, format_spec):
    """Return the alignment rendered as a string in the given file format.

    This hooks the object into the built-in format() function (available
    from Python 2.6/3.0). The format_spec should be a lower case string
    naming an output file format supported by Bio.AlignIO. An empty
    format_spec falls back to str(self).

    See also the alignment's format() method.
    """
    if not format_spec:
        # Python convention: an empty format spec behaves like __str__
        return str(self)

    from SAP.Bio._py3k import StringIO
    from SAP.Bio import AlignIO

    # Write into an in-memory buffer and hand back its contents
    buffer = StringIO()
    AlignIO.write([self], buffer, format_spec)
    return buffer.getvalue()
def write(sequences, handle, format):
    """Write a complete set of sequences to a file.

     - sequences - A list (or iterator) of SeqRecord objects, or (if using
                   Biopython 1.54 or later) a single SeqRecord.
     - handle    - File handle object to write to, or filename as string
                   (note older versions of Biopython only took a handle).
     - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of records written (as an integer).

    Raises TypeError if format is not a string, and ValueError if it is
    empty, not lower case, read-only, or unknown.
    """
    from SAP.Bio import AlignIO

    # Check the format argument first so mistakes give helpful messages
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    if isinstance(sequences, SeqRecord):
        # A lone record is accepted (older versions of Biopython raised
        # an exception here); treat it as a one-element list
        sequences = [sequences]

    mode = 'wb' if format in _BinaryFormats else 'w'

    with as_handle(handle, mode) as fp:
        if format in _FormatToWriter:
            # Native SeqIO writer registered for this format
            count = _FormatToWriter[format](fp).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            # Bundle all the records into a single alignment and let
            # Bio.AlignIO write it
            alignment = MultipleSeqAlignment(sequences)
            written = AlignIO.write([alignment], fp, format)
            assert written == 1, \
                "Internal error - the underlying writer " \
                " should have returned 1, not %s" % repr(written)
            count = len(alignment)
            del written, alignment
        elif format in _FormatToIterator \
                or format in AlignIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

        assert isinstance(count, int), "Internal error - the underlying %s " \
            "writer should have returned the record count, not %s" \
            % (format, repr(count))

    return count
def write(sequences, handle, format):
    """Write a complete set of sequences to a file.

     - sequences - A list (or iterator) of SeqRecord objects, or (if using
                   Biopython 1.54 or later) a single SeqRecord.
     - handle    - File handle object to write to, or filename as string
                   (note older versions of Biopython only took a handle).
     - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of records written (as an integer).

    Raises TypeError if format is not a string, and ValueError if it is
    empty, not lower case, read-only, or unknown.
    """
    from SAP.Bio import AlignIO

    # Check the format argument first so mistakes give helpful messages
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    if isinstance(sequences, SeqRecord):
        # A lone record is accepted (older versions of Biopython raised
        # an exception here); treat it as a one-element list
        sequences = [sequences]

    mode = 'wb' if format in _BinaryFormats else 'w'

    with as_handle(handle, mode) as fp:
        if format in _FormatToWriter:
            # Native SeqIO writer registered for this format
            count = _FormatToWriter[format](fp).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            # Bundle all the records into a single alignment and let
            # Bio.AlignIO write it
            alignment = MultipleSeqAlignment(sequences)
            written = AlignIO.write([alignment], fp, format)
            assert written == 1, \
                "Internal error - the underlying writer " \
                " should have returned 1, not %s" % repr(written)
            count = len(alignment)
            del written, alignment
        elif format in _FormatToIterator \
                or format in AlignIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

        assert isinstance(count, int), "Internal error - the underlying %s " \
            "writer should have returned the record count, not %s" \
            % (format, repr(count))

    return count
if __name__ == "__main__":
    # Command-line demo: build a StructureAlignment from a FASTA alignment
    # (expected to hold two sequences) and the first model of two PDB files.
    import sys
    from SAP.Bio.Alphabet import generic_protein
    from SAP.Bio import AlignIO
    from SAP.Bio.PDB import PDBParser

    if len(sys.argv) != 4:
        print("Expects three arguments,")
        print(" - FASTA alignment filename (expect two sequences)")
        print(" - PDB file one")
        print(" - PDB file two")
        sys.exit()

    # The alignment; the with-block closes the file once it has been
    # read (the original left the handle open)
    with open(sys.argv[1]) as handle:
        fa = AlignIO.read(handle, "fasta", generic_protein)

    pdb_file1 = sys.argv[2]
    pdb_file2 = sys.argv[3]

    # The structures
    p = PDBParser()
    s1 = p.get_structure('1', pdb_file1)
    p = PDBParser()
    s2 = p.get_structure('2', pdb_file2)

    # Get the models
    m1 = s1[0]
    m2 = s2[0]

    al = StructureAlignment(fa, m1, m2)
def parse(handle, format, alphabet=None):
    r"""Turn a sequence file into an iterator returning SeqRecords.

     - handle   - handle to the file, or the filename as a string
                  (note older versions of Biopython only took a handle).
     - format   - lower case string describing the file format.
     - alphabet - optional Alphabet object, useful when the sequence type
                  cannot be automatically inferred from the file itself
                  (e.g. format="fasta" or "tab")

    Typical usage, opening a file to read in, and looping over the record(s):

    >>> from SAP.Bio import SeqIO
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta"):
    ...     print("ID %s" % record.id)
    ...     print("Sequence length %i" % len(record))
    ...     print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet SingleLetterAlphabet()

    For file formats like FASTA where the alphabet cannot be determined, it
    may be useful to specify the alphabet explicitly:

    >>> from SAP.Bio import SeqIO
    >>> from SAP.Bio.Alphabet import generic_dna
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta", generic_dna):
    ...     print("ID %s" % record.id)
    ...     print("Sequence length %i" % len(record))
    ...     print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet DNAAlphabet()

    If you have a string 'data' containing the file contents, you must
    first turn this into a handle in order to parse it:

    >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
    >>> from SAP.Bio import SeqIO
    >>> try:
    ...     from StringIO import StringIO # Python 2
    ... except ImportError:
    ...     from io import StringIO # Python 3
    ...
    >>> for record in SeqIO.parse(StringIO(data), "fasta"):
    ...     print("%s %s" % (record.id, record.seq))
    Alpha ACCGGATGTA
    Beta AGGCTCGGTTA

    Use the Bio.SeqIO.read(...) function when you expect a single record
    only.
    """
    # NOTE - The docstring above contains raw \n characters needed for the
    # StringIO example, hence the whole docstring is in raw string mode
    # (see the leading r before the opening quote).
    from SAP.Bio import AlignIO

    # Hack for SFF, will need to make this more general in future
    mode = 'rb' if format in _BinaryFormats else 'rU'

    # Check the arguments so mistakes give helpful error messages
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None \
            and not isinstance(alphabet, (Alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet, %s" % repr(alphabet))

    with as_handle(handle, mode) as fp:
        if format in _FormatToIterator:
            # Native SeqIO iterator registered for this format
            make_iterator = _FormatToIterator[format]
            if alphabet is None:
                records = make_iterator(fp)
            else:
                try:
                    records = make_iterator(fp, alphabet=alphabet)
                except TypeError:
                    # The iterator takes no alphabet argument, so apply
                    # the alphabet to its output instead
                    records = _force_alphabet(make_iterator(fp), alphabet)
        elif format in AlignIO._FormatToIterator:
            # Use Bio.AlignIO to read the alignments, then flatten them
            # into a single stream of records
            records = (record
                       for alignment in AlignIO.parse(fp, format,
                                                      alphabet=alphabet)
                       for record in alignment)
        else:
            raise ValueError("Unknown format '%s'" % format)

        # Yield from inside the with-block so the handle is still open
        # while the caller consumes the records. This imposes some
        # overhead... wait until we drop Python 2.4 to fix it
        for record in records:
            yield record
for pos in sorted(summary_info.ic_vector): fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos], summary_info.ic_vector[pos])) if __name__ == "__main__": print("Quick test") from SAP.Bio import AlignIO from SAP.Bio.Align.Generic import Alignment filename = "../../Tests/GFF/multi.fna" format = "fasta" expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25}, FreqTable.FREQ, IUPAC.unambiguous_dna) alignment = AlignIO.read(open(filename), format) for record in alignment: print(record.seq) print("="*alignment.get_alignment_length()) summary = SummaryInfo(alignment) consensus = summary.dumb_consensus(ambiguous="N") print(consensus) consensus = summary.gap_consensus(ambiguous="N") print(consensus) print("") print(summary.pos_specific_score_matrix(chars_to_ignore=['-'], axis_seq=consensus)) print("") #Have a generic alphabet, without a declared gap char, so must tell #provide the frequencies and chars to ignore explicitly.
if __name__ == "__main__":
    # Command-line demo: build a StructureAlignment from a FASTA alignment
    # (expected to hold two sequences) and the first model of two PDB files.
    import sys
    from SAP.Bio.Alphabet import generic_protein
    from SAP.Bio import AlignIO
    from SAP.Bio.PDB import PDBParser

    if len(sys.argv) != 4:
        print("Expects three arguments,")
        print(" - FASTA alignment filename (expect two sequences)")
        print(" - PDB file one")
        print(" - PDB file two")
        sys.exit()

    # The alignment; the with-block closes the file once it has been
    # read (the original left the handle open)
    with open(sys.argv[1]) as handle:
        fa = AlignIO.read(handle, "fasta", generic_protein)

    pdb_file1 = sys.argv[2]
    pdb_file2 = sys.argv[3]

    # The structures
    p = PDBParser()
    s1 = p.get_structure('1', pdb_file1)
    p = PDBParser()
    s2 = p.get_structure('2', pdb_file2)

    # Get the models
    m1 = s1[0]
    m2 = s2[0]

    al = StructureAlignment(fa, m1, m2)
if __name__ == "__main__":
    # Quick manual test: read a FASTA alignment, print its records and
    # consensus sequences, then its position specific score matrix.
    print("Quick test")
    from SAP.Bio import AlignIO
    from SAP.Bio.Align.Generic import Alignment

    filename = "../../Tests/GFF/multi.fna"
    format = "fasta"
    expected = FreqTable.FreqTable({
        "A": 0.25,
        "G": 0.25,
        "T": 0.25,
        "C": 0.25
    }, FreqTable.FREQ, IUPAC.unambiguous_dna)

    # The with-block closes the file once the alignment has been read
    # (the original left the handle open)
    with open(filename) as handle:
        alignment = AlignIO.read(handle, format)

    for record in alignment:
        print(record.seq)
    print("=" * alignment.get_alignment_length())

    summary = SummaryInfo(alignment)
    consensus = summary.dumb_consensus(ambiguous="N")
    print(consensus)
    consensus = summary.gap_consensus(ambiguous="N")
    print(consensus)
    print("")
    print(
        summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                          axis_seq=consensus))
    print("")
    #Have a generic alphabet, without a declared gap char, so must tell