def write(alignments, handle, format):
    """Write a collection of alignments to a file and report how many.

    Arguments:
     - alignments - An iterable of Alignment objects, or one Alignment
       object on its own (it is wrapped in a list before writing).
     - handle - Whatever ``as_handle`` accepts for writing (the original
       documentation describes this as a file handle or a filename).
     - format - Lower case string naming the output file format.

    Returns the number of alignments written, as an integer.

    Raises TypeError if ``format`` is not a string, or if an item that is
    not an Alignment is met while writing through SeqIO.  Raises
    ValueError if ``format`` is empty, is not lower case, is a read-only
    format, or is not recognised at all.
    """
    from anarci.Bio import SeqIO

    # Validate the format argument up front so the caller gets a clear
    # message instead of an obscure lookup failure later on.
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    # Accept a bare alignment as shorthand for a one-element list.
    if isinstance(alignments, Alignment):
        alignments = [alignments]

    with as_handle(handle, 'w') as out:
        if format in _FormatToWriter:
            # A dedicated alignment writer exists; it reports the count.
            count = _FormatToWriter[format](out).write_file(alignments)
        elif format in SeqIO._FormatToWriter:
            # Fall back on SeqIO, writing one alignment per call and
            # keeping the tally ourselves.
            count = 0
            for count, item in enumerate(alignments, 1):
                if not isinstance(item, Alignment):
                    raise TypeError("Expect a list or iterator of Alignment "
                                    "objects, got: %r" % item)
                SeqIO.write(item, out, format)
        elif format in _FormatToIterator or format in SeqIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

    # Sanity check on the writer implementation, not on user input.
    assert isinstance(count, int), (
        "Internal error - the underlying %s "
        "writer should have returned the alignment count, not %s"
        % (format, repr(count)))

    return count
def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
    """Convert between two sequence formats using open handles (PRIVATE).

    Looks up ``(in_format, out_format)`` in the module-level ``_converter``
    table.  When a truthy entry is found it is called with the input
    handle, output handle and alphabet, and its result is returned.
    Otherwise the input is parsed with ``SeqIO.parse`` and the records are
    passed to ``SeqIO.write``, whose return value is returned.
    """
    try:
        shortcut = _converter[(in_format, out_format)]
    except KeyError:
        # No specialised converter registered for this pair.
        pass
    else:
        if shortcut:
            return shortcut(in_handle, out_handle, alphabet)

    # Generic route: parse everything, then write it back out.
    parsed = SeqIO.parse(in_handle, in_format, alphabet)
    return SeqIO.write(parsed, out_handle, out_format)
def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
    """Convert EMBL input to FASTA output without parsing features (PRIVATE).

    The ``alphabet`` argument is accepted but not used: it is not passed
    to the scanner or to the FASTA writer.  Returns whatever
    ``SeqIO.write`` returns.
    """
    from anarci.Bio.GenBank.Scanner import EmblScanner

    # Feature tables are skipped (do_features=False).
    scanner = EmblScanner()
    featureless_records = scanner.parse_records(in_handle, do_features=False)
    return SeqIO.write(featureless_records, out_handle, "fasta")
def _SeqIO_to_alignment_iterator(handle, format, alphabet=None, seq_count=None):
    """Yield MultipleSeqAlignment objects built from SeqIO records (PRIVATE).

    Arguments:
     - handle - handle to the file.
     - format - string describing the file format.
     - alphabet - optional Alphabet object, passed both to ``SeqIO.parse``
       and to each ``MultipleSeqAlignment`` created.
     - seq_count - optional integer, the number of records that make up
       each alignment.  When omitted (or otherwise falsy) every record in
       the file goes into one single alignment.

    Raises ValueError if ``seq_count`` is given and the records left over
    at the end of the file do not fill a complete alignment.
    """
    from anarci.Bio import SeqIO

    if not seq_count:
        # No batch size: treat the whole file as one alignment, and yield
        # nothing at all if the file held no records.
        everything = list(SeqIO.parse(handle, format, alphabet))
        if everything:
            yield MultipleSeqAlignment(everything, alphabet)
        return

    # Batch size known: emit an alignment each time a batch fills up.
    batch = []
    for entry in SeqIO.parse(handle, format, alphabet):
        batch.append(entry)
        if len(batch) != seq_count:
            continue
        yield MultipleSeqAlignment(batch, alphabet)
        batch = []

    # Anything still pending means the final batch came up short.
    if batch:
        raise ValueError("Check seq_count argument, not enough sequences?")
def __format__(self, format_spec): """Returns the record as a string in the specified file format. This method supports the python format() function added in Python 2.6/3.0. The format_spec should be a lower case string supported by Bio.SeqIO as an output file format. See also the SeqRecord's format() method. Under Python 3 please note that for binary formats a bytes string is returned, otherwise a (unicode) string is returned. """ if not format_spec: # Follow python convention and default to using __str__ return str(self) from anarci.Bio import SeqIO if format_spec in SeqIO._BinaryFormats: # Return bytes on Python 3 from io import BytesIO handle = BytesIO() else: from anarci.Bio._py3k import StringIO handle = StringIO() SeqIO.write(self, handle, format_spec) return handle.getvalue()
def _parse(handle):
    """Return the first record produced by the iterator ``i`` (PRIVATE).

    ``i`` and ``alphabet`` are not defined in this function; they are
    presumably bound in an enclosing scope that is not visible here.
    """
    try:
        # First attempt: pass the alphabet straight to the iterator.
        with_alphabet = i(handle, alphabet=alphabet)
        return next(with_alphabet)
    except TypeError:
        # On TypeError, retry without the keyword and apply the alphabet
        # afterwards through SeqIO._force_alphabet.
        forced = SeqIO._force_alphabet(i(handle), alphabet)
        return next(forced)