def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert an alignment file from one format to another.

    Arguments:
     - in_file - an input handle or filename
     - in_format - input file format, lower case string
     - out_file - an output handle or filename
     - out_format - output file format, lower case string
     - alphabet - optional alphabet to assume

    Returns the number of alignments, i.e. the value returned by write().

    **NOTE** - If you provide an output filename, it will be opened, which
    will overwrite any existing file without warning. This may happen even
    if the conversion is aborted (e.g. an invalid out_format name is given).
    """
    # TODO - Add optimised versions of important conversions.
    # For now the work is simply handed to parse() and write().
    with as_handle(in_file, 'rU') as source:
        # Build the alignment iterator first; the output is only opened
        # once the input side has been set up.
        parsed = parse(source, in_format, None, alphabet)

        # write() checks out_format and reports errors, but by then the
        # output has already been opened, which is a shame.
        with as_handle(out_file, 'w') as sink:
            return write(parsed, sink, out_format)
def convert(in_file, in_format, out_file, out_format, alphabet=None):
    """Convert between two sequence file formats, return number of records.

    Arguments:
     - in_file - an input handle or filename
     - in_format - input file format, lower case string
     - out_file - an output handle or filename
     - out_format - output file format, lower case string
     - alphabet - optional alphabet to assume

    Returns the record count reported by the underlying conversion.

    **NOTE** - If you provide an output filename, it will be opened, which
    will overwrite any existing file without warning. This may happen even
    if the conversion is aborted (e.g. an invalid out_format name is given).

    For example, going from a filename to a handle:

    >>> from anarci.Bio import SeqIO
    >>> try:
    ...     from StringIO import StringIO  # Python 2
    ... except ImportError:
    ...     from io import StringIO  # Python 3
    ...
    >>> handle = StringIO("")
    >>> SeqIO.convert("Quality/example.fastq", "fastq", handle, "fasta")
    3
    >>> print(handle.getvalue())
    >EAS54_6_R1_2_1_413_324
    CCCTTCTTGTCTTCAGCGTTTCTCC
    >EAS54_6_R1_2_1_540_792
    TTGGCAGGCCAAGGCCGATGGATCA
    >EAS54_6_R1_2_1_443_348
    GTTGCTTCTGGCGTGGGTGGGGGGG
    <BLANKLINE>
    """
    # Formats listed in _BinaryFormats are read in binary mode, everything
    # else in text mode.
    if in_format in _BinaryFormats:
        in_mode = 'rb'
    else:
        in_mode = 'rU'

    # Pick the output mode from the same _BinaryFormats table used for the
    # input mode, so the two sides cannot drift apart. The literal names
    # that were special-cased before are kept so that every format which
    # previously selected binary output still does.
    if out_format in _BinaryFormats or out_format in ("sff", "sff_trim"):
        out_mode = 'wb'
    else:
        out_mode = 'w'

    # The conversion helper checks the arguments and issues error messages,
    # but only after both files have been opened, which is a shame.
    from ._convert import _handle_convert  # Lazy import
    with as_handle(in_file, in_mode) as in_handle:
        with as_handle(out_file, out_mode) as out_handle:
            count = _handle_convert(in_handle, in_format,
                                    out_handle, out_format,
                                    alphabet)
    return count
def write(alignments, handle, format):
    """Write complete set of alignments to a file.

    Arguments:
     - alignments - A list (or iterator) of Alignment objects (ideally the
       new MultipleSeqAlignment objects), or (if using Biopython 1.54 or
       later) a single alignment object.
     - handle - File handle object to write to, or filename as string
       (note older versions of Biopython only took a handle).
     - format - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of alignments written (as an integer).

    Raises TypeError if format is not a string, or if an element of
    alignments is not an Alignment when writing through SeqIO. Raises
    ValueError if format is empty, is not lower case, is read-only, or is
    unknown. The format is fully validated before the output is opened, so
    an unsupported format never creates or truncates the output file.
    """
    from anarci.Bio import SeqIO

    # Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    # Reject formats we cannot write BEFORE touching the output, otherwise
    # a filename would be opened for writing (and clobbered) just to fail.
    if format not in _FormatToWriter and format not in SeqIO._FormatToWriter:
        if format in _FormatToIterator or format in SeqIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        raise ValueError("Unknown format '%s'" % format)

    if isinstance(alignments, Alignment):
        # This raised an exception in older versions of Biopython
        alignments = [alignments]

    with as_handle(handle, 'w') as fp:
        if format in _FormatToWriter:
            # Map the file format to a writer class
            writer_class = _FormatToWriter[format]
            count = writer_class(fp).write_file(alignments)
        else:
            # Only SeqIO can write this format (checked above).
            # Exploit the existing SeqIO writer to do the dirty work!
            # TODO - Can we make one call to SeqIO.write() and count the
            # alignments?
            count = 0
            for alignment in alignments:
                if not isinstance(alignment, Alignment):
                    raise TypeError("Expect a list or iterator of Alignment "
                                    "objects, got: %r" % alignment)
                SeqIO.write(alignment, fp, format)
                count += 1

    assert isinstance(count, int), "Internal error - the underlying %s " \
        "writer should have returned the alignment count, not %s" \
        % (format, repr(count))

    return count
def parse(handle, format=None, **kwargs):
    """Iterate over a search output file, yielding QueryResult objects.

     - handle - Handle to the file, or the filename as a string.
     - format - Lower case string denoting one of the supported formats.
     - kwargs - Format-specific keyword arguments.

    Use this function to loop over every query in a search output file:

    >>> from anarci.Bio import SearchIO
    >>> qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
    >>> qresults
    <generator object ...>
    >>> for qresult in qresults:
    ...     print("Search %s has %i hits" % (qresult.id, len(qresult)))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    Depending on the file format, `parse` may also accept additional keyword
    argument(s) that modify the behavior of the format parser. Here is a
    simple example, where the keyword argument enables parsing of a commented
    BLAST tabular output file:

    >>> from anarci.Bio import SearchIO
    >>> for qresult in SearchIO.parse('Blast/mirna.tab', 'blast-tab', comments=True):
    ...     print("Search %s has %i hits" % (qresult.id, len(qresult)))
    ...
    Search 33211 has 100 hits
    Search 33212 has 44 hits
    Search 33213 has 95 hits

    """
    # Fetch the iterator for this format (error checking is done there).
    make_iterator = get_processor(format, _ITERATOR_MAP)

    # HACK: force BLAST XML decoding to use utf-8
    needs_utf8 = format == 'blast-xml' and sys.version_info[0] > 2
    open_options = {'encoding': 'utf-8'} if needs_utf8 else {}

    # Open the source and hand each result on as it is produced.
    with as_handle(handle, 'rU', **open_options) as source_file:
        for query_result in make_iterator(source_file, **kwargs):
            yield query_result
def write(qresults, handle, format=None, **kwargs):
    """Write QueryResult objects to a file in the given format.

     - qresults - An iterator returning QueryResult objects or a single
                  QueryResult object.
     - handle   - Handle to the file, or the filename as a string.
     - format   - Lower case string denoting one of the supported formats.
     - kwargs   - Format-specific keyword arguments.

    The `write` function writes QueryResult object(s) into the given output
    handle / filename. It accepts either a single QueryResult object or an
    iterable returning one or more QueryResult objects. Either way, it
    returns a tuple of four values: the number of QueryResult, Hit, HSP, and
    HSPFragment objects written to the output file::

        from anarci.Bio import SearchIO
        qresults = SearchIO.parse('Blast/mirna.xml', 'blast-xml')
        SearchIO.write(qresults, 'results.tab', 'blast-tab')
        <stdout> (3, 239, 277, 277)

    The output of different formats may be adjusted using the format-specific
    keyword arguments. Here is an example that writes a BLAT PSL output file
    with a header::

        from anarci.Bio import SearchIO
        qresults = SearchIO.parse('Blat/psl_34_001.psl', 'blat-psl')
        SearchIO.write(qresults, 'results.tab', 'blat-psl', header=True)
        <stdout> (2, 13, 22, 26)

    """
    # A lone QueryResult is wrapped in a list so that everything below can
    # treat the input uniformly as an iterator.
    items = [qresults] if isinstance(qresults, QueryResult) else qresults
    result_iter = iter(items)

    # Look up the writer for this format (error checking is done there).
    make_writer = get_processor(format, _WRITER_MAP)

    # Write everything out; write_file reports how many qresults, hits,
    # hsps and fragments it handled.
    with as_handle(handle, 'w') as target_file:
        counts = make_writer(target_file, **kwargs).write_file(result_iter)
        qresult_count, hit_count, hsp_count, frag_count = counts

    return qresult_count, hit_count, hsp_count, frag_count
def parse(handle, format='fasta', alphabet=None):
    r"""Iterate over a sequence file, yielding SeqRecord objects.

     - handle   - handle to the file, or the filename as a string
                  (note older versions of Biopython only took a handle).
     - format   - lower case string describing the file format.
     - alphabet - optional Alphabet object, useful when the sequence type
                  cannot be automatically inferred from the file itself
                  (e.g. format="fasta" or "tab")

    Typical usage, opening a file to read in, and looping over the record(s):

    >>> from anarci.Bio import SeqIO
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta"):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet SingleLetterAlphabet()

    For file formats like FASTA where the alphabet cannot be determined, it
    may be useful to specify the alphabet explicitly:

    >>> from anarci.Bio import SeqIO
    >>> from anarci.Bio.Alphabet import generic_dna
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta", generic_dna):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet DNAAlphabet()

    If you have a string 'data' containing the file contents, you must
    first turn this into a handle in order to parse it:

    >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
    >>> from anarci.Bio import SeqIO
    >>> try:
    ...     from StringIO import StringIO  # Python 2
    ... except ImportError:
    ...     from io import StringIO  # Python 3
    ...
    >>> for record in SeqIO.parse(StringIO(data), "fasta"):
    ...     print("%s %s" % (record.id, record.seq))
    Alpha ACCGGATGTA
    Beta AGGCTCGGTTA

    Use the Bio.SeqIO.read(...) function when you expect a single record
    only.
    """
    # NOTE - The docstring above contains raw \n characters (needed for the
    # StringIO example), which is why it is a raw string - see the leading
    # r before the opening quote.

    from anarci.Bio import AlignIO

    # Argument checks, aiming for helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None \
            and not isinstance(alphabet, (Alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet, %r" % alphabet)

    with as_handle(handle, 'rU') as fp:
        if format in _FormatToIterator:
            # A native sequence iterator exists for this format.
            make_iterator = _FormatToIterator[format]
            if alphabet is None:
                records = make_iterator(fp)
            else:
                try:
                    # First try passing the alphabet straight through...
                    records = make_iterator(fp, alphabet=alphabet)
                except TypeError:
                    # ...and fall back to imposing it afterwards.
                    records = _force_alphabet(make_iterator(fp), alphabet)
        elif format in AlignIO._FormatToIterator:
            # Read alignments via Bio.AlignIO and flatten them into records.
            records = (entry
                       for aln in AlignIO.parse(fp, format, alphabet=alphabet)
                       for entry in aln)
        else:
            raise ValueError("Unknown format '%s'" % format)

        # Re-yield inside the with block so the handle stays open while
        # the caller is iterating.
        for item in records:
            yield item
def write(sequences, handle, format):
    """Write complete set of sequences to a file.

     - sequences - A list (or iterator) of SeqRecord objects, or (if using
                   Biopython 1.54 or later) a single SeqRecord.
     - handle    - File handle object to write to, or filename as string
                   (note older versions of Biopython only took a handle).
     - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of records written (as an integer).

    Raises TypeError if format is not a string, or if handle is a SeqRecord
    or a list (which suggests swapped arguments). Raises ValueError if
    format is empty, is not lower case, is read-only, or is unknown. The
    format is fully validated before the output is opened, so an unsupported
    format never creates or truncates the output file.
    """
    from anarci.Bio import AlignIO

    # Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    if isinstance(handle, SeqRecord):
        raise TypeError("Check arguments, handle should NOT be a SeqRecord")
    if isinstance(handle, list):
        # e.g. list of SeqRecord objects
        raise TypeError("Check arguments, handle should NOT be a list")

    # Reject formats we cannot write BEFORE touching the output, otherwise
    # a filename would be opened for writing (and clobbered) just to fail.
    if format not in _FormatToWriter \
            and format not in AlignIO._FormatToWriter:
        if format in _FormatToIterator \
                or format in AlignIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        raise ValueError("Unknown format '%s'" % format)

    if isinstance(sequences, SeqRecord):
        # This raised an exception in older versions of Biopython
        sequences = [sequences]

    # Formats listed in _BinaryFormats need a binary output handle.
    if format in _BinaryFormats:
        mode = 'wb'
    else:
        mode = 'w'

    with as_handle(handle, mode) as fp:
        if format in _FormatToWriter:
            # Map the file format to a writer class
            writer_class = _FormatToWriter[format]
            count = writer_class(fp).write_file(sequences)
        else:
            # Only AlignIO can write this format (checked above).
            # Try and turn all the records into a single alignment,
            # and write that using Bio.AlignIO
            alignment = MultipleSeqAlignment(sequences)
            alignment_count = AlignIO.write([alignment], fp, format)
            assert alignment_count == 1, \
                "Internal error - the underlying writer " \
                " should have returned 1, not %r" % alignment_count
            # The record count is the number of rows in that alignment.
            count = len(alignment)

    assert isinstance(count, int), "Internal error - the underlying %s " \
        "writer should have returned the record count, not %r" \
        % (format, count)

    return count
def parse(handle, format, seq_count=None, alphabet=None):
    """Iterate over an alignment file as MultipleSeqAlignment objects.

    Arguments:
     - handle    - handle to the file, or the filename as a string
                   (note older versions of Biopython only took a handle).
     - format    - string describing the file format.
     - seq_count - Optional integer, number of sequences expected in each
                   alignment. Recommended for fasta format files.
     - alphabet  - optional Alphabet object, useful when the sequence type
                   cannot be automatically inferred from the file itself
                   (e.g. fasta, phylip, clustal)

    This is a generator function, so the argument checks below only run
    (and only raise) once iteration starts. They raise TypeError if format
    is not a string or seq_count is not an integer, and ValueError if
    format is empty, not lower case or unknown, or if alphabet is invalid.

    If you have the file name in a string 'filename', use:

    >>> from anarci.Bio import AlignIO
    >>> filename = "Emboss/needle.txt"
    >>> format = "emboss"
    >>> for alignment in AlignIO.parse(filename, format):
    ...     print("Alignment of length %i" % alignment.get_alignment_length())
    Alignment of length 124
    Alignment of length 119
    Alignment of length 120
    Alignment of length 118
    Alignment of length 125

    If you have a string 'data' containing the file contents, use::

      from anarci.Bio import AlignIO
      try:
          from StringIO import StringIO  # Python 2
      except ImportError:
          from io import StringIO  # Python 3
      my_iterator = AlignIO.parse(StringIO(data), format)

    Use the Bio.AlignIO.read() function when you expect a single record only.
    """
    from anarci.Bio import SeqIO

    # Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None and not (isinstance(alphabet, Alphabet) or
                                     isinstance(alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet, %s" % repr(alphabet))
    if seq_count is not None and not isinstance(seq_count, int):
        raise TypeError("Need integer for seq_count (sequences per alignment)")

    with as_handle(handle, 'rU') as fp:
        # Map the file format to a sequence iterator:
        if format in _FormatToIterator:
            iterator_generator = _FormatToIterator[format]
            if alphabet is None:
                i = iterator_generator(fp, seq_count)
            else:
                try:
                    # Initially assume the optional alphabet argument
                    # is supported
                    i = iterator_generator(fp, seq_count, alphabet=alphabet)
                except TypeError:
                    # It isn't supported.
                    i = _force_alphabet(iterator_generator(fp, seq_count),
                                        alphabet)
        # Use the SeqIO module imported above; the name `TCRDB` is never
        # bound in this function, so going through it would fail with a
        # NameError instead of reaching this fallback.
        elif format in SeqIO._FormatToIterator:
            # Exploit the existing SeqIO parser to do the dirty work!
            i = _SeqIO_to_alignment_iterator(fp, format,
                                             alphabet=alphabet,
                                             seq_count=seq_count)
        else:
            raise ValueError("Unknown format '%s'" % format)

        # Re-yield inside the with block so the handle stays open while
        # the caller is iterating.
        # This imposes some overhead... wait until we drop Python 2.4 to
        # fix it
        for a in i:
            yield a