def test_fastq(self):
    """It guesses the format for the illumina and sanger fastq files.

    Four inputs are checked: a one-line-per-field illumina record, a
    multiline illumina file, a truncated file for which UnknownFormatError
    is expected, and a record whose qualities are all "0" (the sanger case).
    """
    # The quality strings contain a literal backslash; it is written as an
    # escaped backslash so that it is not parsed as an invalid escape.
    txt = "@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
    txt += "TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n"
    txt += "+HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
    txt += "efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\\]][]\n"
    fhand = StringIO(txt)
    assert guess_format(fhand) == "fastq-illumina"

    # multiline: the sequence and the qualities span two lines each
    txt = "@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
    txt += "TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n"
    txt += "TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n"
    txt += "+HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
    txt += "efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\\]][]\n"
    txt += "efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\\]][]\n"
    fhand = StringIO(txt + txt)
    assert guess_format(fhand) == "fastq-illumina-multiline"

    # truncated file: the call itself has to raise, so there is no return
    # value worth comparing
    fhand = StringIO("@HWI-EAS209\n@")
    try:
        guess_format(fhand)
        self.fail("UnknownFormatError expected")
    except UnknownFormatError:
        pass

    # sanger
    txt = "@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
    txt += "TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n"
    txt += "+HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
    txt += "000000000000000000000000000000000000000000000000000000000000\n"
    fhand = StringIO(txt)
    assert guess_format(fhand) == "fastq"
def test_fastq(self):
    '''It guesses the format for the illumina and sanger fastq files.

    Four inputs are checked: a one-line-per-field illumina record, a
    multiline illumina file, a truncated file for which UnknownFormatError
    is expected, and a record whose qualities are all '0' (the sanger case).
    '''
    # The quality strings contain a literal backslash; it is written as an
    # escaped backslash so that it is not parsed as an invalid escape.
    txt = '@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n'
    txt += 'TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n'
    txt += '+HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n'
    txt += 'efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\\]][]\n'
    fhand = StringIO(txt)
    assert guess_format(fhand) == 'fastq-illumina'

    # multiline: the sequence and the qualities span two lines each
    txt = '@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n'
    txt += 'TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n'
    txt += 'TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n'
    txt += '+HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n'
    txt += 'efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\\]][]\n'
    txt += 'efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\\]][]\n'
    fhand = StringIO(txt + txt)
    assert guess_format(fhand) == 'fastq-illumina-multiline'

    # truncated file: the call itself has to raise, so there is no return
    # value worth comparing
    fhand = StringIO('@HWI-EAS209\n@')
    try:
        guess_format(fhand)
        self.fail('UnknownFormatError expected')
    except UnknownFormatError:
        pass

    # sanger
    txt = '@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n'
    txt += 'TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n'
    txt += '+HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n'
    txt += '000000000000000000000000000000000000000000000000000000000000\n'
    fhand = StringIO(txt)
    assert guess_format(fhand) == 'fastq'
def test_unkown(self):
    "It checks that an unknown format raises UnknownFormatError"
    unknown_fhand = StringIO("xseq\nACTC\n")
    try:
        guess_format(unknown_fhand)
    except UnknownFormatError:
        # the expected outcome, nothing else to check
        return
    self.fail("UnknownFormatError expected")
def test_unkown(self):
    'It checks that an unknown format raises UnknownFormatError'
    unknown_fhand = StringIO('xseq\nACTC\n')
    try:
        guess_format(unknown_fhand)
    except UnknownFormatError:
        # the expected outcome, nothing else to check
        return
    self.fail('UnknownFormatError expected')
def test_empty_file(self):
    'It checks that an empty file raises UnknownFormatError'
    empty_fhand = StringIO()
    try:
        guess_format(empty_fhand)
    except UnknownFormatError:
        # the expected outcome, nothing else to check
        return
    self.fail('UnknownFormatError expected')
def test_empty_file(self):
    "It checks that an empty file raises UnknownFormatError"
    empty_fhand = StringIO()
    try:
        guess_format(empty_fhand)
    except UnknownFormatError:
        # the expected outcome, nothing else to check
        return
    self.fail("UnknownFormatError expected")
def test_long_illumina(self):
    "A 400 bp read with all '@' qualities raises UndecidedFastqVersionError"
    read_length = 400
    # title, sequence, separator and qualities, one field per line
    fields = ["@read", "T" * read_length, "+", "@" * read_length]
    long_read_fhand = StringIO("\n".join(fields) + "\n")
    try:
        guess_format(long_read_fhand)
    except UndecidedFastqVersionError:
        # the expected outcome, nothing else to check
        return
    self.fail("UndecidedFastqVersionError expected")
def test_long_illumina(self):
    'A 400 bp read with all "@" qualities raises UndecidedFastqVersionError'
    read_length = 400
    # title, sequence, separator and qualities, one field per line
    fields = ['@read', 'T' * read_length, '+', '@' * read_length]
    long_read_fhand = StringIO('\n'.join(fields) + '\n')
    try:
        guess_format(long_read_fhand)
    except UndecidedFastqVersionError:
        # the expected outcome, nothing else to check
        return
    self.fail('UndecidedFastqVersionError expected')
def _read_seqrecords(fhands, file_format=GUESS_FORMAT):
    '''It returns an iterator of seqrecords.

    fhands is an iterable of file handles. When file_format is GUESS_FORMAT
    or None the format is guessed independently for every file, otherwise
    the given format is used for all of them. The multiline marker is
    removed from the format with remove_multiline before a parser is chosen.

    The returned iterator chains the records of every file in order.
    '''
    guess = file_format == GUESS_FORMAT or file_format is None

    seq_iters = []
    for fhand in fhands:
        fmt = guess_format(fhand) if guess else file_format
        fmt = remove_multiline(fmt)

        # These formats are parsed with title2ids, every other one is
        # delegated to parse_into_seqrecs.
        if fmt == 'fasta':
            seq_iter = FastaIterator(fhand, title2ids=title2ids)
        elif fmt == 'qual':
            seq_iter = QualPhredIterator(fhand, title2ids=title2ids)
        elif fmt in ('fastq', 'fastq-sanger'):
            seq_iter = FastqPhredIterator(fhand, title2ids=title2ids)
        elif fmt == 'fastq-solexa':
            seq_iter = FastqSolexaIterator(fhand, title2ids=title2ids)
        elif fmt == 'fastq-illumina':
            seq_iter = FastqIlluminaIterator(fhand, title2ids=title2ids)
        else:
            # This also covers any fastq variant not listed above. Without
            # this fallback seq_iter would be unbound for the first file or
            # would still be the iterator built for the previous file.
            seq_iter = parse_into_seqrecs(fhand, fmt)
        seq_iters.append(seq_iter)
    return chain.from_iterable(seq_iters)
def _index_seq_file(fpath, file_format=None):
    '''It indexes a seq file using Biopython index.

    It uses the title line as the key and not just the id.

    fpath is the path of the file to index. When file_format is None the
    format is guessed from the file content. The result of the index call
    is returned.
    '''
    if file_format is None:
        # the handle is only needed to guess the format, so it is closed
        # as soon as the guess is done
        with open(fpath) as fhand:
            file_format = guess_format(fhand)

    file_format = _remove_multiline(file_format)

    # pylint: disable W0212
    # we monkey patch to be able to index using the whole title line and not
    # only the id. We need it because in a pair end file sequences with the
    # same id could be found
    accessor = _index._FormatToRandomAccess
    old_accessor = accessor.copy()
    fastq_formats = ('fastq', 'fastq-sanger', 'fastq-solexa',
                     'fastq-illumina')
    try:
        for fastq_format in fastq_formats:
            accessor[fastq_format] = FastqRandomAccess
        file_index = index(fpath, format=file_format)
    finally:
        # the original mapping is put back even if the indexing fails
        _index._FormatToRandomAccess = old_accessor
    return file_index
def _index_seq_file(fpath, file_format=None):
    '''It indexes a seq file using Biopython index.

    It uses the title line as the key and not just the id.

    fpath is the path of the file to index. When file_format is None the
    format is guessed from the file content. The result of the index call
    is returned.
    '''
    if file_format is None:
        # the handle is only needed to guess the format, so it is closed
        # as soon as the guess is done
        with open(fpath) as fhand:
            file_format = guess_format(fhand)

    file_format = remove_multiline(file_format)

    # pylint: disable W0212
    # we monkey patch to be able to index using the whole title line and not
    # only the id. We need it because in a pair end file sequences with the
    # same id could be found
    accessor = _index._FormatToRandomAccess
    old_accessor = accessor.copy()
    fastq_formats = ('fastq', 'fastq-sanger', 'fastq-solexa',
                     'fastq-illumina')
    try:
        for fastq_format in fastq_formats:
            accessor[fastq_format] = FastqRandomAccess
        file_index = index(fpath, format=file_format)
    finally:
        # the original mapping is put back even if the indexing fails
        _index._FormatToRandomAccess = old_accessor
    return file_index
def test_fasta(self):
    "It guesses the fasta and qual formats"
    three_seq_qual = ">seq1\n30 30 30 30 30 30 30 30\n>seq2\n30 30 30 30 30 30 30"
    three_seq_qual += " 30\n>seq3\n30 30 30 30 30 30 30 30\n"

    # pairs of file content and the format expected for it
    cases = (
        (">seq\nACTC\n", "fasta"),
        # multiline fasta
        (">seq\nACTC\nACTG\n>seq2\nACTG\n", "fasta"),
        # qual
        (">seq\n10 20\n", "qual"),
        (three_seq_qual, "qual"),
    )
    for content, expected in cases:
        assert guess_format(StringIO(content)) == expected
def test_fasta(self):
    'It guesses the fasta and qual formats'
    three_seq_qual = ">seq1\n30 30 30 30 30 30 30 30\n>seq2\n30 30 30 30 30 30 30"
    three_seq_qual += " 30\n>seq3\n30 30 30 30 30 30 30 30\n"

    # pairs of file content and the format expected for it
    cases = (
        ('>seq\nACTC\n', 'fasta'),
        # multiline fasta
        ('>seq\nACTC\nACTG\n>seq2\nACTG\n', 'fasta'),
        # qual
        ('>seq\n10 20\n', 'qual'),
        (three_seq_qual, 'qual'),
    )
    for content, expected in cases:
        assert guess_format(StringIO(content)) == expected
def _read_seqitems(fhands, file_format):
    '''It returns an iterator of seq items (tuples of name and chunk).

    fhands is an iterable of file handles. When file_format is GUESS_FORMAT
    or None the format is guessed independently for every file, otherwise
    the given format is used for all of them.

    NotImplementedError is raised for any format other than fasta and the
    fastq formats without the multiline marker.
    '''
    guess = file_format == GUESS_FORMAT or file_format is None

    seq_iters = []
    for fhand in fhands:
        # A per-file local is used on purpose: overwriting file_format here
        # would make every later file reuse the format guessed for the
        # first one.
        fmt = guess_format(fhand) if guess else file_format

        if fmt == 'fasta':
            seq_iter = _itemize_fasta(fhand)
        elif 'multiline' not in fmt and 'fastq' in fmt:
            seq_iter = _itemize_fastq(fhand)
        else:
            msg = 'Format not supported by the itemizers: ' + fmt
            raise NotImplementedError(msg)
        seq_iter = assing_kind_to_seqs(SEQITEM, seq_iter, fmt)
        seq_iters.append(seq_iter)
    return chain.from_iterable(seq_iters)
def read_seqs(fhands, file_format=GUESS_FORMAT, out_format=None,
              prefered_seq_classes=None):
    '''It returns a stream of seqs in different codings: seqrecords, seqitems...

    fhands is a sequence of file handles. When file_format is GUESS_FORMAT
    the input format is guessed from the first file only.

    prefered_seq_classes lists the seq classes to try, in order; it defaults
    to [SEQITEM, SEQRECORD]. The list given by the caller is never modified.
    SEQITEM is discarded when an explicit out_format differs from the input
    format.

    ValueError is raised when no seq class is left to try or when an unknown
    one is found. RuntimeError is raised when every reader that was tried
    raised NotImplementedError.
    '''
    if prefered_seq_classes:
        # work on a copy, the caller's list should not lose SEQITEM
        seq_classes = list(prefered_seq_classes)
    else:
        seq_classes = [SEQITEM, SEQRECORD]

    if file_format == GUESS_FORMAT:
        in_format = guess_format(fhands[0])
    else:
        in_format = file_format

    formats_differ = (out_format not in (None, GUESS_FORMAT) and
                      in_format != out_format)
    if formats_differ and SEQITEM in seq_classes:
        # seqitems is incompatible with different input and output
        # formats
        seq_classes.remove(SEQITEM)

    if not seq_classes:
        msg = 'No valid seq class left or prefered'
        raise ValueError(msg)

    for seq_class in seq_classes:
        if seq_class == SEQITEM:
            try:
                return _read_seqitems(fhands, in_format)
            except NotImplementedError:
                continue
        elif seq_class == SEQRECORD:
            try:
                seqs = _read_seqrecords(fhands, in_format)
                return assing_kind_to_seqs(SEQRECORD, seqs, None)
            except NotImplementedError:
                continue
        else:
            # %s formatting, so a non-string class does not turn this into
            # a TypeError
            raise ValueError('Unknown class for seq: %s' % (seq_class,))
    raise RuntimeError('We should not be here, fixme')
def parse_basic_args(parser): 'It parses the command line and it returns a dict with the arguments.' parsed_args = parser.parse_args() # we have to wrap the file in a BufferedReader to allow peeking into stdin wrapped_fhands = [] # if input is stdin it will be a fhand not a list of fhands. # we have to convert to a list in_fhands = parsed_args.input if not isinstance(in_fhands, list): in_fhands = [in_fhands] for fhand in in_fhands: fhand = wrap_in_buffered_reader(fhand) fhand = uncompress_if_required(fhand) wrapped_fhands.append(fhand) # We have to add the one_line to the fastq files in order to get the # speed improvements of the seqitems in_format = parsed_args.in_format if 'fastq' in in_format: guessed_in_format = guess_format(wrapped_fhands[0]) if '-one_line' in guessed_in_format: in_format += '-one_line' else: guessed_in_format = None out_fhand = getattr(parsed_args, OUTFILE) comp_kind = get_requested_compression(parsed_args) if isinstance(out_fhand, list): new_out_fhands = [] for out_f in out_fhand: try: out_f = compress_fhand(out_f, compression_kind=comp_kind) except RuntimeError, error: parser.error(error) new_out_fhands.append(out_f) out_fhand = new_out_fhands
def seqio(in_fhands, out_fhand, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats.

    in_fhands is a list of input file handles and out_fhand the output one.
    out_format has to be one of the SUPPORTED_OUTPUT_FORMATS setting.

    When there is just one input file and it is already in the output format
    no conversion is done: its content is copied to out_fhand or, if
    copy_if_same_format is false, rel_symlink is called with the names of
    both handles.

    IncompatibleFormatError is raised when the output format is not
    supported or when the output requires qualities that are not available.
    MalformedFile is raised when error_quality_disagree recognises the
    ValueError raised while writing.
    '''
    if out_format not in get_setting('SUPPORTED_OUTPUT_FORMATS'):
        raise IncompatibleFormatError("This output format is not supported")

    in_formats = [remove_multiline(guess_format(fhand)) for fhand in in_fhands]

    if len(in_fhands) == 1 and in_formats[0] == out_format:
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhand)
        else:
            rel_symlink(in_fhands[0].name, out_fhand.name)
        return

    seqs = _read_seqrecords(in_fhands)
    try:
        write_seqrecs(seqs, out_fhand, out_format)
    except ValueError as error:
        if error_quality_disagree(error):
            raise MalformedFile(str(error))
        if 'No suitable quality scores' in str(error):
            msg = 'No qualities available to write output file'
            raise IncompatibleFormatError(msg)
        # any other ValueError is not ours to translate
        raise
def _read_seqitems(fhands, file_format):
    '''It returns an iterator of seq items (tuples of name and chunk).

    fhands is an iterable of file handles. When file_format is GUESS_FORMAT
    or None the format is guessed independently for every file, otherwise
    the given format is used for all of them.

    NotImplementedError is raised for any format other than fasta and the
    fastq formats without the multiline marker. MalformedFile is raised
    when the call to _itemize_fastq fails with a ValueError that
    error_quality_disagree recognises.
    '''
    guess = file_format == GUESS_FORMAT or file_format is None

    seq_iters = []
    for fhand in fhands:
        # A per-file local is used on purpose: overwriting file_format here
        # would make every later file reuse the format guessed for the
        # first one.
        fmt = guess_format(fhand) if guess else file_format

        if fmt == 'fasta':
            seq_iter = _itemize_fasta(fhand)
        elif 'multiline' not in fmt and 'fastq' in fmt:
            # NOTE(review): this only catches errors raised by the call
            # itself; if _itemize_fastq is a generator, errors raised while
            # iterating are not handled here - confirm.
            try:
                seq_iter = _itemize_fastq(fhand)
            except ValueError as error:
                if error_quality_disagree(error):
                    raise MalformedFile(str(error))
                raise
        else:
            msg = 'Format not supported by the itemizers: ' + fmt
            raise NotImplementedError(msg)
        seq_iter = assing_kind_to_seqs(SEQITEM, seq_iter, fmt)
        seq_iters.append(seq_iter)
    return chain.from_iterable(seq_iters)
out_f = compress_fhand(out_f, compression_kind=comp_kind) except RuntimeError, error: parser.error(error) new_out_fhands.append(out_f) out_fhand = new_out_fhands else: try: out_fhand = compress_fhand(out_fhand, compression_kind=comp_kind) except RuntimeError, error: parser.error(error) # The default output format is the same as the first file if in_format == GUESS_FORMAT: if not guessed_in_format: guessed_in_format = guess_format(wrapped_fhands[0]) out_format = guessed_in_format else: out_format = in_format # The original fhands should be stored, because otherwise they would be # closed args = {'out_fhand': out_fhand, 'in_fhands': wrapped_fhands, 'out_format': out_format, 'original_in_fhands': in_fhands, 'in_format': in_format} return args, parsed_args def parse_basic_parallel_args(parser): 'It parses the command line and it returns a dict with the arguments.' args, parsed_args = parse_basic_args(parser)
out_f = compress_fhand(out_f, compression_kind=comp_kind) except RuntimeError, error: parser.error(error) new_out_fhands.append(out_f) out_fhand = new_out_fhands else: try: out_fhand = compress_fhand(out_fhand, compression_kind=comp_kind) except RuntimeError, error: parser.error(error) # The default output format is the same as the first file if in_format == GUESS_FORMAT: if not guessed_in_format: guessed_in_format = guess_format(wrapped_fhands[0]) out_format = guessed_in_format else: out_format = in_format # The original fhands should be stored, because otherwise they would be # closed args = { 'out_fhand': out_fhand, 'in_fhands': wrapped_fhands, 'out_format': out_format, 'original_in_fhands': in_fhands, 'in_format': in_format } return args, parsed_args