def get_str_qualities(seq, out_format=None):
    '''It returns the qualities of a sequence encoded as a string.

    seq is a wrapped sequence that exposes the kind and file_format
    attributes.
    out_format is the requested quality encoding. When it is None the file
    format of the sequence is used. Formats found in SANGER_FASTQ_FORMATS or
    ILLUMINA_FASTQ_FORMATS are mapped to SANGER_QUALITY or ILLUMINA_QUALITY,
    any other value is handed to the encoder unchanged.

    It raises a ValueError when a SeqItem comes from a fasta file, when the
    quality format of a SeqItem is not supported or when the kind of the
    sequence is neither SEQITEM nor SEQRECORD.
    '''
    if out_format is None:
        out_format = seq.file_format
    out_format = remove_multiline(out_format)
    if out_format in SANGER_FASTQ_FORMATS:
        out_format = SANGER_QUALITY
    elif out_format in ILLUMINA_FASTQ_FORMATS:
        out_format = ILLUMINA_QUALITY

    seq_class = seq.kind
    if seq_class not in (SEQITEM, SEQRECORD):
        # An unknown kind used to reach the return statement with no
        # result assigned, failing with an UnboundLocalError.
        raise ValueError('Unknown class for seq: ' + str(seq_class))

    if seq_class == SEQITEM:
        in_format = remove_multiline(seq.file_format)
        if 'fasta' in in_format:
            raise ValueError('A fasta file has no qualities')
        if in_format in SANGER_FASTQ_FORMATS:
            in_format = SANGER_QUALITY
        elif in_format in ILLUMINA_FASTQ_FORMATS:
            in_format = ILLUMINA_QUALITY
        else:
            msg = 'Unknown or not supported quality format: '
            msg += in_format
            raise ValueError(msg)
        if in_format == out_format:
            # Same encoding in and out: the quality lines are reused as
            # they are, only the line endings are removed.
            qual_lines = _get_seqitem_qual_lines(seq)
            return ''.join(line.rstrip() for line in qual_lines)

    # SeqRecords and SeqItems that need a different encoding go through the
    # integer qualities.
    int_quals = get_int_qualities(seq)
    return ''.join(_int_quals_to_str_quals(int_quals, out_format))
def _write_filter_trim_packets(passed_fhand, diverted_fhand, packets,
                               file_format='fastq', workers=None,
                               seqs_diverted=SEQS_FILTERED_OUT):
    '''It writes the passed and the diverted seqs of every packet.

    The seqs stored under SEQS_PASSED go to passed_fhand and the ones stored
    under seqs_diverted go to diverted_fhand. When diverted_fhand is None
    only the passed seqs are written and the result of write_seqs is
    returned.

    If anything fails while writing and workers is not None,
    workers.terminate() is called before the error is raised again.
    '''
    file_format = remove_multiline(file_format)

    def unpair(groups):
        'It chains the seqs found inside an iterable of seq groups'
        return chain.from_iterable(groups)

    def stop_workers():
        'It terminates the workers, if there are any'
        if workers is not None:
            workers.terminate()

    if diverted_fhand is None:
        passed_groups = (packet[SEQS_PASSED] for packet in packets)
        passed_seqs = unpair(chain.from_iterable(passed_groups))
        try:
            return write_seqs(passed_seqs, passed_fhand,
                              file_format=file_format)
        except BaseException:
            stop_workers()
            raise

    for packet in packets:
        try:
            write_seqs(unpair(packet[SEQS_PASSED]), fhand=passed_fhand,
                       file_format=file_format)
            # The filtered out seqs are grouped like the passed ones, so
            # they have to be flattened. Any other kind of diverted seqs
            # (e.g. orphans) is already a flat list of seqs.
            if seqs_diverted == SEQS_FILTERED_OUT:
                diverted = unpair(packet[seqs_diverted])
            else:
                diverted = packet[seqs_diverted]
            write_seqs(diverted, fhand=diverted_fhand,
                       file_format=file_format)
        except BaseException:
            stop_workers()
            raise
def _read_seqrecords(fhands, file_format=GUESS_FORMAT):
    '''It returns an iterator with the seqrecords found in all the files.

    The format of every file is guessed when file_format is GUESS_FORMAT or
    None, otherwise file_format is used for all of them. The fasta, qual and
    fastq flavours are read with their specific iterators and the title2ids
    callback; any other format is handed to parse_into_seqrecs.
    '''
    # Parsers that take the title2ids callback, keyed by format name
    title_parsers = {'fasta': FastaIterator,
                     'qual': QualPhredIterator,
                     'fastq': FastqPhredIterator,
                     'fastq-sanger': FastqPhredIterator,
                     'fastq-solexa': FastqSolexaIterator,
                     'fastq-illumina': FastqIlluminaIterator}
    should_guess = file_format == GUESS_FORMAT or file_format is None

    records_per_file = []
    for fhand in fhands:
        fmt = guess_format(fhand) if should_guess else file_format
        fmt = remove_multiline(fmt)

        parser = None
        if fmt in ('fasta', 'qual') or 'fastq' in fmt:
            parser = title_parsers.get(fmt)

        if parser is None:
            records = parse_into_seqrecs(fhand, fmt)
        else:
            records = parser(fhand, title2ids=title2ids)
        records_per_file.append(records)
    return chain.from_iterable(records_per_file)
def write_filter_packets(passed_fhand, filtered_fhand, filter_packets,
                         file_format='fastq', workers=None):
    '''It writes the passed and the filtered out seqs of every packet.

    The seqs stored under SEQS_PASSED go to passed_fhand and the ones stored
    under SEQS_FILTERED_OUT go to filtered_fhand. When filtered_fhand is
    None only the passed seqs are written and the result of write_seqs is
    returned.

    If anything fails while writing and workers is not None,
    workers.terminate() is called before the error is raised again.
    '''
    file_format = remove_multiline(file_format)

    def terminate_workers():
        'It terminates the workers, if there are any'
        if workers is not None:
            workers.terminate()

    if filtered_fhand is None:
        passed_groups = (packet[SEQS_PASSED] for packet in filter_packets)
        # packets -> seq groups -> seqs
        passed_seqs = chain.from_iterable(chain.from_iterable(passed_groups))
        try:
            return write_seqs(passed_seqs, passed_fhand,
                              file_format=file_format)
        except BaseException:
            terminate_workers()
            raise

    destinations = ((SEQS_PASSED, passed_fhand),
                    (SEQS_FILTERED_OUT, filtered_fhand))
    for packet in filter_packets:
        try:
            for key, out_fhand in destinations:
                write_seqs(chain.from_iterable(packet[key]), fhand=out_fhand,
                           file_format=file_format)
        except BaseException:
            terminate_workers()
            raise
def _index_seq_file(fpath, file_format=None):
    '''It indexes a seq file using Biopython index.

    It uses the whole title line as the key and not just the id.

    fpath is the path of the file to index.
    file_format is the format of the file. When it is None the format is
    guessed from the file content.

    It returns the index created by the index function.
    '''
    if file_format is None:
        # The handle is only needed to guess the format, so it is closed
        # right after that.
        with open(fpath) as fhand:
            file_format = guess_format(fhand)
    file_format = remove_multiline(file_format)

    # pylint: disable W0212
    # we monkey patch to be able to index using the whole title line and not
    # only the id. We need it because in a pair end file sequences with the
    # same id could be found
    accessor = _index._FormatToRandomAccess
    old_accessor = accessor.copy()
    accessor['fastq'] = FastqRandomAccess
    accessor['fastq-sanger'] = FastqRandomAccess
    accessor['fastq-solexa'] = FastqRandomAccess
    accessor['fastq-illumina'] = FastqRandomAccess

    try:
        file_index = index(fpath, format=file_format)
    finally:
        # The original accessors are restored even if the indexing fails
        _index._FormatToRandomAccess = old_accessor

    return file_index
def slice_seq(seq, start=None, stop=None):
    '''It returns a new wrapped sequence with the region from start to stop.

    SeqItems are sliced by _slice_seqitem and SeqRecords by slicing the
    wrapped object. The new wrapper keeps the kind of seq and gets its file
    format after passing it through remove_multiline.

    It raises a ValueError if the kind of seq is neither SEQITEM nor
    SEQRECORD.
    '''
    seq_class = seq.kind
    if seq_class == SEQITEM:
        seq_obj = _slice_seqitem(seq, start, stop)
    elif seq_class == SEQRECORD:
        seq_obj = seq.object[start:stop]
    else:
        # An unknown kind used to fail later with an UnboundLocalError
        raise ValueError('Unknown class for seq: ' + str(seq_class))
    return SeqWrapper(seq.kind, object=seq_obj,
                      file_format=remove_multiline(seq.file_format))
def write_seq_packets(fhand, seq_packets, file_format='fastq', workers=None):
    '''It writes to a file all the seqs found in a stream of seq lists.

    If anything fails while writing and workers is not None,
    workers.terminate() is called before the error is raised again.
    '''
    out_format = remove_multiline(file_format)
    try:
        all_seqs = chain.from_iterable(seq_packets)
        write_seqs(all_seqs, fhand, file_format=out_format)
    except BaseException:
        if workers is not None:
            workers.terminate()
        raise
def _write_seqrecords(seqs, fhand=None, file_format='fastq'):
    '''It writes a stream of sequences to a file.

    seqs is the stream of seqrecords; it goes through
    _clean_seqrecord_stream before being written.
    fhand is the output file. When it is None a NamedTemporaryFile is
    created, with a suffix built from the file format.
    file_format is the output format, without the multiline flavour.

    It returns the file in which the sequences have been written, so a
    temporary file created here is not lost by the caller.
    An IOError due to a broken pipe is ignored, any other one is raised.
    '''
    file_format = remove_multiline(file_format)
    if fhand is None:
        suffix = '.' + file_format.replace('-', '_')
        fhand = NamedTemporaryFile(suffix=suffix)
    seqs = _clean_seqrecord_stream(seqs)
    try:
        write_seqrecs(seqs, fhand, file_format)
    except IOError as error:
        # The pipe could be already closed
        if 'Broken pipe' not in str(error):
            raise
    return fhand
def _write_seqitems(items, fhand, file_format):
    '''It writes the fastq seq items as fasta in the given file.

    For every fastq item, when file_format is a fasta one, the first line
    (with its first character replaced by '>') and the second line are
    written. An IOError due to a broken pipe is ignored.

    It raises a RuntimeError if the format of an item and file_format do
    not match in any other way.

    NOTE(review): in the code visible here an item whose format already
    matches file_format, or any item when file_format is empty, is not
    written at all. Confirm that this is the intended behaviour.
    '''
    for item in items:
        item_format = remove_multiline(item.file_format)
        if not file_format:
            continue

        if 'fastq' in item_format and 'fasta' in file_format:
            item_lines = item.object.lines
            try:
                fhand.write('>' + item_lines[0][1:] + item_lines[1])
            except IOError as error:
                # The pipe could be already closed
                if 'Broken pipe' not in str(error):
                    raise
            continue

        if item_format != file_format:
            msg = 'Input and output file formats do not match, you should not '
            msg += 'use SeqItems: ' + str(item.file_format) + ' '
            msg += str(file_format)
            raise RuntimeError(msg)
def _copy_seqitem(seqwrapper, seq=None, name=None):
    '''It returns a copy of a wrapped SeqItem with a new seq and/or name.

    seq is the new sequence string. When it is None the lines of the
    original item are copied as they are. When it is given for a multiline
    fastq the quality lines are joined in a single one and the format of
    the copy loses the multiline flavour.
    name is the new name. When it is given the title line is rebuilt and,
    for fastq items, the plus line is set to a bare '+'.

    It raises a ValueError if the new sequence and the quality line of a
    fastq item have different lengths and a RuntimeError if a new sequence
    is given for an item that is neither fasta nor fastq.
    '''
    original = seqwrapper.object
    old_lines = original.lines
    fmt = seqwrapper.file_format
    length_msg = 'Sequence and quality line length do not match'

    if seq is None:
        new_lines = old_lines[:]
    elif 'fasta' in fmt:
        new_lines = [old_lines[0], seq + '\n']
    elif 'multiline' in fmt and 'fastq' in fmt:
        qual_lines = _get_seqitem_qual_lines(seqwrapper)
        quals = ''.join([qual_line.strip() for qual_line in qual_lines])
        new_lines = [old_lines[0], seq + '\n', '+\n', quals + '\n']
        # The copy has only four lines, so it is not multiline anymore
        fmt = remove_multiline(fmt)
        if len(new_lines[1]) != len(new_lines[3]):
            raise ValueError(length_msg)
    elif 'fastq' in fmt:
        new_lines = [old_lines[0], seq + '\n', old_lines[2], old_lines[3]]
        if len(new_lines[1]) != len(new_lines[3]):
            raise ValueError(length_msg)
    else:
        raise RuntimeError('Unknown format for a SequenceItem')

    if name:
        # The first character of the title line is kept
        new_lines[0] = new_lines[0][0] + name + '\n'
        if 'fastq' in fmt:
            # NOTE(review): _sitem_fastq_plus_line_index is used as it is,
            # without being called. If it is a function and not an index
            # the assignment below will fail. Confirm.
            if 'multiline' in fmt:
                plus_index = _sitem_fastq_plus_line_index
            else:
                plus_index = 2
            new_lines[plus_index] = '+\n'

    new_name = original.name if name is None else name
    annotations = original.annotations
    if annotations is not None:
        annotations = annotations.copy()

    return SeqWrapper(kind=seqwrapper.kind,
                      object=SeqItem(new_name, new_lines, annotations),
                      file_format=fmt)
def write_seqs(seqs, fhand=None, file_format=None):
    '''It writes the given sequences.

    seqs is an iterable of wrapped sequences, all of them are assumed to be
    of the kind of the first one.
    fhand is the output file. When it is None a NamedTemporaryFile is
    created; its suffix is built from file_format when one is given.
    file_format is the output format.

    It returns the file in which the sequences have been written.
    It raises a ValueError if the kind of the first sequence is neither
    SEQITEM nor SEQRECORD.
    '''
    if fhand is None:
        # file_format defaults to None, so a suffix can only be built when
        # a format has been given.
        if file_format is None:
            suffix = ''
        else:
            suffix = '.' + file_format.replace('-', '_')
        fhand = NamedTemporaryFile(suffix=suffix)
    file_format = remove_multiline(file_format)

    # The first seq is looked at in a second iterator, so the stream that
    # will be written still has all the seqs.
    seqs, seqs2 = tee(seqs)
    try:
        first_seq = next(seqs2)
    except StopIteration:
        # No sequences to write, so we're done
        return fhand
    del seqs2

    seq_class = first_seq.kind
    if seq_class == SEQITEM:
        _write_seqitems(seqs, fhand, file_format)
    elif seq_class == SEQRECORD:
        seqrecords = (wrapped.object for wrapped in seqs)
        _write_seqrecords(seqrecords, fhand, file_format)
    else:
        raise ValueError('Unknown class for seq: ' + str(seq_class))
    return fhand
def seqio(in_fhands, out_fhand, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats.

    When there is just one input file and its guessed format, after
    remove_multiline, is already out_format the file is copied into
    out_fhand or, if copy_if_same_format is False, linked with rel_symlink.
    Otherwise the seqrecords are read from the input files and written in
    the output format.

    It raises an IncompatibleFormatError if out_format is not found in the
    SUPPORTED_OUTPUT_FORMATS setting or if the output requires qualities
    that are not available, and a MalformedFile if error_quality_disagree
    recognizes the ValueError raised while writing.
    '''
    if out_format not in get_setting('SUPPORTED_OUTPUT_FORMATS'):
        raise IncompatibleFormatError("This output format is not supported")

    in_formats = []
    for in_fhand in in_fhands:
        in_formats.append(remove_multiline(guess_format(in_fhand)))

    nothing_to_convert = len(in_fhands) == 1 and in_formats[0] == out_format
    if nothing_to_convert:
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhand)
        else:
            rel_symlink(in_fhands[0].name, out_fhand.name)
        return

    seqs = _read_seqrecords(in_fhands)
    try:
        write_seqrecs(seqs, out_fhand, out_format)
    except ValueError as error:
        if error_quality_disagree(error):
            raise MalformedFile(str(error))
        if 'No suitable quality scores' in str(error):
            msg = 'No qualities available to write output file'
            raise IncompatibleFormatError(msg)
        raise