def _parse_quality_scores(fh, seq_len, variant, phred_offset, prev):
    """Read and decode the quality lines of a single FASTQ record.

    Lines are consumed from ``_line_generator(fh, skip_blanks=False)`` until
    either a line starting with ``'@'`` is seen once exactly `seq_len` quality
    characters have been read, or the input is exhausted.

    Parameters
    ----------
    fh : object
        Handle passed straight through to ``_line_generator``.
    seq_len : int
        Number of quality characters expected for this record.
    variant, phred_offset
        Forwarded unchanged to ``_decode_qual_to_phred``.
    prev : str
        The line read immediately before this call; used to detect a blank
        line directly preceding quality data.

    Returns
    -------
    tuple
        ``(np.hstack(decoded_chunks), next_header)`` where ``next_header`` is
        the ``'@'`` line that terminated the record, or ``None`` when the
        input ended.

    Raises
    ------
    FASTQFormatError
        If more than `seq_len` quality characters are found, or the input
        ends before `seq_len` characters have been read.
    """
    decoded = []
    n_seen = 0

    for line in _line_generator(fh, skip_blanks=False):
        if not line:
            # Remember the empty line so the next non-empty line can report it.
            prev = line
            continue

        # An '@' only counts as the next record's header once the quality
        # string is complete; before that it is ordinary quality data.
        if n_seen == seq_len and line.startswith('@'):
            return np.hstack(decoded), line

        if not prev:
            _blank_error("after '+' or within quality scores")

        n_seen += len(line)
        excess = n_seen - seq_len
        if excess > 0:
            raise FASTQFormatError(
                "Found more quality score characters than sequence "
                "characters. Extra quality score characters: %r" %
                line[-excess:])

        decoded.append(
            _decode_qual_to_phred(line, variant=variant,
                                  phred_offset=phred_offset))
        prev = line

    if n_seen != seq_len:
        raise FASTQFormatError(
            "Found incomplete/truncated FASTQ record at end of file.")
    return np.hstack(decoded), None
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence):
    """Yield one `constructor` object per FASTQ record read from `fh`.

    Parameters
    ----------
    fh : object
        Handle passed to ``_line_generator`` and the record sub-parsers.
    variant, phred_offset : optional
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable, optional
        Called as ``constructor(seq, metadata=..., positional_metadata=...)``
        for every record.

    Yields
    ------
    object
        Result of `constructor`, with ``metadata`` holding ``'id'`` and
        ``'description'`` and ``positional_metadata`` holding ``'quality'``.

    Raises
    ------
    FASTQFormatError
        If the first non-blank line does not start with ``'@'``, or a
        non-empty quality header does not match its sequence header.
    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # No non-blank lines at all. A StopIteration escaping a generator
        # body is converted to RuntimeError (PEP 479), so end the generator
        # explicitly and yield no records instead.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % seq_header)

    # _parse_quality_scores hands back the next record's header, or None
    # once the input is exhausted.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # A bare '+' is always accepted; otherwise the text after '+' must
        # repeat the text after '@'.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        phred_scores, seq_header = _parse_quality_scores(fh, len(seq),
                                                         variant,
                                                         phred_offset,
                                                         qual_header)
        yield constructor(seq, metadata={'id': id_, 'description': desc},
                          positional_metadata={'quality': phred_scores})
def _parse_sequence_data(fh, prev):
    """Read the sequence lines of one FASTQ record up to its ``'+'`` line.

    Parameters
    ----------
    fh : object
        Handle passed straight through to ``_line_generator``.
    prev : str
        The line read immediately before this call (the ``'@'`` header on
        entry); used to detect a blank line directly preceding data.

    Returns
    -------
    tuple
        ``(sequence, qual_header)``: the concatenated sequence lines and the
        line starting with ``'+'`` that ended them.

    Raises
    ------
    FASTQFormatError
        If the ``'+'`` line arrives before any sequence data, an ``'@'`` line
        arrives before a ``'+'`` line, a sequence line contains whitespace,
        or the input ends before a ``'+'`` line is found.
    """
    pieces = []

    for line in _line_generator(fh, skip_blanks=False):
        if line.startswith('+'):
            if not prev:
                _blank_error("before '+'")
            if not pieces:
                raise FASTQFormatError(
                    "Found FASTQ record without sequence data.")
            return ''.join(pieces), line

        if line.startswith('@'):
            raise FASTQFormatError(
                "Found FASTQ record that is missing a quality (+) header line "
                "after sequence data.")

        # Anything else is treated as sequence data.
        if not prev:
            _blank_error("after header or within sequence")
        if _whitespace_regex.search(line):
            raise FASTQFormatError(
                "Found whitespace in sequence data: %r" % line)

        pieces.append(line)
        prev = line

    # Ran out of input without ever reaching the '+' line.
    raise FASTQFormatError(
        "Found incomplete/truncated FASTQ record at end of file.")
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence, **kwargs):
    """Yield one `constructor` object per FASTQ record read from `fh`.

    Parameters
    ----------
    fh : object
        Handle passed to ``_line_generator`` and the record sub-parsers.
    variant, phred_offset : optional
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable, optional
        Called as ``constructor(seq, metadata=..., positional_metadata=...,
        **kwargs)`` for every record.
    **kwargs
        Extra keyword arguments forwarded to `constructor` on every call.

    Yields
    ------
    object
        Result of `constructor`, with ``metadata`` holding ``'id'`` and
        ``'description'`` and ``positional_metadata`` holding ``'quality'``.

    Raises
    ------
    FASTQFormatError
        If the first non-blank line does not start with ``'@'``, or a
        non-empty quality header does not match its sequence header.
    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # No non-blank lines at all. A StopIteration escaping a generator
        # body is converted to RuntimeError (PEP 479), so end the generator
        # explicitly and yield no records instead.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % seq_header)

    # _parse_quality_scores hands back the next record's header, or None
    # once the input is exhausted.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # A bare '+' is always accepted; otherwise the text after '+' must
        # repeat the text after '@'.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        phred_scores, seq_header = _parse_quality_scores(fh, len(seq),
                                                         variant,
                                                         phred_offset,
                                                         qual_header)
        yield constructor(seq, metadata={'id': id_, 'description': desc},
                          positional_metadata={'quality': phred_scores},
                          **kwargs)
def _parse_fasta_raw(fh, data_parser, format_label):
    """Raw parser for FASTA or QUAL files.

    Returns raw values (seq/qual, id, description). It is the responsibility of
    the caller to construct the correct in-memory object to hold the data.

    Parameters
    ----------
    fh : object
        Handle passed to ``_line_generator``.
    data_parser : callable
        Called with the list of data lines collected for a record; its
        return value is the first element of each yielded tuple.
    format_label : str
        Interpolated into error messages to name the format being read.

    Yields
    ------
    tuple
        ``(data_parser(data_chunks), id_, desc)`` for each record.

    Raises
    ------
    FASTAFormatError
        If the first non-blank line does not start with ``'>'``, or a data
        line follows a blank line within a record.
    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # No non-blank lines at all. A StopIteration escaping a generator
        # body is converted to RuntimeError (PEP 479), so end the generator
        # explicitly and yield no records instead.
        return

    # header check inlined here and below for performance
    if seq_header.startswith('>'):
        id_, desc = _parse_fasta_like_header(seq_header)
    else:
        raise FASTAFormatError(
            "Found non-header line when attempting to read the 1st %s record:"
            "\n%s" % (format_label, seq_header))

    data_chunks = []
    prev = seq_header
    for line in _line_generator(fh, skip_blanks=False):
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(data_chunks), id_, desc
            data_chunks = []
            id_, desc = _parse_fasta_like_header(line)
        elif line:
            # ensure no blank lines within a single record
            if not prev:
                raise FASTAFormatError(
                    "Found blank or whitespace-only line within %s "
                    "record." % format_label)
            data_chunks.append(line)
        # Track every line, blank ones included, so a data line that follows
        # a blank can be detected on the next iteration.
        prev = line

    # yield last record in file
    yield data_parser(data_chunks), id_, desc
def _parse_fasta_raw(fh, data_parser, error_type):
    """Raw parser for FASTA or QUAL files.

    Returns raw values (seq/qual, id, description). It is the responsibility of
    the caller to construct the correct in-memory object to hold the data.

    Parameters
    ----------
    fh : object
        Handle passed to ``_line_generator``.
    data_parser : callable
        Called with the list of data lines collected for a record; its
        return value is the first element of each yielded tuple.
    error_type : callable
        Called with a message to build the exception raised on malformed
        input.

    Yields
    ------
    tuple
        ``(data_parser(data_chunks), id_, desc)`` for each record.

    Raises
    ------
    error_type
        If the first non-blank line does not start with ``'>'``, or a data
        line follows a blank line within a record.
    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # No non-blank lines at all. A StopIteration escaping a generator
        # body is converted to RuntimeError (PEP 479), so end the generator
        # explicitly and yield no records instead.
        return

    # header check inlined here and below for performance
    if seq_header.startswith('>'):
        id_, desc = _parse_fasta_like_header(seq_header)
    else:
        raise error_type(
            "Found non-header line when attempting to read the 1st record:"
            "\n%s" % seq_header)

    data_chunks = []
    prev = seq_header
    for line in _line_generator(fh, skip_blanks=False):
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(data_chunks), id_, desc
            data_chunks = []
            id_, desc = _parse_fasta_like_header(line)
        elif line:
            # ensure no blank lines within a single record
            if not prev:
                raise error_type(
                    "Found blank or whitespace-only line within record.")
            data_chunks.append(line)
        # Track every line, blank ones included, so a data line that follows
        # a blank can be detected on the next iteration.
        prev = line

    # yield last record in file
    yield data_parser(data_chunks), id_, desc