def _parse_fasta_raw(fh, data_parser, format_label): """Raw parser for FASTA or QUAL files. Returns raw values (seq/qual, id, description). It is the responsibility of the caller to construct the correct in-memory object to hold the data. """ line = next(fh) # header check inlined here and below for performance if line.startswith('>'): id_, desc = _parse_fasta_like_header(line) else: raise FASTAFormatError( "Found line without a header in %s-formatted file:\n%s" % (format_label, line)) data_chunks = [] for line in fh: if line.startswith('>'): # new header, so yield current record and reset state yield data_parser(data_chunks), id_, desc data_chunks = [] id_, desc = _parse_fasta_like_header(line) else: line = line.strip() if line: data_chunks.append(line) else: raise FASTAFormatError( "Found blank or whitespace-only line in %s-formatted " "file." % format_label) # yield last record in file yield data_parser(data_chunks), id_, desc
def _parse_fasta_raw(fh, data_parser, format_label): """Raw parser for FASTA or QUAL files. Returns raw values (seq/qual, id, description). It is the responsibility of the caller to construct the correct in-memory object to hold the data. """ line = next(fh) # header check inlined here and below for performance if line.startswith('>'): id_, desc = _parse_fasta_like_header(line) else: raise FASTAFormatError( "Found line without a header in %s-formatted file:\n%s" % (format_label, line)) data_chunks = [] for line in fh: if line.startswith('>'): # new header, so yield current record and reset state yield data_parser(data_chunks), id_, desc data_chunks = [] id_, desc = _parse_fasta_like_header(line) else: line = line.strip() if line: data_chunks.append(line) else: raise FASTAFormatError( "Found blank or whitespace-only line in %s-formatted " "file." % format_label) # yield last record in file yield data_parser(data_chunks), id_, desc
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence):
    """Lazily parse FASTQ records from `fh`, yielding one object per record.

    Parameters
    ----------
    fh : file-like
        Input handed to ``_line_generator`` and to the sequence/quality
        parsing helpers.
    variant, phred_offset
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable, optional
        Called as ``constructor(seq, metadata=..., positional_metadata=...)``
        to build each yielded object.

    Yields
    ------
    object
        Whatever `constructor` returns for each record. Nothing is yielded
        when the input contains no non-blank line.

    Raises
    ------
    FASTQFormatError
        If the first non-blank line does not start with ``@``, or if a
        quality header other than a bare ``+`` does not repeat the sequence
        header text.

    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # No non-blank line at all. Letting StopIteration escape a generator
        # body raises RuntimeError on Python 3.7+ (PEP 479), so finish the
        # generator explicitly with zero records.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % seq_header)

    # NOTE(review): the loop relies on _parse_quality_scores handing back the
    # next sequence header, or None once the input is exhausted -- confirm.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # A quality header is either a bare '+' or '+' followed by the same
        # text that followed '@' on the sequence header.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        phred_scores, seq_header = _parse_quality_scores(fh, len(seq),
                                                         variant,
                                                         phred_offset,
                                                         qual_header)
        yield constructor(seq, metadata={'id': id_, 'description': desc},
                          positional_metadata={'quality': phred_scores})
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=Sequence, **kwargs):
    """Lazily parse FASTQ records from `fh`, yielding one object per record.

    Parameters
    ----------
    fh : file-like
        Input handed to ``_line_generator`` and to the sequence/quality
        parsing helpers.
    variant, phred_offset
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable, optional
        Called as ``constructor(seq, metadata=..., positional_metadata=...,
        **kwargs)`` to build each yielded object.
    **kwargs
        Extra keyword arguments passed through to `constructor`.

    Yields
    ------
    object
        Whatever `constructor` returns for each record. Nothing is yielded
        when the input contains no non-blank line.

    Raises
    ------
    FASTQFormatError
        If the first non-blank line does not start with ``@``, or if a
        quality header other than a bare ``+`` does not repeat the sequence
        header text.

    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # No non-blank line at all. Letting StopIteration escape a generator
        # body raises RuntimeError on Python 3.7+ (PEP 479), so finish the
        # generator explicitly with zero records.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % seq_header)

    # NOTE(review): the loop relies on _parse_quality_scores handing back the
    # next sequence header, or None once the input is exhausted -- confirm.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh, seq_header)

        # A quality header is either a bare '+' or '+' followed by the same
        # text that followed '@' on the sequence header.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        phred_scores, seq_header = _parse_quality_scores(fh, len(seq),
                                                         variant,
                                                         phred_offset,
                                                         qual_header)
        yield constructor(seq, metadata={'id': id_, 'description': desc},
                          positional_metadata={'quality': phred_scores},
                          **kwargs)
def _parse_fasta_raw(fh, data_parser, format_label):
    """Raw parser for FASTA or QUAL files.

    Returns raw values (seq/qual, id, description). It is the responsibility of
    the caller to construct the correct in-memory object to hold the data.

    Parameters
    ----------
    fh : file-like
        Input handed to ``_line_generator``.
    data_parser : callable
        Called with the list of non-empty data lines collected for one
        record. Its return value is the first element of each yielded tuple.
    format_label : str
        Format name interpolated into the error messages.

    Yields
    ------
    tuple
        ``(data_parser(data_chunks), id_, desc)`` for every record. Nothing is
        yielded when the input contains no non-blank line.

    Raises
    ------
    FASTAFormatError
        If the first non-blank line does not start with ``>``, or if a data
        line directly follows a blank line.

    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # No non-blank line at all. Letting StopIteration escape a generator
        # body raises RuntimeError on Python 3.7+ (PEP 479), so finish the
        # generator explicitly with zero records.
        return

    # header check inlined here and below for performance
    if seq_header.startswith('>'):
        id_, desc = _parse_fasta_like_header(seq_header)
    else:
        raise FASTAFormatError(
            "Found non-header line when attempting to read the 1st %s record:"
            "\n%s" % (format_label, seq_header))

    data_chunks = []
    # `prev` remembers the previous line so that a data line following a blank
    # line can be rejected, while blank lines before a header or at the end of
    # the input are tolerated.
    # NOTE(review): assumes _line_generator yields blank lines as empty
    # strings when skip_blanks=False -- confirm against its definition.
    prev = seq_header
    for line in _line_generator(fh, skip_blanks=False):
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(data_chunks), id_, desc
            data_chunks = []
            id_, desc = _parse_fasta_like_header(line)
        else:
            if line:
                # ensure no blank lines within a single record
                if not prev:
                    raise FASTAFormatError(
                        "Found blank or whitespace-only line within %s "
                        "record." % format_label)
                data_chunks.append(line)
        prev = line
    # yield last record in file
    yield data_parser(data_chunks), id_, desc
def _parse_fasta_raw(fh, data_parser, error_type):
    """Raw parser for FASTA or QUAL files.

    Returns raw values (seq/qual, id, description). It is the responsibility of
    the caller to construct the correct in-memory object to hold the data.

    Parameters
    ----------
    fh : file-like
        Input handed to ``_line_generator``.
    data_parser : callable
        Called with the list of non-empty data lines collected for one
        record. Its return value is the first element of each yielded tuple.
    error_type : callable
        Exception class (or factory) invoked with a message and raised on
        malformed input.

    Yields
    ------
    tuple
        ``(data_parser(data_chunks), id_, desc)`` for every record. Nothing is
        yielded when the input contains no non-blank line.

    Raises
    ------
    error_type
        If the first non-blank line does not start with ``>``, or if a data
        line directly follows a blank line.

    """
    # Skip any blank or whitespace-only lines at beginning of file
    try:
        seq_header = next(_line_generator(fh, skip_blanks=True))
    except StopIteration:
        # No non-blank line at all. Letting StopIteration escape a generator
        # body raises RuntimeError on Python 3.7+ (PEP 479), so finish the
        # generator explicitly with zero records.
        return

    # header check inlined here and below for performance
    if seq_header.startswith('>'):
        id_, desc = _parse_fasta_like_header(seq_header)
    else:
        raise error_type(
            "Found non-header line when attempting to read the 1st record:"
            "\n%s" % seq_header)

    data_chunks = []
    # `prev` remembers the previous line so that a data line following a blank
    # line can be rejected, while blank lines before a header or at the end of
    # the input are tolerated.
    # NOTE(review): assumes _line_generator yields blank lines as empty
    # strings when skip_blanks=False -- confirm against its definition.
    prev = seq_header
    for line in _line_generator(fh, skip_blanks=False):
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(data_chunks), id_, desc
            data_chunks = []
            id_, desc = _parse_fasta_like_header(line)
        else:
            if line:
                # ensure no blank lines within a single record
                if not prev:
                    raise error_type(
                        "Found blank or whitespace-only line within record.")
                data_chunks.append(line)
        prev = line
    # yield last record in file
    yield data_parser(data_chunks), id_, desc
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=BiologicalSequence):
    """Lazily parse FASTQ records from `fh`, yielding one object per record.

    Parameters
    ----------
    fh : file-like
        Input handed to ``_line_generator`` and to the sequence/quality
        parsing helpers.
    variant, phred_offset
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable, optional
        Called as ``constructor(seq, id=..., description=..., quality=...)``
        to build each yielded object.

    Yields
    ------
    object
        Whatever `constructor` returns for each record. Nothing is yielded
        when ``_line_generator`` produces no line.

    Raises
    ------
    FASTQFormatError
        If the first line does not start with ``@``, or if a quality header
        other than a bare ``+`` does not repeat the sequence header text.

    """
    try:
        seq_header = next(_line_generator(fh))
    except StopIteration:
        # Nothing to read. Letting StopIteration escape a generator body
        # raises RuntimeError on Python 3.7+ (PEP 479), so finish the
        # generator explicitly with zero records.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % seq_header)

    # NOTE(review): the loop relies on _parse_quality_scores handing back the
    # next sequence header, or None once the input is exhausted -- confirm.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh)

        # A quality header is either a bare '+' or '+' followed by the same
        # text that followed '@' on the sequence header.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        phred_scores, seq_header = _parse_quality_scores(fh, len(seq),
                                                         variant,
                                                         phred_offset)
        yield constructor(seq, id=id_, description=desc,
                          quality=phred_scores)
def _fastq_to_generator(fh, variant=None, phred_offset=None,
                        constructor=BiologicalSequence):
    """Lazily parse FASTQ records from `fh`, yielding one object per record.

    Parameters
    ----------
    fh : file-like
        Input handed to ``_line_generator`` and to the sequence/quality
        parsing helpers.
    variant, phred_offset
        Forwarded unchanged to ``_parse_quality_scores``.
    constructor : callable, optional
        Called as ``constructor(seq, id=..., description=..., quality=...)``
        to build each yielded object.

    Yields
    ------
    object
        Whatever `constructor` returns for each record. Nothing is yielded
        when ``_line_generator`` produces no line.

    Raises
    ------
    FASTQFormatError
        If the first line does not start with ``@``, or if a quality header
        other than a bare ``+`` does not repeat the sequence header text.

    """
    try:
        seq_header = next(_line_generator(fh))
    except StopIteration:
        # Nothing to read. Letting StopIteration escape a generator body
        # raises RuntimeError on Python 3.7+ (PEP 479), so finish the
        # generator explicitly with zero records.
        return

    if not seq_header.startswith('@'):
        raise FASTQFormatError(
            "Expected sequence (@) header line at start of file: %r"
            % seq_header)

    # NOTE(review): the loop relies on _parse_quality_scores handing back the
    # next sequence header, or None once the input is exhausted -- confirm.
    while seq_header is not None:
        id_, desc = _parse_fasta_like_header(seq_header)
        seq, qual_header = _parse_sequence_data(fh)

        # A quality header is either a bare '+' or '+' followed by the same
        # text that followed '@' on the sequence header.
        if qual_header != '+' and qual_header[1:] != seq_header[1:]:
            raise FASTQFormatError(
                "Sequence (@) and quality (+) header lines do not match: "
                "%r != %r" % (seq_header[1:], qual_header[1:]))

        phred_scores, seq_header = _parse_quality_scores(
            fh, len(seq), variant, phred_offset)
        yield constructor(seq, id=id_, description=desc,
                          quality=phred_scores)
def test_id_and_description(self):
    # Header carrying both an ID token and a description, followed by
    # trailing whitespace: expect the (id, description) pair.
    header = '>!thus suht! \t\t \n'
    expected = ('!thus', 'suht!')
    self.assertEqual(_parse_fasta_like_header(header), expected)
def test_description_only(self):
    # Whitespace right after '>' means there is no ID; the remaining text is
    # expected back as the description.
    header = '> suht! \t\t \n'
    expected = ('', 'suht!')
    self.assertEqual(_parse_fasta_like_header(header), expected)
def test_id_only(self):
    # A single token directly after '>' is expected back as the ID, with an
    # empty description.
    header = '>suht! \t\t \n'
    expected = ('suht!', '')
    self.assertEqual(_parse_fasta_like_header(header), expected)
def test_no_id_or_description(self):
    # Only whitespace after '>': both the ID and the description are expected
    # to be empty strings.
    header = '> \t\t \n'
    expected = ('', '')
    self.assertEqual(_parse_fasta_like_header(header), expected)
def test_id_and_description(self):
    # Header carrying both an ID token and a description, followed by
    # trailing whitespace: expect the (id, description) pair.
    header = '>!thus suht! \t\t \n'
    expected = ('!thus', 'suht!')
    self.assertEqual(_parse_fasta_like_header(header), expected)
def test_description_only(self):
    # Whitespace right after '>' means there is no ID; the remaining text is
    # expected back as the description.
    header = '> suht! \t\t \n'
    expected = ('', 'suht!')
    self.assertEqual(_parse_fasta_like_header(header), expected)
def test_id_only(self):
    # A single token directly after '>' is expected back as the ID, with an
    # empty description.
    header = '>suht! \t\t \n'
    expected = ('suht!', '')
    self.assertEqual(_parse_fasta_like_header(header), expected)
def test_no_id_or_description(self):
    # Only whitespace after '>': both the ID and the description are expected
    # to be empty strings.
    header = '> \t\t \n'
    expected = ('', '')
    self.assertEqual(_parse_fasta_like_header(header), expected)