def _parse_fasta_raw(fh, data_parser, format_label):
    """Raw parser for FASTA or QUAL files.

    Returns raw values (seq/qual, id, description). It is the responsibility of
    the caller to construct the correct in-memory object to hold the data.

    Parameters
    ----------
    fh : iterator of str
        Source of lines. It is consumed with ``next()`` and a ``for`` loop,
        so it must be an iterator (e.g. an open file handle).
    data_parser : callable
        Called once per record with the list of stripped, non-empty data
        lines that followed the record's header. The list is empty when a
        header is immediately followed by another header or by end of input.
        Its return value becomes the first element of the yielded tuple.
    format_label : str
        Label interpolated into error messages (callers in this module pass
        ``'FASTA'`` or ``'QUAL'``).

    Yields
    ------
    tuple
        ``(data_parser(lines), id, description)`` for each record, where id
        and description come from ``_parse_fasta_like_header``.

    Raises
    ------
    FASTAFormatError
        If the first line does not start with ``'>'``, or if a blank or
        whitespace-only line is found after the first header.

    Notes
    -----
    Empty input yields no records.

    """
    try:
        line = next(fh)
    except StopIteration:
        # Empty input. A StopIteration that escapes a generator body is
        # turned into RuntimeError (PEP 479), so end the generator
        # explicitly: no lines means no records.
        return

    # header check inlined here and below for performance
    if line.startswith('>'):
        id_, desc = _parse_fasta_like_header(line)
    else:
        raise FASTAFormatError(
            "Found line without a header in %s-formatted file:\n%s" %
            (format_label, line))

    data_chunks = []
    for line in fh:
        if line.startswith('>'):
            # new header, so yield current record and reset state
            yield data_parser(data_chunks), id_, desc
            data_chunks = []
            id_, desc = _parse_fasta_like_header(line)
        else:
            line = line.strip()
            if line:
                data_chunks.append(line)
            else:
                raise FASTAFormatError(
                    "Found blank or whitespace-only line in %s-formatted "
                    "file." % format_label)

    # yield last record in file
    yield data_parser(data_chunks), id_, desc
def _fasta_to_generator(fh, qual=FileSentinel, constructor=BiologicalSequence):
    """Yield one ``constructor`` object per FASTA record, optionally with QUAL.

    Parameters
    ----------
    fh : iterator of str
        Lines of the FASTA input, handed to ``_parse_fasta_raw``.
    qual : iterator of str or None
        When ``None``, only the FASTA input is read. Otherwise it is parsed
        as QUAL input in lockstep with ``fh``.
        NOTE(review): the ``FileSentinel`` default is presumably resolved by
        the caller before iteration starts -- confirm.
    constructor : callable
        Called as ``constructor(seq, id=..., description=...)`` and, when
        QUAL input is used, additionally with ``quality=...``.

    Yields
    ------
    object
        Whatever ``constructor`` returns for each record.

    Raises
    ------
    FASTAFormatError
        If the two inputs hold a different number of records, or if the IDs
        or descriptions of paired records differ.

    """
    fasta_records = _parse_fasta_raw(fh, _parse_sequence_data, 'FASTA')

    # FASTA-only mode: nothing to pair, just wrap every record.
    if qual is None:
        for sequence, seq_id, seq_desc in fasta_records:
            yield constructor(sequence, id=seq_id, description=seq_desc)
        return

    qual_records = _parse_fasta_raw(qual, _parse_quality_scores, 'QUAL')
    paired = zip_longest(fasta_records, qual_records, fillvalue=None)

    for fasta_entry, qual_entry in paired:
        # A None on either side means that input ran out of records first.
        if fasta_entry is None:
            raise FASTAFormatError(
                "QUAL file has more records than FASTA file.")
        if qual_entry is None:
            raise FASTAFormatError(
                "FASTA file has more records than QUAL file.")

        sequence, seq_id, seq_desc = fasta_entry
        scores, scores_id, scores_desc = qual_entry

        if seq_id != scores_id:
            raise FASTAFormatError(
                "IDs do not match between FASTA and QUAL records: %r != %r"
                % (seq_id, scores_id))
        if seq_desc != scores_desc:
            raise FASTAFormatError(
                "Descriptions do not match between FASTA and QUAL "
                "records: %r != %r" % (seq_desc, scores_desc))

        # sequence and quality scores lengths are checked in constructor
        yield constructor(sequence, id=seq_id, description=seq_desc,
                          quality=scores)
def _parse_quality_scores(chunks):
    """Convert the data lines of one QUAL record into an integer array.

    Parameters
    ----------
    chunks : list of str
        Data lines of a single record. They are joined with spaces and then
        split on whitespace, so scores may be spread over any number of lines.

    Returns
    -------
    numpy.ndarray
        Array built with ``dtype=int`` from the whitespace-separated tokens.

    Raises
    ------
    FASTAFormatError
        If ``chunks`` is empty, or if any token cannot be converted to an
        integer.

    """
    if not chunks:
        raise FASTAFormatError("Found QUAL header without quality scores.")

    joined = ' '.join(chunks)
    tokens = joined.split()
    try:
        scores = np.asarray(tokens, dtype=int)
    except ValueError:
        raise FASTAFormatError(
            "Could not convert quality scores to integers:\n%s" % joined)
    return scores
def _fasta_to_generator(fh, qual=FileSentinel, constructor=Sequence, **kwargs):
    """Yield one ``constructor`` object per FASTA record, optionally with QUAL.

    Parameters
    ----------
    fh : iterator of str
        Lines of the FASTA input, handed to ``_parse_fasta_raw``.
    qual : iterator of str or None
        When ``None``, only the FASTA input is read. Otherwise it is parsed
        as QUAL input in lockstep with ``fh``.
        NOTE(review): the ``FileSentinel`` default is presumably resolved by
        the caller before iteration starts -- confirm.
    constructor : callable
        Called as ``constructor(seq, metadata={...}, **kwargs)`` and, when
        QUAL input is used, additionally with
        ``positional_metadata={'quality': scores}``.
    **kwargs
        Forwarded unchanged to every ``constructor`` call.

    Yields
    ------
    object
        Whatever ``constructor`` returns for each record.

    Raises
    ------
    FASTAFormatError
        If the two inputs hold a different number of records, or if the IDs
        or descriptions of paired records differ.

    """
    fasta_records = _parse_fasta_raw(fh, _parse_sequence_data,
                                     FASTAFormatError)

    # FASTA-only mode: nothing to pair, just wrap every record.
    if qual is None:
        for sequence, seq_id, seq_desc in fasta_records:
            record_metadata = {'id': seq_id, 'description': seq_desc}
            yield constructor(sequence, metadata=record_metadata, **kwargs)
        return

    qual_records = _parse_fasta_raw(qual, _parse_quality_scores,
                                    QUALFormatError)
    paired = zip_longest(fasta_records, qual_records, fillvalue=None)

    for fasta_entry, qual_entry in paired:
        # A None on either side means that input ran out of records first.
        if fasta_entry is None:
            raise FASTAFormatError(
                "QUAL file has more records than FASTA file.")
        if qual_entry is None:
            raise FASTAFormatError(
                "FASTA file has more records than QUAL file.")

        sequence, seq_id, seq_desc = fasta_entry
        scores, scores_id, scores_desc = qual_entry

        if seq_id != scores_id:
            raise FASTAFormatError(
                "IDs do not match between FASTA and QUAL records: %r != %r"
                % (seq_id, scores_id))
        if seq_desc != scores_desc:
            raise FASTAFormatError(
                "Descriptions do not match between FASTA and QUAL "
                "records: %r != %r" % (seq_desc, scores_desc))

        # sequence and quality scores lengths are checked in constructor
        record_metadata = {'id': seq_id, 'description': seq_desc}
        yield constructor(sequence, metadata=record_metadata,
                          positional_metadata={'quality': scores},
                          **kwargs)
def _sniffer_data_parser(chunks):
    """Data parser that accepts sequence data but rejects quality scores.

    The record's lines are first joined as sequence data, then an attempt is
    made to parse the same lines as quality scores. Only data that does NOT
    parse as quality scores is accepted.

    Parameters
    ----------
    chunks : list of str
        Data lines of a single record.

    Returns
    -------
    str
        The value returned by ``_parse_sequence_data(chunks)``.

    Raises
    ------
    FASTAFormatError
        If ``_parse_sequence_data`` rejects ``chunks``, or if ``chunks``
        parses successfully as quality scores.

    """
    data = _parse_sequence_data(chunks)
    try:
        _parse_quality_scores(chunks)
    except (QUALFormatError, FASTAFormatError):
        # The quality-score parser may signal a failed conversion with
        # either error type; both mean "these are not quality scores", so
        # the lines are accepted as sequence data.
        return data
    else:
        # used for flow control within sniffer, user should never see this
        # message
        raise FASTAFormatError('Data appear to be quality scores.')
def _parse_sequence_data(chunks):
    """Concatenate the data lines of one record into a single string.

    Parameters
    ----------
    chunks : list of str
        Data lines of a single record.

    Returns
    -------
    str
        All elements of ``chunks`` joined with no separator.

    Raises
    ------
    FASTAFormatError
        If ``chunks`` is empty.

    """
    if chunks:
        return ''.join(chunks)
    raise FASTAFormatError("Found header without sequence data.")