def test_sanger_variant(self):
    # Every character this test treats as valid for the 'sanger' variant,
    # in ascending order; they are expected to decode to 0, 1, ..., 93.
    sanger_alphabet = ('!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOP'
                       'QRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~')
    decoded = _decode_qual_to_phred(sanger_alphabet, variant='sanger')
    expected_scores = np.arange(94)
    npt.assert_equal(decoded, expected_scores)

    # A space is not part of the alphabet above, so decoding must fail and
    # the error is expected to report the valid score range.
    with self.assertRaises(ValueError) as raised:
        _decode_qual_to_phred('a b', variant='sanger')
    message = str(raised.exception)
    self.assertIn('[0, 93]', message)
def test_illumina18_variant(self):
    # Every character this test treats as valid for 'illumina1.8', in
    # ascending order; they are expected to decode to 0, 1, ..., 62.
    illumina18_alphabet = ('!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKL'
                           'MNOPQRSTUVWXYZ[\\]^_')
    decoded = _decode_qual_to_phred(illumina18_alphabet,
                                    variant='illumina1.8')
    expected_scores = np.arange(63)
    npt.assert_equal(decoded, expected_scores)

    # 'a' is not in the alphabet above, so the call must be rejected with a
    # message that reports the valid score range.
    with self.assertRaises(ValueError) as raised:
        _decode_qual_to_phred('AaB', variant='illumina1.8')
    message = str(raised.exception)
    self.assertIn('[0, 62]', message)
def test_illumina13_variant(self):
    # Every character this test treats as valid for 'illumina1.3', in
    # ascending order; they are expected to decode to 0, 1, ..., 62.
    illumina13_alphabet = ('@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijk'
                           'lmnopqrstuvwxyz{|}~')
    decoded = _decode_qual_to_phred(illumina13_alphabet,
                                    variant='illumina1.3')
    expected_scores = np.arange(63)
    npt.assert_equal(decoded, expected_scores)

    # '!' is not in the alphabet above, so the call must be rejected with a
    # message that reports the valid score range.
    with self.assertRaises(ValueError) as raised:
        _decode_qual_to_phred('a!b', variant='illumina1.3')
    message = str(raised.exception)
    self.assertIn('[0, 62]', message)
def _parse_quality_scores(fh, seq_len, variant, phred_offset, prev):
    """Read and decode the quality-score lines of a single record.

    Lines are taken from ``_line_generator(fh, skip_blanks=False)`` until
    either a line starting with ``'@'`` arrives after exactly `seq_len`
    quality characters have been read, or the generator is exhausted.

    Parameters
    ----------
    fh
        Passed straight to ``_line_generator``.
    seq_len : int
        Number of quality characters the record must contain.
    variant, phred_offset
        Forwarded to ``_decode_qual_to_phred`` for every quality line.
    prev
        Presumably the line read immediately before the quality section
        (verify against the caller). If it is falsy when a non-empty quality
        line is met, ``_blank_error`` is called.

    Returns
    -------
    tuple
        ``(scores, header)`` where ``scores`` is ``np.hstack`` of the decoded
        lines and ``header`` is the ``'@'`` line that ended the record, or
        ``None`` when the input ran out.

    Raises
    ------
    FASTQFormatError
        If more than `seq_len` quality characters are found, or if the input
        ends before exactly `seq_len` characters were read.
    """
    decoded_lines = []
    n_seen = 0
    for line in _line_generator(fh, skip_blanks=False):
        if line:
            # A leading '@' only ends the record once the expected number of
            # quality characters has been read; before that the line is
            # handled as quality data.
            if n_seen == seq_len and line.startswith('@'):
                return np.hstack(decoded_lines), line

            # A falsy `prev` means the previous line was blank (or nothing
            # usable preceded this one).
            if not prev:
                _blank_error("after '+' or within quality scores")

            n_seen += len(line)
            overflow = n_seen - seq_len
            if overflow > 0:
                raise FASTQFormatError(
                    "Found more quality score characters than sequence "
                    "characters. Extra quality score characters: %r" %
                    line[-overflow:])

            decoded_lines.append(
                _decode_qual_to_phred(line, variant=variant,
                                      phred_offset=phred_offset))
        # Remember this line (blank or not) for the next iteration's check.
        prev = line

    if n_seen != seq_len:
        raise FASTQFormatError(
            "Found incomplete/truncated FASTQ record at end of file.")
    return np.hstack(decoded_lines), None
def _qseq_to_generator(fh, constructor=Sequence, filter=_will_filter,
                       phred_offset=_default_phred_offset,
                       variant=_default_variant, **kwargs):
    """Yield one `constructor` object per accepted line of `fh`.

    Each line is split by ``_record_parser`` into eleven fields. A record is
    skipped only when `filter` is truthy and the record's ``filtered`` field
    is truthy. For every other record the raw quality string is decoded with
    ``_decode_qual_to_phred`` and `constructor` is called with the sequence,
    a ``metadata`` dict (id plus run/lane/tile/x/y/index/read values as
    ``int``), ``positional_metadata={'quality': ...}`` and any extra
    `kwargs`.
    """
    for record in fh:
        (machine_name, run, lane, tile, x, y, index, read, seq, raw_qual,
         filtered) = _record_parser(record)

        if filter and filtered:
            continue

        quality = _decode_qual_to_phred(raw_qual, variant, phred_offset)
        id_fields = (machine_name, run, lane, tile, x, y, index, read)
        seq_id = '%s_%s:%s:%s:%s:%s#%s/%s' % id_fields

        metadata = {
            'id': seq_id,
            'machine_name': machine_name,
            'run_number': int(run),
            'lane_number': int(lane),
            'tile_number': int(tile),
            'x': int(x),
            'y': int(y),
            'index': int(index),
            'read_number': int(read),
        }
        positional_metadata = {'quality': quality}
        yield constructor(seq, metadata=metadata,
                          positional_metadata=positional_metadata, **kwargs)
def _qseq_to_generator(fh, constructor=Sequence, filter=_will_filter,
                       phred_offset=_default_phred_offset,
                       variant=_default_variant):
    """Yield one `constructor` object per accepted line of `fh`.

    Each line is split by ``_record_parser`` into eleven fields. A record is
    skipped only when `filter` is truthy and the record's ``filtered`` field
    is truthy. Otherwise the raw quality string is decoded with
    ``_decode_qual_to_phred`` and `constructor` is called with the sequence,
    ``quality=<decoded scores>`` and ``id=<formatted identifier>``.
    """
    for record in fh:
        (machine_name, run, lane, tile, x, y, index, read, seq, raw_qual,
         filtered) = _record_parser(record)

        if filter and filtered:
            continue

        quality = _decode_qual_to_phred(raw_qual, variant, phred_offset)
        id_fields = (machine_name, run, lane, tile, x, y, index, read)
        yield constructor(seq, quality=quality,
                          id='%s_%s:%s:%s:%s:%s#%s/%s' % id_fields)
def _qseq_to_generator(fh, constructor=BiologicalSequence,
                       filter=_will_filter,
                       phred_offset=_default_phred_offset,
                       variant=_default_variant):
    """Yield one `constructor` object per accepted line of `fh`.

    ``_record_parser`` splits each line into eleven fields. Records whose
    ``filtered`` field is truthy are dropped when `filter` is truthy; all
    others have their raw quality string decoded by
    ``_decode_qual_to_phred`` and are passed to `constructor` as
    ``constructor(seq, quality=..., id=...)``.
    """
    for record in fh:
        fields = _record_parser(record)
        (machine_name, run, lane, tile, x, y, index, read, seq, raw_qual,
         filtered) = fields

        keep = not filter or not filtered
        if not keep:
            continue

        scores = _decode_qual_to_phred(raw_qual, variant, phred_offset)
        identifier = '%s_%s:%s:%s:%s:%s#%s/%s' % (
            machine_name, run, lane, tile, x, y, index, read)
        yield constructor(seq, quality=scores, id=identifier)
def test_custom_phred_offset(self):
    # 51 consecutive characters starting at '*'; with an offset of 42 they
    # are expected to decode to 0, 1, ..., 50.
    qual_chars = '*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\'
    decoded = _decode_qual_to_phred(qual_chars, phred_offset=42)
    npt.assert_equal(decoded, np.arange(51))

    # One step higher the same string must be rejected (presumably because
    # '*' then decodes below zero); the error should report the valid range.
    with self.assertRaises(ValueError) as raised:
        _decode_qual_to_phred(qual_chars, phred_offset=43)
    self.assertIn('[0, 83]', str(raised.exception))

    # Offsets 0 and 127 must both be refused with a message that names the
    # parameter and mentions 'printable'.
    for bad_offset in (0, 127):
        with self.assertRaises(ValueError) as raised:
            _decode_qual_to_phred(qual_chars, phred_offset=bad_offset)
        message = str(raised.exception)
        self.assertIn('`phred_offset`', message)
        self.assertIn('printable', message)
def _parse_quality_scores(fh, seq_len, variant, phred_offset):
    """Read and decode the quality-score lines of a single record.

    Lines are taken from ``_line_generator(fh)`` until either a line starting
    with ``'@'`` arrives after exactly `seq_len` quality characters have been
    read, or the generator is exhausted.

    Parameters
    ----------
    fh
        Passed straight to ``_line_generator``.
    seq_len : int
        Number of quality characters the record must contain.
    variant, phred_offset
        Forwarded to ``_decode_qual_to_phred`` for every quality line.

    Returns
    -------
    tuple
        ``(scores, header)`` where ``scores`` is a list extended with the
        decoded values of each line and ``header`` is the ``'@'`` line that
        ended the record, or ``None`` when the input ran out.

    Raises
    ------
    FASTQFormatError
        If more than `seq_len` quality characters are found, or if the input
        ends before exactly `seq_len` characters were read.
    """
    scores = []
    n_seen = 0
    for line in _line_generator(fh):
        # A leading '@' only ends the record once the expected number of
        # quality characters has been read; before that the line is handled
        # as quality data.
        if n_seen == seq_len and line.startswith('@'):
            return scores, line

        n_seen += len(line)
        overflow = n_seen - seq_len
        if overflow > 0:
            raise FASTQFormatError(
                "Found more quality score characters than sequence "
                "characters. Extra quality score characters: %r" %
                line[-overflow:])

        line_scores = _decode_qual_to_phred(line, variant=variant,
                                            phred_offset=phred_offset)
        scores.extend(line_scores)

    if n_seen != seq_len:
        raise FASTQFormatError(
            "Found incomplete/truncated FASTQ record at end of file.")
    return scores, None
def test_solexa_variant(self):
    # Decoding with variant='solexa' is expected to raise
    # NotImplementedError, with '719' somewhere in the message (presumably
    # an issue number -- confirm against the implementation).
    with self.assertRaises(NotImplementedError) as raised:
        _decode_qual_to_phred('abcd', variant='solexa')
    message = str(raised.exception)
    self.assertIn('719', message)
def test_empty_qual_str(self):
    # An empty quality string is expected to decode to an empty list.
    decoded = _decode_qual_to_phred('', variant='sanger')
    self.assertEqual(decoded, [])
def test_empty_qual_str(self):
    # An empty quality string is expected to compare equal to an empty
    # uint8 array.
    decoded = _decode_qual_to_phred('', variant='sanger')
    expected = np.array([], dtype=np.uint8)
    npt.assert_equal(decoded, expected)
def test_unrecognized_variant(self):
    # 'illumina' is passed as a variant name that must be rejected; the
    # error should mention 'variant' and quote the offending value.
    with self.assertRaises(ValueError) as raised:
        _decode_qual_to_phred('abcd', variant='illumina')
    message = str(raised.exception)
    for fragment in ('variant', "'illumina'"):
        self.assertIn(fragment, message)
def test_variant_and_phred_offset_provided(self):
    # Supplying `variant` and `phred_offset` together must be rejected; the
    # message should say 'both' and name each parameter.
    with self.assertRaises(ValueError) as raised:
        _decode_qual_to_phred('abcd', variant='sanger', phred_offset=64)
    message = str(raised.exception)
    for fragment in ('both', '`variant`', '`phred_offset`'):
        self.assertIn(fragment, message)
def test_missing_variant_and_phred_offset(self):
    # Supplying neither `variant` nor `phred_offset` must be rejected; the
    # message should name both parameters and mention 'decode'.
    with self.assertRaises(ValueError) as raised:
        _decode_qual_to_phred('abcd')
    message = str(raised.exception)
    for fragment in ('`variant`', '`phred_offset`', 'decode'):
        self.assertIn(fragment, message)