def test_quality_filter_sequence_pass(self):
    """quality_filter_sequence functions as expected for good read

    A read with no 'N' and no quality character at or below the 'B'
    threshold is expected back unchanged with status code 0.
    """
    header = "990:2:4:11271:5323#1/1"
    sequence = (
        "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC")
    # Raw string: the quality line contains a literal backslash followed by
    # a backtick, which is an invalid escape sequence in a normal literal.
    quality = (
        r"bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`")

    actual = quality_filter_sequence(
        header,
        sequence,
        quality,
        max_bad_run_length=0,
        last_bad_quality_char='B',
        min_per_read_length=75,
        seq_max_N=0,
        filter_bad_illumina_qual_digit=True)

    # Nothing is trimmed, so the expected read equals the input read.
    self.assertEqual(actual, (0, sequence, quality))
def test_quality_filter_sequence_fail_w_N(self):
    """quality_filter_sequence handles N as expected

    Covers three scenarios: an 'N' that fails the read, a raised
    seq_max_N that rescues it, and an 'N' removed by quality truncation.
    """
    header = "990:2:4:11271:5323#1/1"
    # Raw strings: the quality lines contain a literal backslash followed by
    # a backtick, which is an invalid escape sequence in a normal literal.
    good_quality = (
        r"bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`")
    internal_n_sequence = (
        "GCACTCACCGCCCGTCACACCACGAAAGTNGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC")

    # 'N' in sequence causes failure
    actual = quality_filter_sequence(
        header,
        internal_n_sequence,
        good_quality,
        max_bad_run_length=0,
        last_bad_quality_char='B',
        min_per_read_length=75,
        seq_max_N=0,
        filter_bad_illumina_qual_digit=True)
    expected = (2, internal_n_sequence, good_quality)
    self.assertEqual(actual, expected)

    # increasing max N rescues sequence
    actual = quality_filter_sequence(
        header,
        internal_n_sequence,
        good_quality,
        max_bad_run_length=0,
        last_bad_quality_char='B',
        min_per_read_length=75,
        seq_max_N=1,
        filter_bad_illumina_qual_digit=True)
    expected = (0, internal_n_sequence, good_quality)
    self.assertEqual(actual, expected)

    # truncation of N rescues sequence (sequence is truncated when
    # the quality hits B, and the truncated sequence is above the
    # length threshold and no longer contains an N)
    trailing_n_sequence = (
        "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTN")
    late_b_quality = (
        r"bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^B`")
    actual = quality_filter_sequence(
        header,
        trailing_n_sequence,
        late_b_quality,
        max_bad_run_length=0,
        last_bad_quality_char='B',
        min_per_read_length=50,
        seq_max_N=0,
        filter_bad_illumina_qual_digit=True)
    expected = (
        0,
        "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTG",
        r"bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^")
    self.assertEqual(actual, expected)
def test_quality_filter_sequence_fail_w_B(self):
    """quality_filter_sequence handles bad qual score as expected

    The same read, carrying a single early 'B' quality character, is
    filtered four times; each call varies exactly one parameter.
    """
    header = "990:2:4:11271:5323#1/1"
    sequence = (
        "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC")
    # Raw string: the quality line contains a literal backslash followed by
    # a backtick, which is an invalid escape sequence in a normal literal.
    quality = (
        r"bbbbbbbbbbbbbbbbbbBbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`")
    # Portion of the read that precedes the 'B' quality character.
    truncated_sequence = "GCACTCACCGCCCGTCAC"
    truncated_quality = "bbbbbbbbbbbbbbbbbb"

    # early 'B' in sequence causes truncation and too short of a read
    actual = quality_filter_sequence(
        header,
        sequence,
        quality,
        max_bad_run_length=0,
        last_bad_quality_char='B',
        min_per_read_length=75,
        seq_max_N=0,
        filter_bad_illumina_qual_digit=True)
    self.assertEqual(actual, (1, truncated_sequence, truncated_quality))

    # increasing max_bad_run_length rescues read
    actual = quality_filter_sequence(
        header,
        sequence,
        quality,
        max_bad_run_length=1,
        last_bad_quality_char='B',
        min_per_read_length=75,
        seq_max_N=0,
        filter_bad_illumina_qual_digit=True)
    self.assertEqual(actual, (0, sequence, quality))

    # changing threshold rescues read
    actual = quality_filter_sequence(
        header,
        sequence,
        quality,
        max_bad_run_length=0,
        last_bad_quality_char='A',
        min_per_read_length=75,
        seq_max_N=0,
        filter_bad_illumina_qual_digit=True)
    self.assertEqual(actual, (0, sequence, quality))

    # changing min_per_read_length rescues read
    actual = quality_filter_sequence(
        header,
        sequence,
        quality,
        max_bad_run_length=0,
        last_bad_quality_char='B',
        min_per_read_length=5,
        seq_max_N=0,
        filter_bad_illumina_qual_digit=True)
    self.assertEqual(actual, (0, truncated_sequence, truncated_quality))
def test_quality_filter_illumina_qual(self):
    """quality_filter_sequence functions as expected with bad illumina qual digit

    Every scenario filters the same good read; only the header and the
    filter_bad_illumina_qual_digit flag change, so the cases are table-driven.
    """
    sequence = (
        "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC")
    # Raw string: the quality line contains a literal backslash followed by
    # a backtick, which is an invalid escape sequence in a normal literal.
    quality = (
        r"bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`")

    # (header, filter_bad_illumina_qual_digit, expected status code)
    scenarios = [
        # header with no qual data passes
        ("990:2:4:11271:5323/1", True, 0),
        # header with no qual data passes
        ("990:2:4:11271:5323/0", True, 0),
        # header with no qual data passes (old barcode in header format)
        ("HWI-6X_9267:1:1:4:1699#ACCACCC/1", True, 0),
        # bad qual fails filter
        ("@HWI-ST753_50:6:1101:1138:1965#0/1", True, 3),
        # bad qual passes filter if filter turned off
        ("@HWI-ST753_50:6:1101:1138:1965#0/1", False, 0),
        # good qual passes filter
        ("@HWI-ST753_50:6:1101:1138:1965#1/1", True, 0),
    ]

    for header, filter_qual_digit, expected_status in scenarios:
        actual = quality_filter_sequence(
            header,
            sequence,
            quality,
            max_bad_run_length=0,
            last_bad_quality_char='B',
            min_per_read_length=75,
            seq_max_N=0,
            filter_bad_illumina_qual_digit=filter_qual_digit)
        # The read itself is expected back unmodified in every scenario,
        # including the one that fails with status 3.
        self.assertEqual(actual, (expected_status, sequence, quality))