def test_quality_filter_sequence_pass(self):
     """quality_filter_sequence functions as expected for good read
     """
     header = "990:2:4:11271:5323#1/1"
     sequence = \
      "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
     quality =  \
      "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
     actual = quality_filter_sequence(header,
                                      sequence,
                                      quality,
                                      max_bad_run_length=0,
                                      last_bad_quality_char='B',
                                      min_per_read_length=75,
                                      seq_max_N=0,
                                      filter_bad_illumina_qual_digit=True)
     self.assertEqual(actual,(0,
      "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC",
      "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"))
    def test_quality_filter_sequence_fail_w_N(self):
        """quality_filter_sequence handles N as expected
        """
        
        # 'N' in sequence causes failure
        header = "990:2:4:11271:5323#1/1"
        sequence = \
         "GCACTCACCGCCCGTCACACCACGAAAGTNGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
        quality =  \
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
        actual = quality_filter_sequence(header,
                                         sequence,
                                         quality,
                                         max_bad_run_length=0,
                                         last_bad_quality_char='B',
                                         min_per_read_length=75,
                                         seq_max_N=0,
                                         filter_bad_illumina_qual_digit=True)
        expected = (2,
         "GCACTCACCGCCCGTCACACCACGAAAGTNGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC",
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`")
        self.assertEqual(actual,expected)
        
        # increasing max N rescues sequence
        header = "990:2:4:11271:5323#1/1"
        sequence = \
         "GCACTCACCGCCCGTCACACCACGAAAGTNGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
        quality =  \
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
        actual = quality_filter_sequence(header,
                                         sequence,
                                         quality,
                                         max_bad_run_length=0,
                                         last_bad_quality_char='B',
                                         min_per_read_length=75,
                                         seq_max_N=1,
                                         filter_bad_illumina_qual_digit=True)

        expected = (0,
         "GCACTCACCGCCCGTCACACCACGAAAGTNGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC",
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`")
        self.assertEqual(actual,expected)
        
        # truncation of N rescues sequence (sequence is truncated when 
        # the quality hits B, and the truncated sequence is above the 
        # length threshold and no longer contains an N)
        header = "990:2:4:11271:5323#1/1"
        sequence = \
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTN"
        quality =  \
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^B`"
        actual = quality_filter_sequence(header,
                                         sequence,
                                         quality,
                                         max_bad_run_length=0,
                                         last_bad_quality_char='B',
                                         min_per_read_length=50,
                                         seq_max_N=0,
                                         filter_bad_illumina_qual_digit=True)

        expected = (0,
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTG",
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^")
        self.assertEqual(actual,expected)
 def test_quality_filter_sequence_fail_w_B(self):
     """quality_filter_sequence handles bad qual score as expected
     """
     
     # early 'B' in sequence causes truncation and too short of a read
     header = "990:2:4:11271:5323#1/1"
     sequence = \
      "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
     quality =  \
      "bbbbbbbbbbbbbbbbbbBbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
     actual = quality_filter_sequence(header,
                                      sequence,
                                      quality,
                                      max_bad_run_length=0,
                                      last_bad_quality_char='B',
                                      min_per_read_length=75,
                                      seq_max_N=0,
                                      filter_bad_illumina_qual_digit=True)
     self.assertEqual(actual,(1,"GCACTCACCGCCCGTCAC","bbbbbbbbbbbbbbbbbb"))
     
     # increasing max_bad_run_length rescues read
     header = "990:2:4:11271:5323#1/1"
     sequence = \
      "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
     quality =  \
      "bbbbbbbbbbbbbbbbbbBbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
     actual = quality_filter_sequence(header,
                                      sequence,
                                      quality,
                                      max_bad_run_length=1,
                                      last_bad_quality_char='B',
                                      min_per_read_length=75,
                                      seq_max_N=0,
                                      filter_bad_illumina_qual_digit=True)
     self.assertEqual(actual,(0,
      "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC",
      "bbbbbbbbbbbbbbbbbbBbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"))
     
     # changing threshold rescues read
     header = "990:2:4:11271:5323#1/1"
     sequence = \
      "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
     quality =  \
      "bbbbbbbbbbbbbbbbbbBbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
     actual = quality_filter_sequence(header,
                                      sequence,
                                      quality,
                                      max_bad_run_length=0,
                                      last_bad_quality_char='A',
                                      min_per_read_length=75,
                                      seq_max_N=0,
                                      filter_bad_illumina_qual_digit=True)
     self.assertEqual(actual,(0,
      "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC",
      "bbbbbbbbbbbbbbbbbbBbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"))
     
     # changing min_per_read_length rescues read
     header = "990:2:4:11271:5323#1/1"
     sequence = \
      "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
     quality =  \
      "bbbbbbbbbbbbbbbbbbBbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
     actual = quality_filter_sequence(header,
                                      sequence,
                                      quality,
                                      max_bad_run_length=0,
                                      last_bad_quality_char='B',
                                      min_per_read_length=5,
                                      seq_max_N=0,
                                      filter_bad_illumina_qual_digit=True)
     self.assertEqual(actual,(0,"GCACTCACCGCCCGTCAC","bbbbbbbbbbbbbbbbbb"))
    def test_quality_filter_illumina_qual(self):
        """quality_filter_sequence functions as expected with bad illumina qual digit
        """
        # header with no qual data passes
        header = "990:2:4:11271:5323/1"
        sequence = \
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
        quality =  \
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
        actual = quality_filter_sequence(header,
                                         sequence,
                                         quality,
                                         max_bad_run_length=0,
                                         last_bad_quality_char='B',
                                         min_per_read_length=75,
                                         seq_max_N=0,
                                         filter_bad_illumina_qual_digit=True)
        self.assertEqual(actual,(0,
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC",
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"))
         
        # header with no qual data passes
        header = "990:2:4:11271:5323/0"
        sequence = \
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
        quality =  \
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
        actual = quality_filter_sequence(header,
                                         sequence,
                                         quality,
                                         max_bad_run_length=0,
                                         last_bad_quality_char='B',
                                         min_per_read_length=75,
                                         seq_max_N=0,
                                         filter_bad_illumina_qual_digit=True)
        self.assertEqual(actual,(0,
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC",
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"))
         
        # header with no qual data passes (old barcode in header format)
        header = "HWI-6X_9267:1:1:4:1699#ACCACCC/1"
        sequence = \
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
        quality =  \
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
        actual = quality_filter_sequence(header,
                                         sequence,
                                         quality,
                                         max_bad_run_length=0,
                                         last_bad_quality_char='B',
                                         min_per_read_length=75,
                                         seq_max_N=0,
                                         filter_bad_illumina_qual_digit=True)
        self.assertEqual(actual,(0,
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC",
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"))
        
        # bad qual fails filter
        header = "@HWI-ST753_50:6:1101:1138:1965#0/1"
        sequence = \
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
        quality =  \
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
        actual = quality_filter_sequence(header,
                                         sequence,
                                         quality,
                                         max_bad_run_length=0,
                                         last_bad_quality_char='B',
                                         min_per_read_length=75,
                                         seq_max_N=0,
                                         filter_bad_illumina_qual_digit=True)
        self.assertEqual(actual,(3,
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC",
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"))

        # bad qual passes filter if filter turned off
        header = "@HWI-ST753_50:6:1101:1138:1965#0/1"
        sequence = \
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
        quality =  \
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
        actual = quality_filter_sequence(header,
                                         sequence,
                                         quality,
                                         max_bad_run_length=0,
                                         last_bad_quality_char='B',
                                         min_per_read_length=75,
                                         seq_max_N=0,
                                         filter_bad_illumina_qual_digit=False)
        self.assertEqual(actual,(0,
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC",
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"))

        # good qual passes filter
        header = "@HWI-ST753_50:6:1101:1138:1965#1/1"
        sequence = \
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC"
        quality =  \
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"
        actual = quality_filter_sequence(header,
                                         sequence,
                                         quality,
                                         max_bad_run_length=0,
                                         last_bad_quality_char='B',
                                         min_per_read_length=75,
                                         seq_max_N=0,
                                         filter_bad_illumina_qual_digit=True)
        self.assertEqual(actual,(0,
         "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC",
         "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`"))