Example #1
0
    def test_parse_illumina_line(self):
        """parse_illumina_line: functions with several lines

        Parses the first two lines of ``illumina_read1`` with a six-base
        barcode, once with ``rev_comp_barcode=True`` and once with
        ``rev_comp_barcode=False``, and checks the full returned dict each
        time.
        """
        illumina_line0 = illumina_read1[0]
        illumina_line1 = illumina_read1[1]
        actual = parse_illumina_line(illumina_line0,
                                     barcode_length=6,
                                     rev_comp_barcode=True)
        expected = {\
         'Full description':'HWI-6X_9267:1:1:4:1699#ACCACCC/1',\
         'Machine Name':'HWI-6X_9267',\
         'Channel Number':1,\
         'Tile Number':1,\
         'X Position':4,\
         'Y Position':1699,\
         'Barcode':'GGTGGT',\
         'Full Y Position Field':'1699#ACCACCC/1',\
         'Sequence':\
          'TACGGAGGGTGCGAGCGTTAATCGCCCCCCCCCCCCCCCCCCCCCCCCCCCC'+\
          'CCCCCCCCCCCCCCCCCCCCCCCGAAAAAAAAAAAAAAAAAAAAAAA',\
         'Quality Score':\
          'abbbbbbbbbb`_`bbbbbb`bb^aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'+\
          'aaaaaaaaaaaaaDaabbBBBBBBBBBBBBBBBBBBB'}
        self.assertEqual(actual, expected)

        # Same line without reverse-complementing: only the barcode changes.
        actual = parse_illumina_line(illumina_line0,
                                     barcode_length=6,
                                     rev_comp_barcode=False)
        expected['Barcode'] = 'ACCACC'
        # This comparison was previously missing, so the case above was
        # computed but never verified.
        self.assertEqual(actual, expected)

        actual = parse_illumina_line(illumina_line1,
                                     barcode_length=6,
                                     rev_comp_barcode=True)
        expected = {\
         'Full description':'HWI-6X_9267:1:1:4:390#ACCTCCC/1',\
         'Machine Name':'HWI-6X_9267',\
         'Channel Number':1,\
         'Tile Number':1,\
         'X Position':4,\
         'Y Position':390,\
         'Barcode':'GGAGGT',\
         'Full Y Position Field':'390#ACCTCCC/1',\
         'Sequence':\
          'GACAGGAGGAGCAAGTGTTATTCAAATTATGCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGG'+\
          'GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAA',\
         'Quality Score':\
          'aaaaaaaaaa```aa\^_aa``aVaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'+\
          'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaBaaaaa'}
        self.assertEqual(actual, expected)

        # Second line without reverse-complementing the barcode.
        actual = parse_illumina_line(illumina_line1,
                                     barcode_length=6,
                                     rev_comp_barcode=False)
        expected['Barcode'] = 'ACCTCC'
        # Previously missing assertion for the final case.
        self.assertEqual(actual, expected)
Example #2
0
 def test_parse_illumina_line_barcode_in_header(self):
     """parse_illumina_line: handles barcode in header correctly

     Parses the first two lines of ``illumina_read1`` with a six-base
     barcode, with and without ``rev_comp_barcode``, and compares the whole
     returned dict against the expected record each time.
     """
     illumina_line0 = illumina_read1[0]
     illumina_line1 = illumina_read1[1]
     actual = parse_illumina_line(
      illumina_line0,barcode_length=6,rev_comp_barcode=True)
     expected = {\
      'Full description':'HWI-6X_9267:1:1:4:1699#ACCACCC/1',\
      'Machine Name':'HWI-6X_9267',\
      'Channel Number':1,\
      'Tile Number':1,\
      'X Position':4,\
      'Y Position':1699,\
      'Barcode':'GGTGGT',\
      'Full Y Position Field':'1699#ACCACCC/1',\
      'Sequence':\
       'TACGGAGGGTGCGAGCGTTAATCGCCCCCCCCCCCCCCCCCCCCCCCCCCCC'+\
       'CCCCCCCCCCCCCCCCCCCCCCCGAAAAAAAAAAAAAAAAAAAAAAA',\
      'Quality Score':\
       'abbbbbbbbbb`_`bbbbbb`bb^aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'+\
       'aaaaaaaaaaaaaDaabbBBBBBBBBBBBBBBBBBBB'}
     self.assertEqual(actual,expected)

     # Same line without reverse-complementing: only the barcode changes.
     actual = parse_illumina_line(
      illumina_line0,barcode_length=6,rev_comp_barcode=False)
     expected['Barcode'] = 'ACCACC'
     # This comparison was previously missing, so the case above was
     # computed but never verified.
     self.assertEqual(actual,expected)

     actual = parse_illumina_line(
      illumina_line1,barcode_length=6,rev_comp_barcode=True)
     expected = {\
      'Full description':'HWI-6X_9267:1:1:4:390#ACCTCCC/1',\
      'Machine Name':'HWI-6X_9267',\
      'Channel Number':1,\
      'Tile Number':1,\
      'X Position':4,\
      'Y Position':390,\
      'Barcode':'GGAGGT',\
      'Full Y Position Field':'390#ACCTCCC/1',\
      'Sequence':\
       'GACAGGAGGAGCAAGTGTTATTCAAATTATGCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGG'+\
       'GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAA',\
      'Quality Score':\
       'aaaaaaaaaa```aa\^_aa``aVaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'+\
       'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaBaaaaa'}
     self.assertEqual(actual,expected)

     # Second line without reverse-complementing the barcode.
     actual = parse_illumina_line(
      illumina_line1,barcode_length=6,rev_comp_barcode=False)
     expected['Barcode'] = 'ACCTCC'
     # Previously missing assertion for the final case.
     self.assertEqual(actual,expected)
Example #3
0
def parse_illumina_paired_end_read_files(read1_file,read2_file,barcode_length,\
    max_bad_run_length,quality_threshold,min_per_read_length,rev_comp_barcode,\
    barcode_max_N=0,seq_max_N=0):
    """Parses Illumina paired-end read file pair

    Generator that walks both inputs in lockstep (via ``izip``, so iteration
    stops at the end of the shorter input) and yields one merged record per
    pair of lines that passes every filter.

    Parameters:
     read1_file, read2_file: iterables of lines, one read per line.
     barcode_length, rev_comp_barcode: forwarded to parse_illumina_line.
     max_bad_run_length, quality_threshold: forwarded to
      read_qual_score_filter for each read.
     min_per_read_length: a pair is skipped if either filtered sequence is
      shorter than this.
     barcode_max_N: a pair is skipped if either barcode contains more than
      this many 'N' characters.
     seq_max_N: a pair is skipped if the joined sequence contains more than
      this many 'N' characters.

    Yields:
     (read1_desc, read1_barcode, seq, qual) where seq is the filtered read 1
     sequence followed by revComp() of the filtered read 2 sequence, and
     qual is the filtered read 1 quality string followed by the reversed
     read 2 quality string.

    Raises:
     IlluminaParseError: the descriptions of two corresponding lines differ.
     AssertionError: descriptions match but the barcodes differ.

    NOTE: the barcode 'N' filter runs before the description check, so a
    mismatched pair with too many barcode Ns is skipped rather than reported.
    """

    for read1_line, read2_line in izip(read1_file, read2_file):
        read1 = parse_illumina_line(read1_line, barcode_length,
                                    rev_comp_barcode)
        read2 = parse_illumina_line(read2_line, barcode_length,
                                    rev_comp_barcode)

        read1_desc = illumina_read_description_from_read_data(read1)
        read2_desc = illumina_read_description_from_read_data(read2)

        read1_barcode = read1['Barcode']
        read2_barcode = read2['Barcode']
        if (read1_barcode.count('N') > barcode_max_N) or \
           (read2_barcode.count('N') > barcode_max_N):
            continue

        if read1_desc != read2_desc:
            # Call-style raise is valid on Python 2 and Python 3; the old
            # ``raise Cls, msg`` statement form is a SyntaxError on Python 3.
            raise IlluminaParseError(
                "Error in sequence files, descriptions of"
                " corresponding lines are not compatible: %s != %s"
                % (read1_desc, read2_desc))
        assert read1_barcode == read2_barcode

        seq1, qual1 = read_qual_score_filter(
            read1['Sequence'], read1['Quality Score'],
            max_bad_run_length, quality_threshold)
        if len(seq1) < min_per_read_length:
            continue

        seq2, qual2 = read_qual_score_filter(
            read2['Sequence'], read2['Quality Score'],
            max_bad_run_length, quality_threshold)
        if len(seq2) < min_per_read_length:
            continue

        seq = seq1 + revComp(seq2)
        # If the total number of Ns is more than the max
        # allowed ignore this sequence
        if seq.count('N') > seq_max_N:
            continue
        # Read 2 qualities are reversed so they line up with the
        # revComp()'d read 2 sequence appended above.
        qual = qual1 + qual2[::-1]

        yield read1_desc, read1_barcode, seq, qual
Example #4
0
def parse_illumina_single_end_read_file(read_file, barcode_length,
                                        max_bad_run_length, quality_threshold,
                                        min_per_read_length, rev_comp,
                                        rev_comp_barcode, barcode_in_seq,
                                        barcode_max_N=0, seq_max_N=0):
    """Yield filtered records from an Illumina single-end read file.

    Each line of read_file is parsed with parse_illumina_line and its
    sequence/quality pair is passed through read_qual_score_filter. A line
    is skipped when its barcode has more than barcode_max_N 'N' characters,
    when the filtered sequence is shorter than min_per_read_length, or when
    the filtered sequence has more than seq_max_N 'N' characters.

    Yields (description, barcode, sequence, quality) tuples. When rev_comp
    is true the sequence is passed through revComp() and the quality string
    is reversed before being yielded.
    """
    for line in read_file:
        record = parse_illumina_line(line, barcode_length, rev_comp_barcode,
                                     barcode_in_seq)
        description = illumina_read_description_from_read_data(record)
        barcode = record['Barcode']

        # Too many ambiguous bases in the barcode: drop the read.
        if barcode.count('N') > barcode_max_N:
            continue

        filtered_seq, filtered_qual = read_qual_score_filter(
            record['Sequence'], record['Quality Score'],
            max_bad_run_length, quality_threshold)

        # Length is checked first; the N count is only consulted for reads
        # that are long enough.
        if len(filtered_seq) < min_per_read_length:
            continue
        if filtered_seq.count('N') > seq_max_N:
            continue

        if rev_comp:
            filtered_seq, filtered_qual = (revComp(filtered_seq),
                                           filtered_qual[::-1])

        yield description, barcode, filtered_seq, filtered_qual
def parse_illumina_single_end_read_file(read_file, barcode_length,
                                        max_bad_run_length, quality_threshold,
                                        min_per_read_length, rev_comp,
                                        rev_comp_barcode, barcode_in_seq,
                                        barcode_max_N=0, seq_max_N=0):
    """Yield filtered records from an Illumina single-end read file.

    Each line of read_file is parsed with parse_illumina_line and its
    sequence/quality pair is passed through read_qual_score_filter. A line
    is skipped when its barcode has more than barcode_max_N 'N' characters,
    when the filtered sequence is shorter than min_per_read_length, or when
    the filtered sequence has more than seq_max_N 'N' characters.

    Yields (description, barcode, sequence, quality) tuples. When rev_comp
    is true the sequence is passed through DNA.rc() and the quality string
    is reversed before being yielded.
    """
    for line in read_file:
        record = parse_illumina_line(line, barcode_length,
                                     rev_comp_barcode, barcode_in_seq)
        description = illumina_read_description_from_read_data(record)
        barcode = record['Barcode']

        # Too many ambiguous bases in the barcode: drop the read.
        if barcode.count('N') > barcode_max_N:
            continue

        filtered_seq, filtered_qual = read_qual_score_filter(
            record['Sequence'], record['Quality Score'],
            max_bad_run_length, quality_threshold)

        # Length is checked first; the N count is only consulted for reads
        # that are long enough.
        if len(filtered_seq) < min_per_read_length:
            continue
        if filtered_seq.count('N') > seq_max_N:
            continue

        if rev_comp:
            filtered_seq, filtered_qual = (DNA.rc(filtered_seq),
                                           filtered_qual[::-1])

        yield description, barcode, filtered_seq, filtered_qual
def parse_illumina_paired_end_read_files(read1_file,read2_file,barcode_length,\
    max_bad_run_length,quality_threshold,min_per_read_length,rev_comp_barcode,\
    barcode_max_N=0,seq_max_N=0):
    """Parses Illumina paired-end read file pair

    Generator that walks both inputs in lockstep (via ``izip``, so iteration
    stops at the end of the shorter input) and yields one merged record per
    pair of lines that passes every filter.

    Parameters:
     read1_file, read2_file: iterables of lines, one read per line.
     barcode_length, rev_comp_barcode: forwarded to parse_illumina_line.
     max_bad_run_length, quality_threshold: forwarded to
      read_qual_score_filter for each read.
     min_per_read_length: a pair is skipped if either filtered sequence is
      shorter than this.
     barcode_max_N: a pair is skipped if either barcode contains more than
      this many 'N' characters.
     seq_max_N: a pair is skipped if the joined sequence contains more than
      this many 'N' characters.

    Yields:
     (read1_desc, read1_barcode, seq, qual) where seq is the filtered read 1
     sequence followed by DNA.rc() of the filtered read 2 sequence, and
     qual is the filtered read 1 quality string followed by the reversed
     read 2 quality string.

    Raises:
     IlluminaParseError: the descriptions of two corresponding lines differ.
     AssertionError: descriptions match but the barcodes differ.

    NOTE: the barcode 'N' filter runs before the description check, so a
    mismatched pair with too many barcode Ns is skipped rather than reported.
    """

    for read1_line, read2_line in izip(read1_file,read2_file):
        read1 = parse_illumina_line(read1_line,barcode_length,rev_comp_barcode)
        read2 = parse_illumina_line(read2_line,barcode_length,rev_comp_barcode)

        read1_desc = illumina_read_description_from_read_data(read1)
        read2_desc = illumina_read_description_from_read_data(read2)

        read1_barcode = read1['Barcode']
        read2_barcode = read2['Barcode']
        if (read1_barcode.count('N') > barcode_max_N) or \
           (read2_barcode.count('N') > barcode_max_N):
            continue

        if read1_desc != read2_desc:
            # Call-style raise is valid on Python 2 and Python 3; the old
            # ``raise Cls, msg`` statement form is a SyntaxError on Python 3.
            raise IlluminaParseError(
                "Error in sequence files, descriptions of"
                " corresponding lines are not compatible: %s != %s"
                % (read1_desc, read2_desc))
        assert read1_barcode == read2_barcode

        seq1, qual1 = read_qual_score_filter(
            read1['Sequence'], read1['Quality Score'],
            max_bad_run_length, quality_threshold)
        if len(seq1) < min_per_read_length:
            continue

        seq2, qual2 = read_qual_score_filter(
            read2['Sequence'], read2['Quality Score'],
            max_bad_run_length, quality_threshold)
        if len(seq2) < min_per_read_length:
            continue

        seq = seq1 + DNA.rc(seq2)
        # If the total number of Ns is more than the max
        # allowed ignore this sequence
        if seq.count('N') > seq_max_N:
            continue
        # Read 2 qualities are reversed so they line up with the
        # DNA.rc()'d read 2 sequence appended above.
        qual = qual1 + qual2[::-1]

        yield read1_desc, read1_barcode, seq, qual
Example #7
0
 def test_parse_illumina_line_barcode_in_sequence(self):
     """parse_illumina_line: barcode_in_sequence=True gives expected record

     Parses the first line of ``illumina_read3`` with a 12-base barcode and
     no reverse-complementing, then compares the whole returned dict.
     """
     observed = parse_illumina_line(illumina_read3[0],
                                    barcode_length=12,
                                    rev_comp_barcode=False,
                                    barcode_in_sequence=True)

     # Long fields are built separately to keep the dict literal readable.
     expected_sequence = (
      'TACGNAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATT'
      'GTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTGTCTTGAGT'
      'ACAGTAGAGGCAGGCGGAATTCGTGGGG')
     expected_quality = (
      'fffcGddd\_``_gggggggggggfgggggegggggcgggggggggggeeffafdcdfgbdggdbe]fbf'
      'dddddbdadadcddaf`abb`cVNRNUScaa``aOY]]]_[_BBBBBBBBBBB'
      'BBBBBBBBBBBBBBBBBBBBBBBBBBBBB')

     expected = {
      'Full description': 'HWI-EAS440_0386:1:23:19516:1031#0/1',
      'Machine Name': 'HWI-EAS440_0386',
      'Channel Number': 1,
      'Tile Number': 23,
      'X Position': 19516,
      'Y Position': 1031,
      'Barcode': 'ACAGCTAGCTTG',
      'Full Y Position Field': '1031#0/1',
      'Sequence': expected_sequence,
      'Quality Score': expected_quality,
     }
     self.assertEqual(observed, expected)