def test_parse_illumina_line(self):
    """parse_illumina_line: functions with several lines

    Parses the first two entries of illumina_read1 with a barcode length
    of 6 and checks the full result dict, once with the barcode
    reverse-complemented and once with it left as read.
    """
    illumina_line0 = illumina_read1[0]
    illumina_line1 = illumina_read1[1]

    # First line, barcode reverse-complemented.
    actual = parse_illumina_line(illumina_line0, barcode_length=6,
                                 rev_comp_barcode=True)
    expected = {
        'Full description': 'HWI-6X_9267:1:1:4:1699#ACCACCC/1',
        'Machine Name': 'HWI-6X_9267',
        'Channel Number': 1,
        'Tile Number': 1,
        'X Position': 4,
        'Y Position': 1699,
        'Barcode': 'GGTGGT',
        'Full Y Position Field': '1699#ACCACCC/1',
        'Sequence':
            'TACGGAGGGTGCGAGCGTTAATCGCCCCCCCCCCCCCCCCCCCCCCCCCCCC'+
            'CCCCCCCCCCCCCCCCCCCCCCCGAAAAAAAAAAAAAAAAAAAAAAA',
        'Quality Score':
            'abbbbbbbbbb`_`bbbbbb`bb^aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'+
            'aaaaaaaaaaaaaDaabbBBBBBBBBBBBBBBBBBBB'}
    self.assertEqual(actual, expected)

    # Same line, barcode left as read; only the 'Barcode' entry changes.
    actual = parse_illumina_line(illumina_line0, barcode_length=6,
                                 rev_comp_barcode=False)
    expected['Barcode'] = 'ACCACC'
    # This comparison was missing, so the case above was never verified.
    self.assertEqual(actual, expected)

    # Second line, barcode reverse-complemented.
    actual = parse_illumina_line(illumina_line1, barcode_length=6,
                                 rev_comp_barcode=True)
    expected = {
        'Full description': 'HWI-6X_9267:1:1:4:390#ACCTCCC/1',
        'Machine Name': 'HWI-6X_9267',
        'Channel Number': 1,
        'Tile Number': 1,
        'X Position': 4,
        'Y Position': 390,
        'Barcode': 'GGAGGT',
        'Full Y Position Field': '390#ACCTCCC/1',
        'Sequence':
            'GACAGGAGGAGCAAGTGTTATTCAAATTATGCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGG'+
            'GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAA',
        'Quality Score':
            'aaaaaaaaaa```aa\^_aa``aVaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'+
            'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaBaaaaa'}
    self.assertEqual(actual, expected)

    # Same line, barcode left as read.
    actual = parse_illumina_line(illumina_line1, barcode_length=6,
                                 rev_comp_barcode=False)
    expected['Barcode'] = 'ACCTCC'
    # This comparison was missing as well.
    self.assertEqual(actual, expected)
def test_parse_illumina_line_barcode_in_header(self):
    """parse_illumina_line: handles barcode in header correctly

    The barcode follows '#' in the description field of each line; with
    barcode_length=6 the expected 'Barcode' value is six bases long,
    either reverse-complemented or as read depending on rev_comp_barcode.
    """
    illumina_line0 = illumina_read1[0]
    illumina_line1 = illumina_read1[1]

    # First line, barcode reverse-complemented.
    actual = parse_illumina_line(
        illumina_line0, barcode_length=6, rev_comp_barcode=True)
    expected = {
        'Full description': 'HWI-6X_9267:1:1:4:1699#ACCACCC/1',
        'Machine Name': 'HWI-6X_9267',
        'Channel Number': 1,
        'Tile Number': 1,
        'X Position': 4,
        'Y Position': 1699,
        'Barcode': 'GGTGGT',
        'Full Y Position Field': '1699#ACCACCC/1',
        'Sequence':
            'TACGGAGGGTGCGAGCGTTAATCGCCCCCCCCCCCCCCCCCCCCCCCCCCCC'+
            'CCCCCCCCCCCCCCCCCCCCCCCGAAAAAAAAAAAAAAAAAAAAAAA',
        'Quality Score':
            'abbbbbbbbbb`_`bbbbbb`bb^aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'+
            'aaaaaaaaaaaaaDaabbBBBBBBBBBBBBBBBBBBB'}
    self.assertEqual(actual, expected)

    # Same line without reverse-complementing the barcode.
    actual = parse_illumina_line(
        illumina_line0, barcode_length=6, rev_comp_barcode=False)
    expected['Barcode'] = 'ACCACC'
    # Previously this result was computed but never compared.
    self.assertEqual(actual, expected)

    # Second line, barcode reverse-complemented.
    actual = parse_illumina_line(
        illumina_line1, barcode_length=6, rev_comp_barcode=True)
    expected = {
        'Full description': 'HWI-6X_9267:1:1:4:390#ACCTCCC/1',
        'Machine Name': 'HWI-6X_9267',
        'Channel Number': 1,
        'Tile Number': 1,
        'X Position': 4,
        'Y Position': 390,
        'Barcode': 'GGAGGT',
        'Full Y Position Field': '390#ACCTCCC/1',
        'Sequence':
            'GACAGGAGGAGCAAGTGTTATTCAAATTATGCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGG'+
            'GGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAA',
        'Quality Score':
            'aaaaaaaaaa```aa\^_aa``aVaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'+
            'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaBaaaaa'}
    self.assertEqual(actual, expected)

    # Same line without reverse-complementing the barcode.
    actual = parse_illumina_line(
        illumina_line1, barcode_length=6, rev_comp_barcode=False)
    expected['Barcode'] = 'ACCTCC'
    # Previously this result was computed but never compared.
    self.assertEqual(actual, expected)
def parse_illumina_paired_end_read_files(read1_file, read2_file, barcode_length,
                                         max_bad_run_length, quality_threshold,
                                         min_per_read_length, rev_comp_barcode,
                                         barcode_max_N=0, seq_max_N=0):
    """Parses Illumina paired-end read file pair

    Iterates both files in lockstep and yields one
    (description, barcode, sequence, quality) tuple per accepted pair.

    read1_file, read2_file: iterables of Illumina lines; line i of one is
     paired with line i of the other
    barcode_length, rev_comp_barcode: passed through to parse_illumina_line
    max_bad_run_length, quality_threshold: passed through to
     read_qual_score_filter
    min_per_read_length: pairs are skipped when either filtered read is
     shorter than this
    barcode_max_N: pairs are skipped when either barcode has more Ns
    seq_max_N: pairs are skipped when the joined sequence has more Ns

    Raises IlluminaParseError when the descriptions of two paired lines
    differ (checked only after the barcode N filter has passed).
    """
    for read1_line, read2_line in izip(read1_file, read2_file):
        read1 = parse_illumina_line(read1_line, barcode_length,
                                    rev_comp_barcode)
        read2 = parse_illumina_line(read2_line, barcode_length,
                                    rev_comp_barcode)

        read1_desc = illumina_read_description_from_read_data(read1)
        read2_desc = illumina_read_description_from_read_data(read2)

        read1_barcode = read1['Barcode']
        read2_barcode = read2['Barcode']

        # Drop the pair if either barcode has too many ambiguous bases.
        if (read1_barcode.count('N') > barcode_max_N) or \
           (read2_barcode.count('N') > barcode_max_N):
            continue

        if read1_desc != read2_desc:
            # Call form of raise: valid on Python 2 and 3, whereas the
            # "raise Class, message" form is a SyntaxError on Python 3.
            raise IlluminaParseError(
                "Error in sequence files, descriptions of"+
                " corresponding lines are not compatible: %s != %s" %
                (read1_desc, read2_desc))

        # Sanity check: both reads of a pair are expected to carry the
        # same barcode.
        assert read1_barcode == read2_barcode

        seq1, qual1 = read_qual_score_filter(
            read1['Sequence'], read1['Quality Score'],
            max_bad_run_length, quality_threshold)
        if len(seq1) < min_per_read_length:
            continue

        seq2, qual2 = read_qual_score_filter(
            read2['Sequence'], read2['Quality Score'],
            max_bad_run_length, quality_threshold)
        if len(seq2) < min_per_read_length:
            continue

        # Read 2 is reverse-complemented before being appended to read 1.
        seq = seq1 + revComp(seq2)

        # If the total number of Ns is more than the max
        # allowed ignore this sequence
        if seq.count('N') > seq_max_N:
            continue

        # Reverse read 2's qualities so they line up with its reversed bases.
        qual = qual1 + qual2[::-1]
        yield read1_desc, read1_barcode, seq, qual
def parse_illumina_single_end_read_file(read_file, barcode_length,
                                        max_bad_run_length, quality_threshold,
                                        min_per_read_length, rev_comp,
                                        rev_comp_barcode, barcode_in_seq,
                                        barcode_max_N=0, seq_max_N=0):
    """Parses Illumina single-end read file

    Yields one (description, barcode, sequence, quality) tuple for every
    line of read_file that survives filtering.

    barcode_length, rev_comp_barcode, barcode_in_seq: passed through to
     parse_illumina_line
    max_bad_run_length, quality_threshold: passed through to
     read_qual_score_filter
    min_per_read_length: reads whose filtered sequence is shorter are skipped
    rev_comp: when true, the sequence goes through revComp and the quality
     string is reversed before being yielded
    barcode_max_N: reads whose barcode has more Ns are skipped
    seq_max_N: reads whose filtered sequence has more Ns are skipped
    """
    for line in read_file:
        record = parse_illumina_line(line, barcode_length,
                                     rev_comp_barcode, barcode_in_seq)
        description = illumina_read_description_from_read_data(record)
        barcode = record['Barcode']

        # Too many ambiguous bases in the barcode: drop the read.
        if barcode.count('N') > barcode_max_N:
            continue

        bases, scores = read_qual_score_filter(
            record['Sequence'], record['Quality Score'],
            max_bad_run_length, quality_threshold)

        # Length is checked first; the N count is only looked at for
        # reads that are long enough.
        if len(bases) < min_per_read_length:
            continue
        if bases.count('N') > seq_max_N:
            continue

        if rev_comp:
            bases, scores = revComp(bases), scores[::-1]

        yield description, barcode, bases, scores
def parse_illumina_single_end_read_file(read_file, barcode_length,
                                        max_bad_run_length, quality_threshold,
                                        min_per_read_length, rev_comp,
                                        rev_comp_barcode, barcode_in_seq,
                                        barcode_max_N=0, seq_max_N=0):
    """Parses Illumina single-end read file

    Generator over read_file: each line is parsed, filtered, and, if it
    passes, yielded as (description, barcode, sequence, quality).

    barcode_length, rev_comp_barcode, barcode_in_seq: forwarded to
     parse_illumina_line
    max_bad_run_length, quality_threshold: forwarded to
     read_qual_score_filter
    min_per_read_length: minimum length of the filtered sequence
    rev_comp: when true, yield DNA.rc(sequence) and the reversed quality
     string instead of the filtered ones
    barcode_max_N: maximum number of Ns tolerated in the barcode
    seq_max_N: maximum number of Ns tolerated in the filtered sequence
    """
    for line in read_file:
        parsed = parse_illumina_line(line, barcode_length,
                                     rev_comp_barcode, barcode_in_seq)
        desc = illumina_read_description_from_read_data(parsed)
        barcode = parsed['Barcode']

        # Skip reads whose barcode is too ambiguous.
        if barcode.count('N') > barcode_max_N:
            continue

        filtered_seq, filtered_qual = read_qual_score_filter(
            parsed['Sequence'], parsed['Quality Score'],
            max_bad_run_length, quality_threshold)

        # Skip reads that are too short, then reads with too many Ns.
        if len(filtered_seq) < min_per_read_length:
            continue
        if filtered_seq.count('N') > seq_max_N:
            continue

        if rev_comp:
            filtered_seq, filtered_qual = (DNA.rc(filtered_seq),
                                           filtered_qual[::-1])

        yield desc, barcode, filtered_seq, filtered_qual
def parse_illumina_paired_end_read_files(read1_file, read2_file, barcode_length,
                                         max_bad_run_length, quality_threshold,
                                         min_per_read_length, rev_comp_barcode,
                                         barcode_max_N=0, seq_max_N=0):
    """Parses Illumina paired-end read file pair

    Walks read1_file and read2_file in lockstep and yields one
    (description, barcode, sequence, quality) tuple per accepted pair. The
    yielded sequence is read 1 followed by DNA.rc(read 2); the yielded
    quality is read 1's quality followed by read 2's quality reversed.

    barcode_length, rev_comp_barcode: passed through to parse_illumina_line
    max_bad_run_length, quality_threshold: passed through to
     read_qual_score_filter
    min_per_read_length: pairs are skipped when either filtered read is
     shorter than this
    barcode_max_N: pairs are skipped when either barcode has more Ns
    seq_max_N: pairs are skipped when the joined sequence has more Ns

    Raises IlluminaParseError when the descriptions of two paired lines
    differ (checked only after the barcode N filter has passed).
    """
    for read1_line, read2_line in izip(read1_file, read2_file):
        read1 = parse_illumina_line(read1_line, barcode_length,
                                    rev_comp_barcode)
        read2 = parse_illumina_line(read2_line, barcode_length,
                                    rev_comp_barcode)

        read1_desc = illumina_read_description_from_read_data(read1)
        read2_desc = illumina_read_description_from_read_data(read2)

        read1_barcode = read1['Barcode']
        read2_barcode = read2['Barcode']

        # Drop the pair if either barcode has too many ambiguous bases.
        if (read1_barcode.count('N') > barcode_max_N) or \
           (read2_barcode.count('N') > barcode_max_N):
            continue

        if read1_desc != read2_desc:
            # Call form of raise: valid on Python 2 and 3, whereas the
            # "raise Class, message" form is a SyntaxError on Python 3.
            raise IlluminaParseError(
                "Error in sequence files, descriptions of"+
                " corresponding lines are not compatible: %s != %s" %
                (read1_desc, read2_desc))

        # Sanity check: both reads of a pair are expected to carry the
        # same barcode.
        assert read1_barcode == read2_barcode

        seq1, qual1 = read_qual_score_filter(
            read1['Sequence'], read1['Quality Score'],
            max_bad_run_length, quality_threshold)
        if len(seq1) < min_per_read_length:
            continue

        seq2, qual2 = read_qual_score_filter(
            read2['Sequence'], read2['Quality Score'],
            max_bad_run_length, quality_threshold)
        if len(seq2) < min_per_read_length:
            continue

        # Read 2 is reverse-complemented before being appended to read 1.
        seq = seq1 + DNA.rc(seq2)

        # If the total number of Ns is more than the max
        # allowed ignore this sequence
        if seq.count('N') > seq_max_N:
            continue

        # Reverse read 2's qualities so they line up with its reversed bases.
        qual = qual1 + qual2[::-1]
        yield read1_desc, read1_barcode, seq, qual
def test_parse_illumina_line_barcode_in_sequence(self):
    """parse_illumina_line: handles barcode in sequence correctly

    Parses the first entry of illumina_read3 with barcode_in_sequence=True
    and a 12-base barcode that is not reverse-complemented, then compares
    the whole result dict.
    """
    # The long expected fields are assembled first to keep the dict readable.
    expected_sequence = (
        'TACGNAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATT'
        'GTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGGTGTCTTGAGT'
        'ACAGTAGAGGCAGGCGGAATTCGTGGGG')
    expected_quality = (
        'fffcGddd\_``_gggggggggggfgggggegggggcgggggggggggeeffafdcdfgbdggdbe]fbf'
        'dddddbdadadcddaf`abb`cVNRNUScaa``aOY]]]_[_BBBBBBBBBBB'
        'BBBBBBBBBBBBBBBBBBBBBBBBBBBBB')

    # The header's barcode slot holds '#0'; the expected barcode is the
    # 12-base string below.
    expected = {
        'Barcode': 'ACAGCTAGCTTG',
        'Sequence': expected_sequence,
        'Quality Score': expected_quality,
        'Full description': 'HWI-EAS440_0386:1:23:19516:1031#0/1',
        'Full Y Position Field': '1031#0/1',
        'Machine Name': 'HWI-EAS440_0386',
        'Channel Number': 1,
        'Tile Number': 23,
        'X Position': 19516,
        'Y Position': 1031,
    }

    first_line = illumina_read3[0]
    actual = parse_illumina_line(first_line,
                                 barcode_length=12,
                                 rev_comp_barcode=False,
                                 barcode_in_sequence=True)
    self.assertEqual(actual, expected)