def test_homopolymer_even_odd(self):
    """Indels inside a homopolymer run are expected to move further along the run.

    Three reads are standardized: a 2-base deletion next to a run of ``A``
    and two 2-base insertions next to a run of ``T`` (the last one preceded
    by a soft clip). In every expected CIGAR the leading match grows and the
    trailing match shrinks by the same amount.
    """
    # case 1: deletion, reference contains a 15-base run of A
    deletion_ref = 'ATCGAGAT' + 'A' * 15 + 'TCGAGAT'
    deletion_read = MockRead(
        'name',
        1,
        1,
        query_sequence='ATCGAGATA' + 'A' * 12 + 'TCGAGAT',
        cigar=[(CIGAR.EQ, 8), (CIGAR.D, 2), (CIGAR.EQ, 20)],
    )
    expected_deletion = [(CIGAR.EQ, 9 + 12), (CIGAR.D, 2), (CIGAR.EQ, 7)]
    self.assertEqual(expected_deletion, hgvs_standardize_cigar(deletion_read, deletion_ref))

    # case 2: insertion in a run of T, no clipping
    insertion_ref = (
        'CCCCGGCTCATGTCTGGTTTTGTTTTCCGGGGGCGGGGGGGCTCCCTGGGGATGATGGTGATTTTTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAA'
        'TGAGGCAGAGACAATGTGGGGAGCGAGAGAGGGGAAAAGGACGGGGGAGG'
    )
    insertion_query = (
        'CCCCGGCTCATGTCTGGTTTTGTTTTCCGGGGGCGGGGGGGCTCCCTGGGGATGATGGTGATTTTTTTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAA'
        'TGAGGCAGAGACAATGTGGGGAGCGAGAGAGGGGAAAAGGACGGGGGAGG'
    )
    insertion_read = MockRead(
        'name',
        '1',
        0,
        149,
        query_sequence=insertion_query,
        cigar=[(CIGAR.EQ, 61), (CIGAR.I, 2), (CIGAR.EQ, 87)],
    )
    expected_insertion = [(CIGAR.EQ, 61 + 15), (CIGAR.I, 2), (CIGAR.EQ, 87 - 15)]
    self.assertEqual(expected_insertion, hgvs_standardize_cigar(insertion_read, insertion_ref))

    # case 3: insertion in a run of T, read starts with a 2-base soft clip
    clipped_ref = (
        'CCTCCTCGGTCGGGCAGATCTTTCAGAAGCAGGAGCCCAGGATCATGTCTGGTTTTGTTTTCCGAGGGCGAGGGGGCTCCCTGAGGATGATGGTGATTT'
        'TTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAATGAGGCAGAGACA'
    )
    clipped_query = (
        'CCCCTCCTCGGTCGGGCAGATCTTTCAGAAGCAGGAGCCCAGGATCATGTCTGGTTTTGTTTTCCGAGGGCGAGGGGGCTCCCTGAGGATGATGGTGATTTT'
        'TTTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAATGAGGCAGAGACA'
    )
    clipped_read = MockRead(
        'name',
        '1',
        0,
        149,
        query_sequence=clipped_query,
        cigar=[(CIGAR.S, 2), (CIGAR.EQ, 96), (CIGAR.I, 2), (CIGAR.EQ, 50)],
    )
    expected_clipped = [(CIGAR.S, 2), (CIGAR.EQ, 96 + 15), (CIGAR.I, 2), (CIGAR.EQ, 50 - 15)]
    self.assertEqual(expected_clipped, hgvs_standardize_cigar(clipped_read, clipped_ref))
def test_bubble_sort_indel_sections_drop_mismatch_with_hardclipping(self):
    """Alternating D/I events after a mismatch are expected to merge into one I and one D.

    The input CIGAR is ``10H 4= 1X 2D 3I 2D 2I 6=`` and the expected result
    is ``10H 4= 6I 5D 6=``: the hard clip is kept and the mismatched base is
    counted in both the merged insertion and the merged deletion.
    """
    reference = (
        'ATAGGC'
        'ATCT'
        'ACGA'
        'ACGA'
        'ACGA'
        'GATCGCTACG'
    )
    # original
    # ATAGGCATCTACG AA CGAACGAGATCGCTACG
    # ATCTC TTT TTCGAACG
    # expected
    # ATAGGCATCT ACGAACGAACGAGATCGCTACG
    # ATCTCTTTTT CGAACG
    input_cigar = [
        (CIGAR.H, 10),
        (CIGAR.EQ, 4),
        (CIGAR.X, 1),
        (CIGAR.D, 2),
        (CIGAR.I, 3),
        (CIGAR.D, 2),
        (CIGAR.I, 2),
        (CIGAR.EQ, 6),
    ]
    read = MockRead(
        'name',
        1,
        6,
        reference_name='1',
        query_sequence='ATCTCTTTTTCGAACG',
        cigar=input_cigar,
    )

    # diagnostic output (shown by the test runner when the assertion fails)
    deleted = SamRead.deletion_sequences(read, {'1': MockObject(seq=reference)})
    print(deleted)
    inserted = SamRead.insertion_sequences(read)
    print(inserted)
    print(read.query_sequence, len(read.query_sequence))

    expected = [(CIGAR.H, 10), (CIGAR.EQ, 4), (CIGAR.I, 6), (CIGAR.D, 5), (CIGAR.EQ, 6)]
    self.assertEqual(expected, hgvs_standardize_cigar(read, reference))
def test_complex(self):
    """A 26-base insertion is expected to shift 3 bases along while later events stay put.

    Only the match blocks on either side of the 26I change (49= -> 52= and
    18= -> 15=); the mismatches and single-base insertions are expected to
    be returned as given.
    """
    qseq = (
        'TATTTGGAAATATTTGTAAGATAGATGTCTCTG'
        'C'
        'CTCCTTCTGTTTCTGTCTCTGTCTCTTGCACTCTCTCTCTCCCTCTCTT'
        'TCTCTCTCTCTCTCTCTCTCTCTCTC'
        'TCTATATATATATATATA'
        'T'
        'A'
        'T'
        'C'
        'T'
        'ACACACACACACACACAC'
    )
    rseq = (
        'TATTTGGAAATATTTGTAAGATAGATGTCTCTG'
        'T'
        'CTCCTTCTGTTTCTGTCTCTGTCTCTTGCACTCTCTCTCTCCCTCTCTT'
        'TCTATATATATATATATA'
        'C'
        'A'
        'C'
        'ACACACACACACACACAC'
    )
    input_cigar = [
        (CIGAR.EQ, 33),
        (CIGAR.X, 1),
        (CIGAR.EQ, 49),
        (CIGAR.I, 26),
        (CIGAR.EQ, 18),
        (CIGAR.X, 1),
        (CIGAR.EQ, 1),
        (CIGAR.I, 1),
        (CIGAR.EQ, 1),
        (CIGAR.I, 1),
        (CIGAR.EQ, 18),
    ]
    read = MockRead(
        'name',
        reference_name='mock',
        reference_start=0,
        query_sequence=qseq,
        cigar=input_cigar,
    )

    # diagnostic output: reference, then the query with the insertions cut out
    print(rseq)
    print(
        read.query_sequence[:83],
        read.query_sequence[83 + 26: 83 + 26 + 20],
        read.query_sequence[83 + 26 + 22:],
    )
    print(read.query_sequence)
    print(SamRead.insertion_sequences(read))

    new_cigar = [
        (CIGAR.EQ, 33),
        (CIGAR.X, 1),
        (CIGAR.EQ, 52),
        (CIGAR.I, 26),
        (CIGAR.EQ, 15),
        (CIGAR.X, 1),
        (CIGAR.EQ, 1),
        (CIGAR.I, 1),
        (CIGAR.EQ, 1),
        (CIGAR.I, 1),
        (CIGAR.EQ, 18),
    ]
    std_cigar = hgvs_standardize_cigar(read, rseq)
    print(new_cigar)
    print(std_cigar)
    self.assertEqual(new_cigar, std_cigar)
def test_deletion_repeat(self):
    """A 27-base deletion is expected to shift 30 bases along the alignment.

    The reference sequence is looked up in ``REFERENCE_GENOME`` by the read's
    reference name. Only the two match blocks around the deletion change
    (22= -> 52= and 74= -> 44=).
    """
    qseq = (
        'GAGT'
        'GAGACTCTGT'
        'GAA'
        'AAAGAAAAAAAAAA'
        'A'
        'ATATATATATATATAAATATA'
        'C'
        'ATATTATGTATCAAATATATAT'
        'TATGTGTAATATACATCATGTATCAAATATATATTATGTATAATATACATCATATATCAAATATATATTATGTG'
    )
    # deleted reference: TATGTGTAATATACATCATGTATCAAA
    print(qseq[:76], qseq[76:])

    # everything before the deletion is shared by the input and expected CIGARs
    input_cigar = [
        (CIGAR.S, 4),
        (CIGAR.EQ, 10),
        (CIGAR.X, 3),
        (CIGAR.EQ, 14),
        (CIGAR.X, 1),
        (CIGAR.EQ, 21),
        (CIGAR.X, 1),
        (CIGAR.EQ, 22),
        (CIGAR.D, 27),
        (CIGAR.EQ, 74),
    ]
    read = MockRead(
        'name',
        reference_name='11_86018001-86018500',
        reference_start=28,
        cigar=input_cigar,
        query_sequence=qseq,
    )
    expected_cigar = [
        (CIGAR.S, 4),
        (CIGAR.EQ, 10),
        (CIGAR.X, 3),
        (CIGAR.EQ, 14),
        (CIGAR.X, 1),
        (CIGAR.EQ, 21),
        (CIGAR.X, 1),
        (CIGAR.EQ, 22 + 30),
        (CIGAR.D, 27),
        (CIGAR.EQ, 74 - 30),
    ]
    reference_seq = REFERENCE_GENOME[read.reference_name].seq
    std_cigar = hgvs_standardize_cigar(read, reference_seq)

    # diagnostic output: deleted sequence before and after standardization
    print(SamRead.deletion_sequences(read, REFERENCE_GENOME))
    read.cigar = std_cigar
    print(SamRead.deletion_sequences(read, REFERENCE_GENOME))

    self.assertEqual(expected_cigar, std_cigar)
def no_change_aligned(self):
    """A CIGAR made of a single 10-base ``M`` block is expected to come back unchanged.

    NOTE(review): the method name has no ``test_`` prefix, so default
    unittest/pytest discovery will presumably not run it -- confirm whether
    that is intentional.
    """
    reference = 'AAATTTGGGCCCAATT'
    aligned_only = [(CIGAR.M, 10)]
    read = MockRead('name', '1', 1, cigar=[(CIGAR.M, 10)], query_sequence='AAATTTGGGC')
    observed = hgvs_standardize_cigar(read, reference)
    self.assertEqual(aligned_only, observed)
def test_deletion_partial_repeat(self):
    """A 6-base deletion is expected to shift two bases along: ``13=6D15=`` -> ``15=6D13=``."""
    flank_left = 'ATCTTAGCCAGGT'
    flank_right = 'AGTTACATACATATC'
    qseq = flank_left + flank_right
    # the reference carries six extra bases between the two flanks
    rseq = flank_left + 'AGCTAT' + flank_right
    read = MockRead(
        'name',
        reference_name='mock',
        reference_start=0,
        query_sequence=qseq,
        cigar=convert_string_to_cigar('13=6D15='),
    )
    expected = convert_string_to_cigar('15=6D13=')
    self.assertEqual(expected, hgvs_standardize_cigar(read, rseq))
def test_deletion_in_repeat(self):
    """Deleting one ``ACGA`` unit from three is expected to be reported after the remaining two.

    Input ``4= 4D 12=`` is expected to become ``12= 4D 4=``.
    """
    reference = (
        'ATAGGC'
        'ATCT'
        'ACGA'
        'ACGA'
        'ACGA'
        'GATCGCTACG'
    )
    query = (
        'ATCT'
        'ACGA'
        'ACGA'
        'GATC'
    )
    read = MockRead(
        'name',
        1,
        6,
        query_sequence=query,
        cigar=[(CIGAR.EQ, 4), (CIGAR.D, 4), (CIGAR.EQ, 12)],
    )
    expected = [(CIGAR.EQ, 12), (CIGAR.D, 4), (CIGAR.EQ, 4)]
    observed = hgvs_standardize_cigar(read, reference)
    self.assertEqual(expected, observed)
def test_bubble_sort_indel_sections(self):
    """Interleaved D/I blocks are expected to be merged into one I followed by one D.

    Input ``4= 2D 3I 2D 2I 8=`` is expected to become ``4= 5I 4D 8=``.
    """
    rseq = (
        'ATAGGC'
        'ATCT'
        'GG'
        'GA'
        'GCGA'
        'GATCGCTACG'
    )
    qseq = (
        'ATCT'
        'TTT'
        'TT'
        'GCGA'
        'GATC'
    )
    interleaved = [
        (CIGAR.EQ, 4),
        (CIGAR.D, 2),
        (CIGAR.I, 3),
        (CIGAR.D, 2),
        (CIGAR.I, 2),
        (CIGAR.EQ, 8),
    ]
    read = MockRead('name', 1, 6, query_sequence=qseq, cigar=interleaved)
    merged = [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 4), (CIGAR.EQ, 8)]
    self.assertEqual(merged, hgvs_standardize_cigar(read, rseq))
def no_change_proper_indel(self):
    """A CIGAR already written as ``6= 3I 3D 4=`` is expected to be returned unchanged.

    NOTE(review): the method name has no ``test_`` prefix, so default
    unittest/pytest discovery will presumably not run it -- confirm whether
    that is intentional.
    """
    reference = (
        'ATAGGC'
        'ATCTACGAG'
        'ATCGCTACG'
    )
    query = (
        'ATCTAC'
        'CCC'
        'ATCG'
    )
    read = MockRead(
        'name',
        1,
        6,
        query_sequence=query,
        cigar=[(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)],
    )
    unchanged = [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)]
    observed = hgvs_standardize_cigar(read, reference)
    self.assertEqual(unchanged, observed)
def ins_after_deletion(self):
    """A deletion followed by an insertion is expected to be reordered to insertion first.

    Input ``6= 3D 3I 4=`` is expected to become ``6= 3I 3D 4=``.

    NOTE(review): the method name has no ``test_`` prefix, so default
    unittest/pytest discovery will presumably not run it -- confirm whether
    that is intentional.
    """
    reference = (
        'ATAGGC'
        'ATCTACGAG'
        'ATCGCTACG'
    )
    query = (
        'ATCTAC'
        'CCC'
        'ATCG'
    )
    read = MockRead(
        'name',
        1,
        6,
        query_sequence=query,
        cigar=[(CIGAR.EQ, 6), (CIGAR.D, 3), (CIGAR.I, 3), (CIGAR.EQ, 4)],
    )
    insertion_first = [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)]
    observed = hgvs_standardize_cigar(read, reference)
    self.assertEqual(insertion_first, observed)
def test_unecessary_indel_end_match2(self):
    """A base shared by the end of the inserted and deleted sequences is expected to become a match.

    Both the 5-base insertion (``CCAGG``) and the 33-base deleted reference
    segment end in ``G``, so ``20=5I33D19=`` is expected to be reduced to
    ``20=4I32D20=``.
    """
    rseq = (
        'GGGTGCAGTGGCTTACACCT'
        'GTAATCCAAACACCTTGGGAGCCGCCCCCTGAG'
        'CCTCCAGGCCCGGGACAGA'
    )
    qseq = (
        'GGGTGCAGTGGCTTACACCT'
        'CCAGG'
        'CCTCCAGGCCCGGGACAGA'
    )
    read = MockRead(
        'name',
        reference_name='1',
        reference_start=0,
        cigar=convert_string_to_cigar('20=5I33D19='),
        query_sequence=qseq,
    )
    exp = convert_string_to_cigar('20=4I32D20=')
    new_cigar = hgvs_standardize_cigar(read, rseq)
    self.assertEqual(exp, new_cigar)
def test_unecessary_indel_end_match(self):
    """A deletion that is repeated at the end of the insertion is expected to collapse into matches.

    The 5 inserted characters (``kmkgh``) end with the 2 deleted reference
    characters (``gh``), so ``14=5I2D10=`` is expected to be reduced to
    ``14=3I12=``.
    """
    rseq = 'qwertyuiopasdfghjklzxcvbnm'
    qseq = 'qwertyuiopasdfkmkghjklzxcvbnm'
    read = MockRead(
        'name',
        reference_name='1',
        reference_start=0,
        cigar=convert_string_to_cigar('14=5I2D10='),
        query_sequence=qseq,
    )
    exp = convert_string_to_cigar('14=3I12=')
    new_cigar = hgvs_standardize_cigar(read, rseq)
    self.assertEqual(exp, new_cigar)
def test_unecessary_indel_end_match(self):
    """A deletion that is repeated at the end of the insertion is expected to collapse into matches.

    The 5 inserted characters (``kmkgh``) end with the 2 deleted reference
    characters (``gh``), so ``14=5I2D10=`` is expected to be reduced to
    ``14=3I12=``.
    """
    reference = 'qwertyuiopasdfghjklzxcvbnm'
    query = 'qwertyuiopasdfkmkghjklzxcvbnm'
    input_cigar = convert_string_to_cigar('14=5I2D10=')
    read = MockRead(
        'name',
        reference_name='1',
        reference_start=0,
        cigar=input_cigar,
        query_sequence=query,
    )
    expected = convert_string_to_cigar('14=3I12=')
    observed = hgvs_standardize_cigar(read, reference)
    assert observed == expected
def test_bubble_sort_indel_sections_drop_mismatch(self):
    """Interleaved D/I blocks after a mismatch are expected to merge into a single I and D.

    Input ``4= 1X 3D 2I 5D 3I 8=`` is expected to become ``4= 5I 8D 9=``.
    """
    rseq = (
        'ATAGGC'
        'ATCT'
        'A'
        'CGA'
        'AGCAT'
        'ACGA'
        'GATCGCTACG'
    )
    # ATCT CTTTT TACGA
    qseq = (
        'ATCT'
        'C'
        'TT'
        'TTT'
        'ACGA'
        'GATC'
    )
    interleaved = [
        (CIGAR.EQ, 4),
        (CIGAR.X, 1),
        (CIGAR.D, 3),
        (CIGAR.I, 2),
        (CIGAR.D, 5),
        (CIGAR.I, 3),
        (CIGAR.EQ, 8),
    ]
    read = MockRead('name', 1, 6, query_sequence=qseq, cigar=interleaved)
    merged = [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 8), (CIGAR.EQ, 9)]
    self.assertEqual(merged, hgvs_standardize_cigar(read, rseq))
def test_indel_repeat(self):
    """An insertion/deletion pair given as ``13=1I6D15=`` is expected to be returned unchanged."""
    flank_left = 'ATCTTAGCCAGGT'
    flank_right = 'AGTTACATACATATC'
    # query has a single C where the reference has AGCTAT
    qseq = flank_left + 'C' + flank_right
    rseq = flank_left + 'AGCTAT' + flank_right
    print(qseq)
    print(rseq)
    read = MockRead(
        'name',
        reference_name='mock',
        reference_start=0,
        query_sequence=qseq,
        cigar=convert_string_to_cigar('13=1I6D15='),
    )
    unchanged = convert_string_to_cigar('13=1I6D15=')
    observed = hgvs_standardize_cigar(read, rseq)
    self.assertEqual(unchanged, observed)
def test_insertion_in_repeat(self):
    """Inserting a second ``ACGA`` unit is expected to be reported after the existing one.

    Input ``4= 4I 8=`` is expected to become ``8= 4I 4=``.
    """
    reference = (
        'ATAGGC'
        'ATCT'
        'ACGA'
        'GATCGCTACG'
    )
    query = (
        'ATCT'
        'ACGA'
        'ACGA'
        'GATC'
    )
    read = MockRead(
        'name',
        1,
        6,
        query_sequence=query,
        cigar=[(CIGAR.EQ, 4), (CIGAR.I, 4), (CIGAR.EQ, 8)],
    )
    expected = [(CIGAR.EQ, 8), (CIGAR.I, 4), (CIGAR.EQ, 4)]
    observed = hgvs_standardize_cigar(read, reference)
    assert expected == observed
def test_even_insertion_in_repeat(self):
    """A 2-base insertion is expected to shift 13 bases along: ``4S13=2I66=`` -> ``4S26=2I53=``."""
    rseq = (
        'AAAGAAAAAAAAAAAAT'
        'ATATATATATATA'
        'AATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    qseq = (
        'TTTTAAAAAAAAAAAAT'
        'ATATATATATATA'
        'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    # diagnostic output: query length minus the 13 matched and 4 soft-clipped bases
    print(len(qseq) - 13 - 4)
    read = MockRead(
        'name',
        reference_name='1',
        reference_start=4,
        cigar=convert_string_to_cigar('4S13=2I66='),
        query_sequence=qseq,
    )
    exp = convert_string_to_cigar('4S26=2I53=')
    new_cigar = hgvs_standardize_cigar(read, rseq)
    read.cigar = new_cigar
    self.assertEqual(exp, new_cigar)
def test_odd_deletion_in_repeat(self):
    """A 3-base deletion is expected to shift 11 bases along: ``4S13=3D63=`` -> ``4S24=3D52=``."""
    rseq = (
        'AAAGAAAAAAAAAAAAT'
        'ATATATATATA'
        'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    qseq = (
        'TTTTAAAAAAAAAAAAT'
        'ATATATATATA'
        'ATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    print(len(qseq) - 28)
    read = MockRead(
        'name',
        reference_name='1',
        reference_start=4,
        cigar=convert_string_to_cigar('4S13=3D63='),
        query_sequence=qseq,
    )
    # only needed for the diagnostic deletion_sequences() output below
    reference_genome = {'1': MockObject(seq=rseq)}
    exp = convert_string_to_cigar('4S24=3D52=')
    new_cigar = hgvs_standardize_cigar(read, rseq)

    # deleted sequence before and after the read is given the standardized CIGAR
    print(SamRead.deletion_sequences(read, reference_genome))
    read.cigar = new_cigar
    print(SamRead.deletion_sequences(read, reference_genome))

    self.assertEqual(exp, new_cigar)
def test_shift_complex_indel(self):
    """An ``M``-style CIGAR is recomputed into ``=``/``X`` form and then standardized.

    ``44M18I88M`` is expected to become ``44=18I63=1X17=1X6=`` after
    ``recompute_cigar_mismatch`` and ``45=18I62=1X17=1X6=`` after
    ``hgvs_standardize_cigar`` (the insertion moves one base along).
    """
    refseq = 'ATATATCTATTTTTTTCTTTCTTTTTTTTACTTTCATTAAGTGCCACTAAAAAATTAGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGATTGAGTATATATATATATATACCCAGTTTCAAGCAGGTATCTGCCTTTAAAGATAAGAGACCTCCTAAATGCTTTCTTTTATTAGTTGCCCTGTTTCAGATTCAGCTTTGTATCTATATCACCTGTTAATATGTGTGGACTCACAGAAATGATCATTGAGGGAATGCACCCTGTTTGGGTGTAAGTAGCTCAGGGAAAAAATCCTAG'
    query = 'AGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGATTGAGTGTATATATATATATATATATATATATATATATACCCAGTTTCAAGCAGGTATCTGCCTTTAAAGATAAGAGACCTCCTAAGTGCTTTCTTTTATTAGTGGCCCTG'
    read = MockRead(
        'name',
        reference_name='18',
        reference_start=40237946 - 40237890,
        query_sequence=query,
        cigar=convert_string_to_cigar('44M18I88M'),
    )
    print(_read.convert_cigar_to_string(read.cigar))

    # step 1: split the M blocks into matches and mismatches
    read.cigar = recompute_cigar_mismatch(read, refseq)
    assert read.cigar == convert_string_to_cigar('44=18I63=1X17=1X6=')
    print(_read.convert_cigar_to_string(read.cigar))

    # step 2: standardize the indel position
    read.cigar = hgvs_standardize_cigar(read, refseq)
    print(_read.convert_cigar_to_string(read.cigar))
    assert read.cigar == convert_string_to_cigar('45=18I62=1X17=1X6=')
def test_even_insertion_in_repeat(self):
    """A 2-base insertion is expected to shift 13 bases along: ``4S13=2I66=`` -> ``4S26=2I53=``."""
    reference = (
        'AAAGAAAAAAAAAAAAT'
        'ATATATATATATA'
        'AATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    query = (
        'TTTTAAAAAAAAAAAAT'
        'ATATATATATATA'
        'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    # diagnostic output: query length minus the 13 matched and 4 soft-clipped bases
    print(len(query) - 13 - 4)
    input_cigar = convert_string_to_cigar('4S13=2I66=')
    read = MockRead(
        'name',
        reference_name='1',
        reference_start=4,
        cigar=input_cigar,
        query_sequence=query,
    )
    expected = convert_string_to_cigar('4S26=2I53=')
    observed = hgvs_standardize_cigar(read, reference)
    read.cigar = observed
    assert observed == expected
def test_even_deletion_in_repeat(self):
    """A 2-base deletion is expected to shift 11 bases along: ``4S13=2D64=`` -> ``4S24=2D53=``."""
    reference = (
        'AAAGAAAAAAAAAAAAT'
        'ATATATATATA'
        'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    query = (
        'TTTTAAAAAAAAAAAAT'
        'ATATATATATA'
        'AATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    print(len(query) - 28)
    input_cigar = convert_string_to_cigar('4S13=2D64=')
    read = MockRead(
        'name',
        reference_name='1',
        reference_start=4,
        cigar=input_cigar,
        query_sequence=query,
    )
    # only needed for the diagnostic deletion_sequences() output below
    genome = {'1': MockObject(seq=reference)}
    expected = convert_string_to_cigar('4S24=2D53=')
    observed = hgvs_standardize_cigar(read, reference)

    # deleted sequence before and after the read is given the standardized CIGAR
    print(SamRead.deletion_sequences(read, genome))
    read.cigar = observed
    print(SamRead.deletion_sequences(read, genome))

    assert observed == expected