Esempio n. 1
0
 def test_inversion_and_deletion(self):
     """Select alignments for one contig from a forward read and a reverse read carrying a 1-base deletion.

     After `align.select_contig_alignments` runs, the contig is expected to
     hold exactly two entries in its `alignments` collection.
     """
     contig_seq = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'
     contig = MockObject(seq=contig_seq, alignments=set())
     evidence = MockObject(
         interchromosomal=False,
         opposing_strands=True,
         break1=MockObject(orient=ORIENT.RIGHT, chr='3'),
         break2=MockObject(orient=ORIENT.RIGHT, chr='3'),
         contigs=[contig],
         standardize_read=lambda x: x,
         contig_aln_max_event_size=DEFAULTS.contig_aln_max_event_size,
         contig_aln_merge_inner_anchor=5,
         contig_aln_merge_outer_anchor=DEFAULTS.contig_aln_merge_outer_anchor,
         contig_aln_min_query_consumption=0.9,
         contig_aln_min_extend_overlap=DEFAULTS.contig_aln_min_extend_overlap,
         contig_aln_min_anchor_size=DEFAULTS.contig_aln_min_anchor_size,
         contig_aln_min_score=DEFAULTS.contig_aln_min_score,
         outer_window1=Interval(1000, 1200),
         outer_window2=Interval(2000, 2200),
         reference_genome=None,
         bam_cache=mock.Mock(stranded=False),
     )
     # forward-strand alignment: soft clip followed by 120 matched bases
     forward_read = SamRead(
         reference_id=3,
         reference_start=1114,
         cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)],
         query_sequence=contig_seq,
         is_reverse=False,
         reference_name='3',
         alignment_rank=0,
     )
     # reverse-strand alignment of the same contig, containing a single-base deletion
     reverse_read = SamRead(
         reference_id=3,
         reference_start=2187,
         cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1), (CIGAR.EQ, 120)],
         query_sequence=reverse_complement(contig_seq),
         is_reverse=True,
         reference_name='3',
         alignment_rank=1,
     )
     align.select_contig_alignments(evidence, {contig_seq: [forward_read, reverse_read]})
     selected = list(evidence.contigs[0].alignments)
     self.assertEqual(2, len(selected))
Esempio n. 2
0
 def test_deletion_repeat(self):
     """Standardize a 27-base deletion inside a repeat.

     The expected cigar keeps every operation but moves 30 matched bases
     from after the deletion to before it.
     """
     # the pieces below follow the cigar operations of the read, in order
     query = (
         'GAGT'
         'GAGACTCTGT'
         'GAA'
         'AAAGAAAAAAAAAA'
         'A'
         'ATATATATATATATAAATATA'
         'C'
         'ATATTATGTATCAAATATATAT'
         'TATGTGTAATATACATCATGTATCAAATATATATTATGTATAATATACATCATATATCAAATATATATTATGTG'
     )
     # deleted reference: TATGTGTAATATACATCATGTATCAAA
     print(query[:76], query[76:])
     # operations shared by the input cigar and the expected cigar
     shared_prefix = [
         (CIGAR.S, 4), (CIGAR.EQ, 10), (CIGAR.X, 3),
         (CIGAR.EQ, 14), (CIGAR.X, 1), (CIGAR.EQ, 21),
         (CIGAR.X, 1),
     ]
     shift = 30
     read = MockRead(
         'name',
         reference_name='11_86018001-86018500',
         reference_start=28,
         cigar=shared_prefix + [(CIGAR.EQ, 22), (CIGAR.D, 27), (CIGAR.EQ, 74)],
         query_sequence=query,
     )
     expected_cigar = shared_prefix + [(CIGAR.EQ, 22 + shift), (CIGAR.D, 27), (CIGAR.EQ, 74 - shift)]
     reference_seq = REFERENCE_GENOME[read.reference_name].seq
     std_cigar = hgvs_standardize_cigar(read, reference_seq)
     # show the deleted sequence before and after applying the standardized cigar
     print(SamRead.deletion_sequences(read, REFERENCE_GENOME))
     read.cigar = std_cigar
     print(SamRead.deletion_sequences(read, REFERENCE_GENOME))
     self.assertEqual(expected_cigar, std_cigar)
Esempio n. 3
0
 def test_bubble_sort_indel_sections_drop_mismatch_with_hardclipping(self):
     """Standardize a hard-clipped read whose mismatch is followed by interleaved D/I runs.

     The expected cigar keeps the leading hard clip and collapses the
     mismatch plus the alternating deletions/insertions into one
     insertion (6) followed by one deletion (5).
     """
     ref = ''.join(['ATAGGC', 'ATCT', 'ACGA', 'ACGA', 'ACGA', 'GATCGCTACG'])
     # original
     # ATAGGCATCTACG   AA  CGAACGAGATCGCTACG
     #       ATCTC  TTT  TTCGAACG
     # expected
     # ATAGGCATCT      ACGAACGAACGAGATCGCTACG
     #       ATCTCTTTTT     CGAACG
     input_cigar = [
         (CIGAR.H, 10),
         (CIGAR.EQ, 4),
         (CIGAR.X, 1),
         (CIGAR.D, 2),
         (CIGAR.I, 3),
         (CIGAR.D, 2),
         (CIGAR.I, 2),
         (CIGAR.EQ, 6),
     ]
     expected_cigar = [(CIGAR.H, 10), (CIGAR.EQ, 4), (CIGAR.I, 6), (CIGAR.D, 5), (CIGAR.EQ, 6)]
     read = MockRead(
         'name',
         1,
         6,
         reference_name='1',
         query_sequence='ATCTCTTTTTCGAACG',
         cigar=input_cigar,
     )
     genome = {'1': MockObject(seq=ref)}
     print(SamRead.deletion_sequences(read, genome))
     print(SamRead.insertion_sequences(read))
     print(read.query_sequence, len(read.query_sequence))
     observed_cigar = hgvs_standardize_cigar(read, ref)
     self.assertEqual(expected_cigar, observed_cigar)
Esempio n. 4
0
 def test_shift_overaligned(self):
     """Standardize a read whose deletion starts two bases past the first exon.

     With a pre-transcript (exons 1-12 and 20-28) registered on the
     evidence, the read aligned as 14=7D12= is expected to come back as
     12=7N14=.
     """
     # qwertyuiopas---kkkkk------dfghjklzxcvbnm
     # ..........      ................
     gene = Gene('1', 1, 1000, strand='+')
     pre_transcript = PreTranscript(exons=[(1, 12), (20, 28)], gene=gene, strand='+')
     for pattern in pre_transcript.generate_splicing_patterns():
         spliced = Transcript(pre_transcript, pattern)
         pre_transcript.transcripts.append(spliced)
     gene.transcripts.append(pre_transcript)

     overaligned = SamRead(
         reference_name='1',
         reference_start=0,
         cigar=_cigar.convert_string_to_cigar('14=7D12='),
         query_sequence='qwertyuiopasdfghjklzxcvbnm',
     )
     genome = {'1': MockObject(seq='qwertyuiopasdfkkkkkdfghjklzxcvbnm')}
     evidence = TranscriptomeEvidence(
         annotations={},
         reference_genome=genome,
         bam_cache=MockObject(get_read_reference_name=lambda r: r.reference_name),
         break1=Breakpoint('1', 1, orient='L', strand='+'),
         break2=Breakpoint('1', 10, orient='R', strand='+'),
         read_length=75,
         stdev_fragment_size=75,
         median_fragment_size=220,
     )
     evidence.overlapping_transcripts.add(pre_transcript)

     shifted = evidence.standardize_read(overaligned)
     assert shifted.cigar == _cigar.convert_string_to_cigar('12=7N14=')
Esempio n. 5
0
 def test_insertions(self):
     """`SamRead.insertion_sequences` should return the inserted bases for each I operation, in order."""
     read = MockRead(
         reference_start=0,
         reference_name='1',
         query_sequence='abcdekkkfghijklmnopqkkkkrstuvwxyz',
         cigar=convert_string_to_cigar('5=3I12=4I9='),
     )
     inserted = SamRead.insertion_sequences(read)
     self.assertEqual(['kkk', 'kkkk'], inserted)
Esempio n. 6
0
 def test_deletions(self):
     """`SamRead.deletion_sequences` should return the reference bases removed by each D operation, in order."""
     cigar = convert_string_to_cigar('2=3D8=4D9=')
     # the query sequence is left empty; only the cigar and the fixture genome are supplied
     read = MockRead(reference_start=0, reference_name='1', query_sequence='', cigar=cigar)
     deleted = SamRead.deletion_sequences(read, self.reference_genome)
     self.assertEqual(['cde', 'nopq'], deleted)
Esempio n. 7
0
 def test_odd_deletion_in_repeat(self):
     """Standardize a 3-base deletion inside a repeat.

     The input cigar 4S13=3D63= is expected to become 4S24=3D52=.
     """
     rseq = (
         'AAAGAAAAAAAAAAAAT'
         'ATATATATATA'
         'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
     )
     qseq = (
         'TTTTAAAAAAAAAAAAT'
         'ATATATATATA'
         'ATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
     )
     print(len(qseq) - 28)
     read = MockRead(
         'name',
         reference_name='1',
         reference_start=4,
         cigar=convert_string_to_cigar('4S13=3D63='),
         query_sequence=qseq,
     )
     genome = {'1': MockObject(seq=rseq)}
     expected_cigar = convert_string_to_cigar('4S24=3D52=')
     standardized = hgvs_standardize_cigar(read, rseq)
     # show the deleted sequence before and after applying the standardized cigar
     print(SamRead.deletion_sequences(read, genome))
     read.cigar = standardized
     print(SamRead.deletion_sequences(read, genome))
     self.assertEqual(expected_cigar, standardized)
Esempio n. 8
0
 def test_complex(self):
     """Standardize a read with a 26-base insertion followed by a mismatch and two 1-base insertions.

     Only the large insertion is expected to move: three matched bases
     shift from after it (18 -> 15) to before it (49 -> 52).
     """
     qseq = (
         'TATTTGGAAATATTTGTAAGATAGATGTCTCTG' 'C'
         'CTCCTTCTGTTTCTGTCTCTGTCTCTTGCACTCTCTCTCTCCCTCTCTT'
         'TCTCTCTCTCTCTCTCTCTCTCTCTC'
         'TCTATATATATATATATA'
         'T' 'A' 'T' 'C' 'T'
         'ACACACACACACACACAC'
     )
     rseq = (
         'TATTTGGAAATATTTGTAAGATAGATGTCTCTG' 'T'
         'CTCCTTCTGTTTCTGTCTCTGTCTCTTGCACTCTCTCTCTCCCTCTCTT'
         'TCTATATATATATATATA'
         'C' 'A' 'C'
         'ACACACACACACACACAC'
     )
     # operations that are identical in the input and the expected cigar
     head = [(CIGAR.EQ, 33), (CIGAR.X, 1)]
     tail = [
         (CIGAR.X, 1), (CIGAR.EQ, 1), (CIGAR.I, 1),
         (CIGAR.EQ, 1), (CIGAR.I, 1), (CIGAR.EQ, 18),
     ]
     read = MockRead(
         'name',
         reference_name='mock',
         reference_start=0,
         query_sequence=qseq,
         cigar=head + [(CIGAR.EQ, 49), (CIGAR.I, 26), (CIGAR.EQ, 18)] + tail,
     )
     new_cigar = head + [(CIGAR.EQ, 52), (CIGAR.I, 26), (CIGAR.EQ, 15)] + tail

     print(rseq)
     after_insert = 83 + 26  # query offset just past the 26-base insertion
     print(
         read.query_sequence[:83],
         read.query_sequence[after_insert: after_insert + 20],
         read.query_sequence[after_insert + 22:],
     )
     print(read.query_sequence)
     print(SamRead.insertion_sequences(read))

     std_cigar = hgvs_standardize_cigar(read, rseq)
     print(new_cigar)
     print(std_cigar)
     self.assertEqual(new_cigar, std_cigar)
Esempio n. 9
0
 def test_bwa_mem(self):
     """Standardize a read aligned as 183=12D19= against the fixture evidence.

     The deletion is expected to move three bases to the right
     (186=12D16=) while the reference start stays the same.
     """
     # Recorded output for this read:
     #   SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...)
     #   std SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...)
     #   > BPP(Breakpoint(1:224646893L-), Breakpoint(1:224646906R-), opposing=False, seq='')
     aligned = SamRead(reference_name='1')
     # attributes are assigned one at a time, in the same order as before
     aligned.query_sequence = 'TCAGCTCTCTTAGGGCACACCCTCCAAGGTGCCTAAATGCCATCCCAGGATTGGTTCCAGTGTCTATTATCTGTTTGACTCCAAATGGCCAAACACCTGACTTCCTCTCTGGTAGCCTGGCTTTTATCTTCTAGGACATCCAGGGCCCCTCTCTTTGCCTTCCCCTCTTTCTTCCTTCTACTGCTTCAGCAGACATCATGTG'
     aligned.reference_start = 224646710
     aligned.reference_id = 0
     print(_cigar.convert_string_to_cigar('183=12D19='))
     input_cigar = _cigar.convert_string_to_cigar('183=12D19=')
     aligned.cigar = _cigar.join(input_cigar)
     aligned.query_name = 'name'
     aligned.mapping_quality = NA_MAPPING_QUALITY

     standardized = Evidence.standardize_read(self.mock_evidence, aligned)
     print(SamRead.__repr__(aligned))
     print(SamRead.__repr__(standardized))

     expected_cigar = _cigar.convert_string_to_cigar('186=12D16=')
     self.assertEqual(expected_cigar, standardized.cigar)
     self.assertEqual(aligned.reference_start, standardized.reference_start)
Esempio n. 10
0
 def test_deletions(self):
     """`SamRead.deletion_sequences` should return the reference bases removed by each D operation, in order."""
     # the query sequence is left empty; only the cigar and the reference are supplied
     read = MockRead(
         reference_start=0,
         reference_name='1',
         query_sequence='',
         cigar=convert_string_to_cigar('2=3D8=4D9='),
     )
     alphabet_genome = {'1': MockObject(seq='abcdefghijklmnopqrstuvwxyz')}
     assert SamRead.deletion_sequences(read, alphabet_genome) == ['cde', 'nopq']
Esempio n. 11
0
 def test_even_deletion_in_repeat(self):
     """Standardize a 2-base deletion inside a repeat.

     The input cigar 4S13=2D64= is expected to become 4S24=2D53=.
     """
     rseq = ''.join([
         'AAAGAAAAAAAAAAAAT',
         'ATATATATATA',
         'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC',
     ])
     qseq = ''.join([
         'TTTTAAAAAAAAAAAAT',
         'ATATATATATA',
         'AATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC',
     ])
     print(len(qseq) - 28)
     input_cigar = convert_string_to_cigar('4S13=2D64=')
     read = MockRead('name', reference_name='1', reference_start=4, cigar=input_cigar, query_sequence=qseq)
     genome = {'1': MockObject(seq=rseq)}
     expected_cigar = convert_string_to_cigar('4S24=2D53=')
     standardized = hgvs_standardize_cigar(read, rseq)
     # show the deleted sequence before and after applying the standardized cigar
     print(SamRead.deletion_sequences(read, genome))
     read.cigar = standardized
     print(SamRead.deletion_sequences(read, genome))
     assert standardized == expected_cigar
Esempio n. 12
0
 def test_shift_no_transcripts(self):
     """With no transcripts added to the evidence, `exon_boundary_shift_cigar` should return the cigar unchanged."""
     cigar_string = '14=7D18='
     read = SamRead(
         reference_name='1',
         reference_start=0,
         cigar=_cigar.convert_string_to_cigar(cigar_string),
         query_sequence='qwertyuiopasdfdfghjklzxcvbnm',
     )
     genome = {'1': MockObject(seq='qwertyuiopasdfkkkkkdfghjklzxcvbnm')}
     evidence = TranscriptomeEvidence(
         annotations={},
         reference_genome=genome,
         bam_cache=None,
         break1=Breakpoint('1', 1, orient='L', strand='+'),
         break2=Breakpoint('1', 10, orient='R', strand='+'),
         read_length=75,
         stdev_fragment_size=75,
         median_fragment_size=220,
     )
     shifted_cigar = evidence.exon_boundary_shift_cigar(read)
     self.assertEqual(_cigar.convert_string_to_cigar(cigar_string), shifted_cigar)
Esempio n. 13
0
 def test_bwa_mem(self):
     """Standardize a read aligned as 183=12D19= against an inline mock evidence object.

     The deletion is expected to move three bases to the right
     (186=12D16=) while the reference start stays the same.
     """
     # slice of reference wrapped in MockLongString with an explicit offset
     reference_seq = MockLongString(
         'TGGGTATCAGACACACTGGGTAGCTGAGTGCTCAGAGGAAGATGCGAGGTATTCAGGGAAAGTGTCAGTGGGGTCTCCCAGTGCCTGTTTGGTCCACAGTTAGGAGA'
         'GGCCCTGCTTGCACTTCTAATACAGTCCCGGAAAGACGGGGCCAGAACTTAGGAGGGGAGCGCTTTGCAGCAACTTTTCAAGAAAAGGGGAAAATTTAAGCACCATA'
         'CTGTTATGTGGTCCTTGTACCCAGAGGCCCTGTTCAGCTCCAGTGATCAGCTCTCTTAGGGCACACCCTCCAAGGTGCCTAAATGCCATCCCAGGATTGGTTCCAGT'
         'GTCTATTATCTGTTTGACTCCAAATGGCCAAACACCTGACTTCCTCTCTGGTAGCCTGGCTTTTATCTTCTAGGACATCCAGGGCCCCTCTCTTTGCCTTCCCCTCT'
         'TTCTTCCTTCTACTGCTTAGATCAAGTCTTCAGCAGACATCATGTGACCTTGAGGATGGATGTCACATGCTGGAGGAAACAGAAGGCCGAAACCCTGATGACTTCAC'
         'AGAGCTGCCAAAACAGTTCCTGACTGTTTATTCCGGGTCTTTAACAAAGTGATGAAAAGAAATCCTTGCAGTATGAAAACAACTTTTCTATTCCATGGAGCCAAACC'
         'TCATTATAACAGATAACGTGACCCTCAGCGATATCCCAAGTATTTTCCTGTTCTCATCTATACTATGGCAAAGGGGCAAATACCTCTCAGTAAAGAAAGAAATAACA'
         'ACTTCTATCTTGGGCGAGGCATTTCTTCTGTTAGAACTTTGTACACGGAATAAAATAGATCTGTTTGTGCTTATCTTTCTCCTTAGAATTATTGAATTTGAAGTCTT'
         'TCCCAGGGTGGGGGTGGAGTGAAGCTGGGGTTTCATAAGCACATAGATAGTAGTG',
         offset=224646450,
     )
     mock_evidence = MockObject(
         reference_genome={'1': MockObject(seq=reference_seq)},
         bam_cache=MockObject(get_read_reference_name=lambda x: x.reference_name),
         config={
             # NOTE(review): `**DEFAULTS` is unpacked last, so any key of the same
             # name in DEFAULTS replaces the two explicit values here -- confirm intended
             'validate.contig_aln_merge_inner_anchor': 10,
             'validate.contig_aln_merge_outer_anchor': 20,
             **DEFAULTS,
         },
     )
     # Recorded output for this read:
     #   SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...)
     #   std SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...)
     #   > BPP(Breakpoint(1:224646893L-), Breakpoint(1:224646906R-), opposing=False, seq='')
     aligned = SamRead(reference_name='1')
     # attributes are assigned one at a time, in the same order as before
     aligned.query_sequence = 'TCAGCTCTCTTAGGGCACACCCTCCAAGGTGCCTAAATGCCATCCCAGGATTGGTTCCAGTGTCTATTATCTGTTTGACTCCAAATGGCCAAACACCTGACTTCCTCTCTGGTAGCCTGGCTTTTATCTTCTAGGACATCCAGGGCCCCTCTCTTTGCCTTCCCCTCTTTCTTCCTTCTACTGCTTCAGCAGACATCATGTG'
     aligned.reference_start = 224646710
     aligned.reference_id = 0
     print(_cigar.convert_string_to_cigar('183=12D19='))
     input_cigar = _cigar.convert_string_to_cigar('183=12D19=')
     aligned.cigar = _cigar.join(input_cigar)
     aligned.query_name = 'name'
     aligned.mapping_quality = NA_MAPPING_QUALITY

     standardized = Evidence.standardize_read(mock_evidence, aligned)
     assert standardized.cigar == _cigar.convert_string_to_cigar('186=12D16=')
     assert standardized.reference_start == aligned.reference_start
Esempio n. 14
0
    def test_hardclipping(self):
        """Call events on a reverse-strand read with a 1-base deletion and a trailing hard clip.

        Exactly one event is expected, with breakpoints at 71491956 (L, -)
        and 71491958 (R, -) on chromosome 15.
        """
        chrom = '15'
        read = SamRead(reference_name=chrom)
        read.reference_start = 71491944
        read.cigar = _cigar.convert_string_to_cigar('12=1D25=113H')
        read.query_sequence = 'GTGTGTGGTGTGGGGTGTGTGGTGTGTGTGGTGTGTG'
        read.is_reverse = True

        left_break = Breakpoint(chrom, 71491956, orient='L', strand='-')
        right_break = Breakpoint(chrom, 71491958, orient='R', strand='-')
        expected_bpp = BreakpointPair(left_break, right_break, untemplated_seq='')

        events = align.call_read_events(read, is_stranded=True)
        self.assertEqual(1, len(events))
        called = events[0]
        self.assertEqual(expected_bpp.break1, called.break1)
        self.assertEqual(expected_bpp.break2, called.break2)