def test_read_pair_inversion_overlapping_query_coverage(self):
    # Inversion call from two reads whose aligned query portions overlap.
    # Upper case marks the bases covered by M in each cigar, lower case the
    # soft-clipped bases; row "i" marks the 3 bases (GGA) covered by both.
    #
    # seq AAATTTCCCGGGAATTCCGGATCGATCGAT
    # r1  AAATTTCCCGGGAATTCCGGAtcgatcgat +
    # r2c aaatttcccgggaattccGGATCGATCGAT -
    # i   ------------------GGA---------
    # r2  ATCGATCGATCCggaattcccgggaaattt
    #
    # r2 starts at 99 and aligns 12 bases (99 + 12 = 111); the expected
    # second breakpoint is 111 - 3 = 108, i.e. shifted by the overlap length.
    contig = 'AAATTTCCCGGGAATTCCGGATCGATCGAT'  # 30 bases
    forward_read = MockRead(
        reference_id=0,
        reference_name='1',
        reference_start=0,
        cigar=[(CIGAR.M, 21), (CIGAR.S, 9)],
        query_sequence=contig,
        is_reverse=False,
    )
    reverse_read = MockRead(
        reference_id=0,
        reference_name='1',
        reference_start=99,
        cigar=[(CIGAR.M, 12), (CIGAR.S, 18)],
        query_sequence=reverse_complement(contig),
        is_reverse=True,
    )

    pair = align.call_paired_read_event(forward_read, reverse_read, is_stranded=True)

    # strands and orientations of the two breakpoints
    assert pair.break1.strand == STRAND.POS
    assert pair.break2.strand == STRAND.NEG
    assert pair.break1.orient == ORIENT.LEFT
    assert pair.break2.orient == ORIENT.LEFT
    # the reads overlap, so no untemplated sequence is expected
    assert pair.untemplated_seq == ''
    # positions and the sequence attached to each breakpoint
    assert pair.break1.start == 21
    assert pair.break2.start == 108
    assert pair.break1.seq == 'AAATTTCCCGGGAATTCCGGA'
    assert pair.break2.seq == reverse_complement('TCGATCGAT')
def test_read_pair_inversion_gap_in_query_coverage(self):
    # Inversion call from two reads whose aligned query portions leave a gap.
    # Upper case marks the bases covered by M in each cigar, lower case the
    # soft-clipped bases; row "i" marks the 2 bases (CC) covered by neither.
    #
    # seq AAATTTCCCGGGAATTCCGGATCGATCGAT
    # r1  AAATTTCCCGGGAATTccggatcgatcgat +
    # r2c aaatttcccgggaattccGGATCGATCGAT -
    # i   ----------------CC------------
    # r2  ATCGATCGATCCggaattcccgggaaattt
    #
    # r2 starts at 99 and aligns 12 bases (99 + 12 = 111); the expected
    # second breakpoint here is 111 and 'CC' is the untemplated sequence.
    contig = 'AAATTTCCCGGGAATTCCGGATCGATCGAT'  # 30 bases
    forward_read = MockRead(
        reference_id=0,
        reference_name='1',
        reference_start=0,
        cigar=[(CIGAR.M, 16), (CIGAR.S, 14)],
        query_sequence=contig,
        is_reverse=False,
    )
    reverse_read = MockRead(
        reference_id=0,
        reference_name='1',
        reference_start=99,
        cigar=[(CIGAR.M, 12), (CIGAR.S, 18)],
        query_sequence=reverse_complement(contig),
        is_reverse=True,
    )

    pair = align.call_paired_read_event(forward_read, reverse_read)

    # strands and orientations of the two breakpoints
    self.assertEqual(STRAND.POS, pair.break1.strand)
    self.assertEqual(STRAND.NEG, pair.break2.strand)
    self.assertEqual(ORIENT.LEFT, pair.break1.orient)
    self.assertEqual(ORIENT.LEFT, pair.break2.orient)
    # the uncovered bases are reported as untemplated sequence
    self.assertEqual('CC', pair.untemplated_seq)
    # positions and the sequence attached to each breakpoint
    self.assertEqual(16, pair.break1.start)
    self.assertEqual(111, pair.break2.start)
    self.assertEqual('AAATTTCCCGGGAATT', pair.break1.seq)
    self.assertEqual(reverse_complement('GGATCGATCGAT'), pair.break2.seq)
def test_read_pair_large_inversion_overlapping_query_coverage(self):
    # Inversion call on a long contig: the forward read aligns its last 120
    # bases (S125 =120) and the reverse read aligns after a 117-base soft clip
    # with a 1-base deletion in the reference.
    contig = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'
    forward_read = MockRead(
        reference_id=3,
        reference_start=1114,
        cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)],
        query_sequence=contig,
        is_reverse=False,
    )
    reverse_read = MockRead(
        reference_id=3,
        reference_start=2187,
        cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1), (CIGAR.M, 120)],
        query_sequence=reverse_complement(contig),
        is_reverse=True,
    )
    expected_break1_seq = (
        'TCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAG'
        'GGTTTTCATTTCTGTATGTTAAT'
    )
    expected_break2_seq = (
        'GCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCA'
        'AATTCTGTGTTTACAGGGCTTTCATGCTCAG'
    )

    pair = align.call_paired_read_event(forward_read, reverse_read)

    # strands and orientations of the two breakpoints
    self.assertEqual(STRAND.POS, pair.break1.strand)
    self.assertEqual(STRAND.NEG, pair.break2.strand)
    self.assertEqual(ORIENT.RIGHT, pair.break1.orient)
    self.assertEqual(ORIENT.RIGHT, pair.break2.orient)
    self.assertEqual('', pair.untemplated_seq)
    # breakpoint positions
    self.assertEqual(1115, pair.break1.start)
    self.assertEqual(2188 + 3, pair.break2.start)
    # debugging output kept from the original test
    print(pair.break1.seq)
    print(pair.break2.seq)
    # sequence attached to each breakpoint
    self.assertEqual(expected_break1_seq, pair.break1.seq)
    self.assertEqual(expected_break2_seq, pair.break2.seq)
def test_blat_contigs_deletion_revcomp(self):
    # Aligns the reverse complement of a contig with blat and checks that the
    # selected alignment is a single reverse-strand read whose cigar contains
    # one 1253-base deletion.
    evidence = GenomeEvidence(
        Breakpoint('fake', 1714, orient=ORIENT.LEFT),
        Breakpoint('fake', 2968, orient=ORIENT.RIGHT),
        opposing_strands=False,
        bam_cache=BAM_CACHE,
        reference_genome=REFERENCE_GENOME,
        read_length=40,
        stdev_fragment_size=25,
        median_fragment_size=100,
    )
    seq = (
        'GGTATATATTTCTCAGATAAAAGATATTTTCCCTTTTATCTTTCCCTAAGCTCACACTACATATATTGCATTTATCTTATATCTGCTTTAAAACCTATTTAT'
        'TATGTCATTTAAATATCTAGAAAAGTTATGACTTCACCAGGTATGAAAAATATAAAAAGAACTCTGTCAAGAAT'
    )
    # the contig handed to the aligner is the reverse complement of seq
    evidence.contigs = [Contig(reverse_complement(seq), 0)]

    raw_alignments = align.align_sequences(
        {'seq': evidence.contigs[0].seq},
        BAM_CACHE,
        REFERENCE_GENOME,
        aligner_reference=REFERENCE_GENOME_FILE_2BIT,
        aligner='blat',
    )
    align.select_contig_alignments(evidence, raw_alignments)

    print('alignments:', evidence.contigs[0].alignments)
    alignment = list(evidence.contigs[0].alignments)[0]
    print(alignment)

    # a single-read alignment on the reverse strand, reported as seq itself
    self.assertTrue(alignment.read2 is None)
    self.assertEqual(0, alignment.read1.reference_id)
    self.assertTrue(alignment.read1.is_reverse)
    self.assertEqual(seq, alignment.read1.query_sequence)
    self.assertEqual(Interval(0, 175), align.query_coverage_interval(alignment.read1))
    self.assertEqual(1612, alignment.read1.reference_start)
    self.assertEqual(
        [(CIGAR.EQ, 102), (CIGAR.D, 1253), (CIGAR.EQ, 74)],
        alignment.read1.cigar,
    )
def test_inversion_and_deletion(self):
    # Feeds two pre-built alignments of one contig (a forward read and a
    # reverse read containing a 1-base deletion) to select_contig_alignments
    # and expects two alignments to be attached to the contig.
    contig_seq = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'

    # stand-in for an evidence object; only the attributes listed are set
    evidence = MockObject(
        interchromosomal=False,
        opposing_strands=True,
        break1=MockObject(orient=ORIENT.RIGHT, chr='3'),
        break2=MockObject(orient=ORIENT.RIGHT, chr='3'),
        contigs=[MockObject(seq=contig_seq, alignments=set())],
        standardize_read=lambda x: x,
        contig_aln_max_event_size=DEFAULTS.contig_aln_max_event_size,
        contig_aln_merge_inner_anchor=5,
        contig_aln_merge_outer_anchor=DEFAULTS.contig_aln_merge_outer_anchor,
        contig_aln_min_query_consumption=0.9,
        contig_aln_min_extend_overlap=DEFAULTS.contig_aln_min_extend_overlap,
        contig_aln_min_anchor_size=DEFAULTS.contig_aln_min_anchor_size,
        contig_aln_min_score=DEFAULTS.contig_aln_min_score,
        outer_window1=Interval(1000, 1200),
        outer_window2=Interval(2000, 2200),
        reference_genome=None,
        bam_cache=mock.Mock(stranded=False),
    )

    forward_read = SamRead(
        reference_id=3,
        reference_start=1114,
        cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)],
        query_sequence=contig_seq,
        is_reverse=False,
        reference_name='3',
        alignment_rank=0,
    )
    reverse_read = SamRead(
        reference_id=3,
        reference_start=2187,
        cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1), (CIGAR.EQ, 120)],
        query_sequence=reverse_complement(contig_seq),
        is_reverse=True,
        reference_name='3',
        alignment_rank=1,
    )

    # raw alignments are keyed by the contig sequence
    raw_alignments = {contig_seq: [forward_read, reverse_read]}
    align.select_contig_alignments(evidence, raw_alignments)

    selected = list(evidence.contigs[0].alignments)
    self.assertEqual(2, len(selected))
def test_gimap4_reverse(self):
    # Predicts splice sites on the reverse complement of the GIMAP4 example
    # sequence and checks every reported site against the input sequence.
    gene = EXAMPLE_GENES['GIMAP4']
    revcomp_seq = reverse_complement(gene.seq)
    sites = predict_splice_sites(revcomp_seq, True)

    for site in sites:
        # start/end are used as 1-based inclusive coordinates into the input
        observed = revcomp_seq[site.start - 1 : site.end]
        assert observed == site.seq

    assert len(sites) == 5
def test_read_pair_large_inversion_overlapping_query_coverage(self):
    # Stranded inversion call on a long contig: the forward read aligns its
    # last 120 bases (S125 =120) and the reverse read aligns after a 117-base
    # soft clip with a 1-base deletion in the reference.
    contig = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'
    forward_read = MockRead(
        reference_id=3,
        reference_start=1114,
        cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)],
        query_sequence=contig,
        is_reverse=False,
    )
    reverse_read = MockRead(
        reference_id=3,
        reference_start=2187,
        cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1), (CIGAR.M, 120)],
        query_sequence=reverse_complement(contig),
        is_reverse=True,
    )
    expected_break1_seq = 'TCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'
    expected_break2_seq = 'GCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCAAATTCTGTGTTTACAGGGCTTTCATGCTCAG'

    pair = align.call_paired_read_event(forward_read, reverse_read, is_stranded=True)

    # strands and orientations of the two breakpoints
    assert pair.break1.strand == STRAND.POS
    assert pair.break2.strand == STRAND.NEG
    assert pair.break1.orient == ORIENT.RIGHT
    assert pair.break2.orient == ORIENT.RIGHT
    assert pair.untemplated_seq == ''
    # breakpoint positions
    assert pair.break1.start == 1115
    assert pair.break2.start == 2188 + 3
    # debugging output kept from the original test
    print(pair.break1.seq)
    print(pair.break2.seq)
    # sequence attached to each breakpoint
    assert pair.break1.seq == expected_break1_seq
    assert pair.break2.seq == expected_break2_seq
def test_gimap4_reverse(self):
    # Predicts splice sites on the reverse complement of the GIMAP4 example
    # sequence and checks every reported site against the input sequence.
    gene = EXAMPLE_GENES['GIMAP4']
    revcomp_seq = reverse_complement(gene.seq)
    sites = predict_splice_sites(revcomp_seq, True)

    for site in sites:
        # start/end are used as 1-based inclusive coordinates into the input
        observed = revcomp_seq[site.start - 1 : site.end]
        self.assertEqual(site.seq, observed)

    self.assertEqual(5, len(sites))
def test_bwa_contigs(self):
    # Aligns one contig with 'bwa mem' and checks the selected alignment is a
    # pair of reads on 'reference3' whose query sequences are reverse
    # complements of each other.
    evidence = GenomeEvidence(
        Breakpoint('reference3', 1114, orient=ORIENT.RIGHT),
        Breakpoint('reference3', 2187, orient=ORIENT.RIGHT),
        opposing_strands=True,
        bam_cache=BAM_CACHE,
        reference_genome=REFERENCE_GENOME,
        read_length=40,
        stdev_fragment_size=25,
        median_fragment_size=100,
        stdev_count_abnormal=2,
        min_splits_reads_resolution=1,
        min_flanking_pairs_resolution=1,
    )
    contig = Contig(
        'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAG'
        'TCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTG'
        'TTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT',
        0,
    )
    evidence.contigs = [contig]
    print(evidence.contigs[0].seq)

    raw_alignments = align.align_sequences(
        {'seq': evidence.contigs[0].seq},
        BAM_CACHE,
        REFERENCE_GENOME,
        aligner_reference=get_data('mock_reference_genome.fa'),
        aligner='bwa mem',
        aligner_output_file='mem.out',
        aligner_fa_input_file='mem.in.fa',
    )
    align.select_contig_alignments(evidence, raw_alignments)
    print(evidence.contigs[0].alignments)

    alignment = list(evidence.contigs[0].alignments)[0]
    first, second = alignment.read1, alignment.read2

    # the two reads carry the same contig in opposite orientations
    self.assertEqual(reverse_complement(first.query_sequence), second.query_sequence)
    # both reads map to the same reference
    self.assertEqual('reference3', first.reference_name)
    self.assertEqual('reference3', second.reference_name)
    self.assertEqual(1, first.reference_id)
    self.assertEqual(1, second.reference_id)
    # portion of the query covered by each read
    self.assertEqual(Interval(125, 244), align.query_coverage_interval(first))
    self.assertEqual(Interval(117, 244), align.query_coverage_interval(second))
    # reference positions and cigars
    self.assertEqual(1114, first.reference_start)
    self.assertEqual(2187, second.reference_start)
    self.assertEqual([(CIGAR.S, 125), (CIGAR.EQ, 120)], first.cigar)
    self.assertEqual([(CIGAR.S, 117), (CIGAR.EQ, 128)], second.cigar)
def test_simple(self):
    # Converts a single-block, minus-strand pslx row into a read and checks
    # the reference id/name, the query sequence and the resulting cigar.
    query = (
        'ACATGTGCACAACGTGCAGGTTTGTTACATATGTATACATGTGCCATGTTGGTTTGCTGCACCCATTAACTCGTCCTAGTTTATTACTAGTCTTCAGACATC'
        'CAGAAAATAGAGTAAGATACTAGGTAGACATAACACCTAGATACATCCGTAAGGCATTTGTTTCCTATCACATGGCCCATTCTAGCTTAACACCCACCAACT'
    )
    row = {
        'match': 142,
        'mismatch': 0,
        'repmatch': 0,
        'ncount': 0,
        'qgap_count': 0,
        'qgap_bases': 0,
        'tgap_count': 0,
        'tgap_bases': 0,
        'strand': '-',
        'qname': 'seq1',
        'qsize': 204,
        'qstart': 0,
        'qend': 142,
        'tname': '17',
        'tsize': 81195210,
        'tstart': 32673408,
        'tend': 32673550,
        'block_count': 1,
        'block_sizes': [142],
        'qstarts': [62],
        'tstarts': [32673408],
        '_index': 880,
        'score': 142,
        'percent_ident': 100.0,
        'qseq_full': query,
    }
    # reference sequence for '17', constructed with 32673407 as second argument
    target_seq = MockLongString(
        'ACTAGGTGTTATGTCTACCTAGTATCTTACTCTATTTTCTGGATGTCTGAAGACTAGTAATAAACTAGGACGAGTTAATGGGTGCAGCAAACCAACATGGCACATG'
        'TATACATATGTAACAAACCTGCACGTTGTGCACATGTACCCTAAAACTTAAAGTATAAAAAAAAATTTCACTGAGCATAAGACTTCAGACACAAAAGAGTGCATGC'
        'CATATAATTCCATTTATGTGAATTTCAAGAACAATCAGTGATGACAGAAGTCAAAGTAGTGGTCACCTCTGGAAGGTGGGACATTGACC',
        32673407,
    )
    refseq = {'17': Mock(seq=target_seq)}
    cache = Mock(reference_id=MockFunction(16))

    read = Blat.pslx_row_to_pysam(row, cache, refseq)

    self.assertEqual(16, read.reference_id)
    self.assertEqual('17', read.reference_name)
    # the read's query sequence is expected to be the reverse complement of qseq_full
    self.assertEqual(row['qseq_full'], reverse_complement(read.query_sequence))
    self.assertEqual([(CIGAR.S, 62), (CIGAR.EQ, 142)], read.cigar)
def test_blat_contigs_deletion_revcomp(self):
    # Aligns the reverse complement of a contig with blat and checks that the
    # selected alignment is a single reverse-strand read whose cigar contains
    # one 1253-base deletion.
    evidence = GenomeEvidence(
        Breakpoint('fake', 1714, orient=ORIENT.LEFT),
        Breakpoint('fake', 2968, orient=ORIENT.RIGHT),
        opposing_strands=False,
        bam_cache=BAM_CACHE,
        reference_genome=REFERENCE_GENOME,
        read_length=40,
        stdev_fragment_size=25,
        median_fragment_size=100,
    )
    seq = (
        'GGTATATATTTCTCAGATAAAAGATATTTTCCCTTTTATCTTTCCCTAAGCTCACACTACATATATTGCATTTATCTTATATCTGCTTTAAAACCTATTTAT'
        'TATGTCATTTAAATATCTAGAAAAGTTATGACTTCACCAGGTATGAAAAATATAAAAAGAACTCTGTCAAGAAT'
    )
    # the contig handed to the aligner is the reverse complement of seq
    evidence.contigs = [Contig(reverse_complement(seq), 0)]

    raw_alignments = align.align_sequences(
        {'seq': evidence.contigs[0].seq},
        BAM_CACHE,
        REFERENCE_GENOME,
        aligner_reference=get_data('mock_reference_genome.2bit'),
        aligner='blat',
    )
    align.select_contig_alignments(evidence, raw_alignments)

    print('alignments:', evidence.contigs[0].alignments)
    alignment = list(evidence.contigs[0].alignments)[0]
    print(alignment)

    # a single-read alignment on the reverse strand, reported as seq itself
    assert alignment.read2 is None
    assert alignment.read1.reference_id == 0
    assert alignment.read1.is_reverse
    assert alignment.read1.query_sequence == seq
    assert align.query_coverage_interval(alignment.read1) == Interval(0, 175)
    assert alignment.read1.reference_start == 1612
    assert alignment.read1.cigar == [(CIGAR.EQ, 102), (CIGAR.D, 1253), (CIGAR.EQ, 74)]
def test_pslx_row_to_pysam_inversion(self):
    # Converts the two pslx rows of an inversion (one '+' row, one '-' row of
    # the same query) into reads and checks position, cigar and query
    # coverage of each, plus that the two query sequences are reverse
    # complements of each other.
    contig = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'

    # first part of the inversion
    plus_block = (
        'TCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGG'
        'TTTTCATTTCTGTATGTTAAT'
    )
    plus_row = {
        'block_count': 1,
        'tstarts': [1114],
        'block_sizes': [120],
        'qname': 'seq1',
        'tname': 'reference3',
        'qstarts': [125],
        'strand': '+',
        'qseq_full': contig,
        'score': 1,
        'qseqs': [plus_block],
        'tseqs': [plus_block],
    }
    plus_read = Blat.pslx_row_to_pysam(plus_row, self.cache, REFERENCE_GENOME)
    self.assertEqual(3, plus_read.reference_id)
    self.assertEqual(Interval(125, 244), query_coverage_interval(plus_read))
    self.assertEqual(1114, plus_read.reference_start)
    self.assertEqual([(CIGAR.S, 125), (CIGAR.EQ, 120)], plus_read.cigar)

    # second part of the inversion
    minus_block = (
        'TGAGCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCAAAT'
        'TCTGTGTTTACAGGGCTTTCATGCTCAG'
    )
    minus_row = {
        'block_count': 1,
        'tstarts': [2187],
        'block_sizes': [128],
        'qname': 'seq1',
        'tname': 'reference3',
        'qstarts': [117],
        'strand': '-',
        'qseq_full': contig,
        'score': 1,
        'qseqs': [minus_block],
        'tseqs': [minus_block],
    }
    minus_read = Blat.pslx_row_to_pysam(minus_row, self.cache, REFERENCE_GENOME)
    self.assertEqual(3, minus_read.reference_id)
    self.assertEqual(2187, minus_read.reference_start)
    self.assertEqual([(CIGAR.S, 117), (CIGAR.EQ, 128)], minus_read.cigar)
    self.assertEqual(Interval(117, 244), query_coverage_interval(minus_read))

    # the two reads hold the same query in opposite orientations
    self.assertEqual(plus_read.query_sequence, reverse_complement(minus_read.query_sequence))
def test_reverse_complement(self):
    # (input, expected reverse complement); the empty string maps to itself
    cases = (
        ('CGAT', 'ATCG'),
        ('', ''),
    )
    for given, wanted in cases:
        assert reverse_complement(given) == wanted
def test_reverse_complement(self):
    # (input, expected reverse complement); the empty string maps to itself
    cases = (
        ('CGAT', 'ATCG'),
        ('', ''),
    )
    for given, wanted in cases:
        self.assertEqual(wanted, reverse_complement(given))