def test_realigner_doesnt_create_invalid_intervals(self):
  """Tests that read sets don't result in a crash in reference_fai.cc."""
  region = ranges.parse_literal('chr20:63,025,320-63,025,520')

  # Twenty identically-specified 250M reads, each built by its own call.
  matched_reads = []
  for _ in range(20):
    matched_reads.append(
        test_utils.make_read(
            'ACCGT' * 50,
            start=63025520 - 250,
            cigar='250M',
            quals=list(np.tile(range(30, 35), 50))))
  self.reads_realigner.realign_reads(matched_reads, region)

  # These reads are aligned off the edge of the contig. Note that the
  # reference bases in this interval are all Ns as well.
  clipped_reads = []
  for _ in range(20):
    clipped_reads.append(
        test_utils.make_read(
            'TTATA' * 50,
            start=63025520 - 200,
            cigar='200M50S',
            quals=list(np.tile(range(30, 35), 50))))
  self.reads_realigner.realign_reads(clipped_reads, region)
def test_make_read_produces_unique_read_names(self):
  """Two reads made with identical arguments get distinct, non-empty names."""
  start = 0
  first = test_utils.make_read('A', start=start)
  second = test_utils.make_read('A', start=start)
  for read in (first, second):
    self.assertGreater(len(read.fragment_name), 0)
  self.assertNotEqual(first.fragment_name, second.fragment_name)
def test_candidate_pos_low_qual(self):
  """Test WindowSelector.process_read() with reads of low quality."""
  selector = WindowSelector(self.test_ws_config())
  ref = 'A' * 100

  # Each case: (bases, cigar, quals, name, expected candidate positions).
  cases = [
      ('AAGA', '4M', [64, 64, 10, 30], 'read_1', []),
      ('AAGTA', '2M2I1M', [64, 64, 10, 30, 64], 'read_2', [11, 13]),
      ('TGATAC', '2S3M1S', [64, 10, 64, 64, 64, 64], 'read_3', [8, 11, 13]),
      ('AAGA', '2M1X1M', [64, 64, 30, 10], 'read_4', [12]),
  ]

  # All reads are created before any of them is processed.
  reads = [
      test_utils.make_read(
          bases, start=10, cigar=cigar, quals=quals, name=name)
      for bases, cigar, quals, name, _ in cases
  ]
  for read, case in zip(reads, cases):
    expected_positions = case[-1]
    self.assertEqual(
        list(selector.process_read(ref, read)), expected_positions)
def test_candidate_pos_low_qual(self):
  """Test WindowSelector.process_read() with reads of low quality."""
  selector = WindowSelector(self.test_ws_config())
  ref = 'A' * 100

  # Each case: (bases, cigar, quals, name, expected candidate positions).
  cases = [
      ('AAGA', '4M', [64, 64, 10, 30], 'read_1', []),
      ('AAGTA', '2M2I1M', [64, 64, 10, 30, 64], 'read_2', [11, 13]),
      ('TGATAC', '2S3M1S', [64, 10, 64, 64, 64, 64], 'read_3', [8, 11, 13]),
      ('AAGA', '2M1X1M', [64, 64, 30, 10], 'read_4', [12]),
  ]

  # All reads are created before any of them is processed.
  reads = [
      test_utils.make_read(
          bases, start=10, cigar=cigar, quals=quals, name=name)
      for bases, cigar, quals, name, _ in cases
  ]
  for read, case in zip(reads, cases):
    expected_positions = case[-1]
    self.assertEqual(
        list(selector.process_read(ref, read)), expected_positions)
def test_read_end(self, update_cached_read_end_first):
  """Tests reads have their ends calculated correctly."""
  start = 10000001
  # (cigar, expected end): one read with an insertion, one with a deletion.
  cases = (
      ('2M1I3M', start + 5),
      ('2M16D3M', start + 5 + 16),
  )
  for cigar, expected_end in cases:
    read = test_utils.make_read(
        'AAACAG',
        chrom='chrX',
        start=start,
        cigar=cigar,
        quals=range(10, 16),
        name='read1')
    if update_cached_read_end_first:
      # Explicitly update cached_end.
      read.cached_end = utils.read_end(read, use_cached_read_end=False)
      self.assertEqual(expected_end, read.cached_end)
    self.assertEqual(expected_end, utils.read_end(read))
def setUp(self):
  """Builds the image creator, four reads and a mocked encoder."""
  self.alt_allele = 'C'
  self.dv_call = _make_dv_call(ref_bases='G', alt_bases=self.alt_allele)
  self.pic = _make_image_creator(
      None, None, width=3, height=4, reference_band_height=2)
  self.ref = 'AGC'
  self.read1 = test_utils.make_read('AGC', start=0, cigar='3M', name='read1')
  self.read2 = test_utils.make_read('AGC', start=1, cigar='3M', name='read2')
  self.read3 = test_utils.make_read('AGC', start=2, cigar='3M', name='read3')
  self.read4 = test_utils.make_read('AGC', start=3, cigar='3M', name='read4')

  num_channels = self.pic.num_channels
  row_shape = (1, 3, num_channels)
  ref_row = np.asarray(range(0, 3 * num_channels), np.uint8)
  self.expected_rows = {
      'ref': ref_row.reshape(1, 3, num_channels),
      'empty': np.zeros(row_shape, dtype=np.uint8),
      'read1': np.full(row_shape, 1, dtype=np.uint8),
      'read2': np.full(row_shape, 2, dtype=np.uint8),
      # The mocked encode_read returns None for read3.
      'read3': None,
      'read4': np.full(row_shape, 3, dtype=np.uint8),
  }

  # pylint: disable=unused-argument
  def get_read_row(dv_call, refbases, read, pos, alt_allele):
    # Rows are looked up purely by the read's fragment name.
    return self.expected_rows[read.fragment_name]

  # Setup our shared mocks.
  mock_encoder = mock.Mock(spec=['encode_read', 'encode_reference'])
  mock_encoder.encode_reference.return_value = self.expected_rows['ref']
  mock_encoder.encode_read.side_effect = get_read_row

  self.mock_enc_ref = mock_encoder.encode_reference
  self.mock_enc_read = mock_encoder.encode_read
  self.pic._encoder = mock_encoder
def setUp(self):
  """Creates the named reads and assembled regions used by the tests."""
  # Each spec: (bases, start, cigar, name).
  read_specs = [
      ('ACG', 1, '3M', 'read1'),
      ('ACG', 6, '3M', 'read2'),
      ('ACG', 9, '3M', 'read3'),
      ('ACG', 28, '3M', 'read4'),
      ('A' * 10, 3, '10M', 'read5'),
  ]
  self.reads = {}
  for bases, start, cigar, name in read_specs:
    read = test_utils.make_read(bases, start=start, cigar=cigar, name=name)
    self.reads[read.fragment_name] = read

  self.regions = {
      'r1': _test_assembled_region('chr1:1-5'),
      'r2': _test_assembled_region('chr1:10-15'),
      'r3': _test_assembled_region('chr1:20-30'),
  }
  # Regions ordered by their dictionary key.
  self.assembled_regions = [
      self.regions[key] for key in sorted(self.regions)
  ]
def test_align_reads_simple(self, read_seq, expected_align_pos,
                            expected_cigar, comment):
  """Test Aligner.align_reads(). Simple tests.

  Targets consist of
    - original reference sequence.
    - a sequence with 'AA' insertion at position 14 and
    - 'T' deletion at position 19.

  Args:
    read_seq: str, read sequence.
    expected_align_pos: int, expected aligned position
    expected_cigar: [(int, str)], expected cigar information.
    comment: str, test comment.
  """
  ref_seq = 'AAAAAAAAAAAAATGCATGGGGGATTTTTTTTTTT'
  region = ranges.make_range('ref', 10, 10 + len(ref_seq))
  test_aligner = self.make_test_aligner(ref_seq, region)
  # For local alignment, the targets keep enough exact matches between the
  # reference and target for end-to-end alignment.
  targets = [ref_seq, 'AAAAAAAAAAAAATAAGCAGGGGGATTTTTTTTTTT']
  read_len = len(read_seq)

  # First pass: the read is a plain run of matches.
  plain_read = test_utils.make_read(
      read_seq,
      chrom='ref',
      start=0,
      cigar=[(read_len, 'M')],
      quals=[64] * read_len,
      name='read')
  realigned = test_aligner.align_reads(targets, [plain_read])[0]
  self.assertEqual(expected_align_pos,
                   realigned.alignment.position.position, comment)
  self.assertEqual(
      _cigar.to_cigar_units(expected_cigar),
      list(realigned.alignment.cigar), comment)

  # Second pass: same read with hard clips on both ends of the cigar.
  clipped_read = test_utils.make_read(
      read_seq,
      chrom='ref',
      start=0,
      cigar=[(2, 'H'), (read_len, 'M'), (1, 'H')],
      quals=[64] * read_len,
      name='read')
  realigned = test_aligner.align_reads(targets, [clipped_read])[0]
  expected_cigar_w_hard_clip = [(2, 'H')] + expected_cigar + [(1, 'H')]
  self.assertEqual(
      _cigar.to_cigar_units(expected_cigar_w_hard_clip),
      list(realigned.alignment.cigar), comment)
def test_align_reads_simple(self, read_seq, expected_align_pos,
                            expected_cigar, comment):
  """Test Aligner.align_reads(). Simple tests.

  Targets consist of
    - original reference sequence.
    - a sequence with 'AA' insertion at position 14 and
    - 'T' deletion at position 19.

  Args:
    read_seq: str, read sequence.
    expected_align_pos: int, expected aligned position
    expected_cigar: [(int, str)], expected cigar information.
    comment: str, test comment.
  """
  ref_seq = 'AAAAAAAAAAAAATGCATGGGGGATTTTTTTTTTT'
  region = ranges.make_range('ref', 10, 10 + len(ref_seq))
  test_aligner = self.make_test_aligner(ref_seq, region)
  # For local alignment, the targets keep enough exact matches between the
  # reference and target for end-to-end alignment.
  targets = [ref_seq, 'AAAAAAAAAAAAATAAGCAGGGGGATTTTTTTTTTT']
  read_len = len(read_seq)

  # First pass: the read is a plain run of matches.
  plain_read = test_utils.make_read(
      read_seq,
      chrom='ref',
      start=0,
      cigar=[(read_len, 'M')],
      quals=[64] * read_len,
      name='read')
  realigned = test_aligner.align_reads(targets, [plain_read])[0]
  self.assertEqual(expected_align_pos,
                   realigned.alignment.position.position, comment)
  self.assertEqual(
      _cigar.to_cigar_units(expected_cigar),
      list(realigned.alignment.cigar), comment)

  # Second pass: same read with hard clips on both ends of the cigar.
  clipped_read = test_utils.make_read(
      read_seq,
      chrom='ref',
      start=0,
      cigar=[(2, 'H'), (read_len, 'M'), (1, 'H')],
      quals=[64] * read_len,
      name='read')
  realigned = test_aligner.align_reads(targets, [clipped_read])[0]
  expected_cigar_w_hard_clip = [(2, 'H')] + expected_cigar + [(1, 'H')]
  self.assertEqual(
      _cigar.to_cigar_units(expected_cigar_w_hard_clip),
      list(realigned.alignment.cigar), comment)
def test_pruning_1(self):
  """Test that pruning removes a path traced by only one read."""
  ref_str = 'GATTACA'
  read_str = 'GATGACA'
  num_bases = len(read_str)
  single_read = test_utils.make_read(
      read_str,
      chrom='chr20',
      start=1,
      cigar=[(num_bases, 'M')],
      quals=[30] * num_bases,
      name='read')
  dbg = debruijn_graph.build(ref_str, [single_read],
                             self.single_k_dbg_options(3))
  # The expected graph holds only the 3-mers of ref_str.
  expected_graph = """\
      digraph G {
      0[label=GAT];
      1[label=ATT];
      2[label=TTA];
      3[label=TAC];
      4[label=ACA];
      0->1 [label=1 color=red];
      1->2 [label=1 color=red];
      2->3 [label=1 color=red];
      3->4 [label=1 color=red];
      }
      """
  self.assertGraphEqual(expected_graph, dbg)
def test_candidates_from_reads_all_cigars(self, bases, cigar, expected):
  """Checks the candidate positions found for a read with the given cigar.

  Args:
    bases: read sequence; every base is given a quality of 64.
    cigar: cigar assigned to the read, which starts at position 10.
    expected: candidate positions expected from this single read.
  """
  uniform_quals = [64] * len(bases)
  read = test_utils.make_read(
      bases, start=10, cigar=cigar, quals=uniform_quals)
  self.assertCandidatesFromReadsEquals(reads=[read], expected=expected)
def test_encode_read_matches(self):
  """Encodes a 5M read against 'ACAGT' and checks all six channel rows."""
  start = 10
  dv_call = _make_dv_call()
  alt_allele = dv_call.variant.alternate_bases[0]
  read = test_utils.make_read(
      'ACCGT', start=start, cigar='5M', quals=range(10, 15), name='read1')

  # One tuple per channel, one value per read position.
  channel_rows = [
      # Base.
      (250, 30, 30, 180, 100),
      # Base quality.
      (63, 69, 76, 82, 88),
      # Mapping quality.
      (211, 211, 211, 211, 211),
      # Strand channel (forward or reverse)
      (70, 70, 70, 70, 70),
      # Supports alt or not.
      (254, 254, 254, 254, 254),
      # Matches ref or not.
      (50, 50, 254, 50, 50),
  ]
  full_expected = np.dstack(channel_rows).astype(np.uint8)

  encoded_row = _make_encoder().encode_read(dv_call, 'ACAGT', read, start,
                                            alt_allele)
  self.assertImageRowEquals(encoded_row, full_expected)
def test_encode_read_insertion(self):
  """Encodes a read with a 1-base insertion and checks all channel rows."""
  # ref:  AA-CAG
  # read: AAACAG
  start = 2
  read = test_utils.make_read(
      'AAACAG',
      start=start,
      cigar='2M1I3M',
      quals=range(10, 16),
      name='read1')
  dv_call = _make_dv_call()
  alt_allele = dv_call.variant.alternate_bases[0]

  # One tuple per channel, one value per reference position.
  channel_rows = [
      # Base.
      (250, 0, 30, 250, 180),
      # Base quality.
      (63, 76, 82, 88, 95),
      # Mapping quality.
      (211, 211, 211, 211, 211),
      # Strand channel (forward or reverse)
      (70, 70, 70, 70, 70),
      # Supports alt or not.
      (254, 254, 254, 254, 254),
      # Matches ref or not.
      (50, 254, 50, 50, 50),
  ]
  full_expected = np.dstack(channel_rows).astype(np.uint8)

  encoded_row = _make_encoder().encode_read(dv_call, 'AACAG', read, start,
                                            alt_allele)
  self.assertImageRowEquals(encoded_row, full_expected)
def test_encode_read_deletion(self):
  """Encodes a read with a 2-base deletion and checks all channel rows."""
  # ref:  AACAG
  # read: AA--G
  start = 2
  read = test_utils.make_read(
      'AAG', start=start, cigar='2M2D1M', quals=range(10, 13), name='read1')
  dv_call = _make_dv_call()
  alt_allele = dv_call.variant.alternate_bases[0]

  # One tuple per channel, one value per reference position.
  channel_rows = [
      # Base. The second A is 0 because it's the anchor of the deletion.
      (250, 0, 0, 0, 180),
      # Base quality.
      (63, 69, 0, 0, 76),
      # Mapping quality.
      (211, 211, 0, 0, 211),
      # Strand channel (forward or reverse)
      (70, 70, 0, 0, 70),
      # Supports alt or not.
      (254, 254, 0, 0, 254),
      # Matches ref or not.
      (50, 254, 0, 0, 50),
  ]
  full_expected = np.dstack(channel_rows).astype(np.uint8)

  encoded_row = _make_encoder().encode_read(dv_call, 'AACAG', read, start,
                                            alt_allele)
  self.assertImageRowEquals(encoded_row, full_expected)
def test_read_support_is_respected(self, read_name, read_number, alt_allele,
                                   read_base, supports_alt):
  """Checks the supports_alt value (5th of the six values compared below)."""
  variant = variants_pb2.Variant(
      reference_name='chr1',
      start=10,
      end=11,
      reference_bases='A',
      alternate_bases=[alt_allele])
  allele_support = {
      'C': _supporting_reads('read1/1', 'read3/2'),
      'G': _supporting_reads('read2/1', 'read2/2'),
  }
  dv_call = deepvariant_pb2.DeepVariantCall(
      variant=variant, allele_support=allele_support)

  read = test_utils.make_read(
      read_base,
      start=dv_call.variant.start,
      cigar='1M',
      quals=[50],
      name=read_name)
  read.read_number = read_number

  # The encoded window starts one base before the variant, so the read's
  # single base lands in column 1 of the row.
  window_start = dv_call.variant.start - 1
  actual = _make_encoder().encode_read(dv_call, 'TAT', read, window_start,
                                       alt_allele)

  expected_base_values = {'C': 30, 'G': 180}
  # Indexed by supports_alt: 152 when falsy (0), 254 when truthy (1).
  expected_supports_alt_channel = [152, 254]
  expected = [
      expected_base_values[read_base],
      254,
      211,
      70,
      expected_supports_alt_channel[supports_alt],
      254,
  ]
  self.assertEqual(list(actual[0, 1]), expected)
def test_read_support_is_respected(self, read_name, read_number, alt_allele,
                                   read_base, supports_alt):
  """Checks the supports_alt value (5th of the six values compared below)."""
  variant = variants_pb2.Variant(
      reference_name='chr1',
      start=10,
      end=11,
      reference_bases='A',
      alternate_bases=[alt_allele])
  allele_support = {
      'C': _supporting_reads('read1/1', 'read3/2'),
      'G': _supporting_reads('read2/1', 'read2/2'),
  }
  dv_call = deepvariant_pb2.DeepVariantCall(
      variant=variant, allele_support=allele_support)

  read = test_utils.make_read(
      read_base,
      start=dv_call.variant.start,
      cigar='1M',
      quals=[50],
      name=read_name)
  read.read_number = read_number

  # The encoded window starts one base before the variant, so the read's
  # single base lands in column 1 of the row.
  window_start = dv_call.variant.start - 1
  actual = _make_encoder().encode_read(dv_call, 'TAT', read, window_start,
                                       alt_allele)

  expected_base_values = {'C': 30, 'G': 180}
  # Indexed by supports_alt: 152 when falsy (0), 254 when truthy (1).
  expected_supports_alt_channel = [152, 254]
  expected = [
      expected_base_values[read_base],
      254,
      211,
      70,
      expected_supports_alt_channel[supports_alt],
      254,
  ]
  self.assertEqual(list(actual[0, 1]), expected)
def test_encode_read_insertion(self):
  """Encodes a read with a 1-base insertion and checks all channel rows."""
  # ref:  AA-CAG
  # read: AAACAG
  start = 2
  read = test_utils.make_read(
      'AAACAG',
      start=start,
      cigar='2M1I3M',
      quals=range(10, 16),
      name='read1')
  dv_call = _make_dv_call()
  alt_allele = dv_call.variant.alternate_bases[0]

  # One tuple per channel, one value per reference position.
  channel_rows = [
      # Base.
      (250, 0, 30, 250, 180),
      # Base quality.
      (63, 76, 82, 88, 95),
      # Mapping quality.
      (211, 211, 211, 211, 211),
      # Strand channel (forward or reverse)
      (70, 70, 70, 70, 70),
      # Supports alt or not.
      (254, 254, 254, 254, 254),
      # Matches ref or not.
      (50, 254, 50, 50, 50),
  ]
  full_expected = np.dstack(channel_rows).astype(np.uint8)

  encoded_row = _make_encoder().encode_read(dv_call, 'AACAG', read, start,
                                            alt_allele)
  self.assertImageRowEquals(encoded_row, full_expected)
def test_pruning_1(self):
  """Test that pruning removes a path traced by only one read."""
  ref_str = 'GATTACA'
  read_str = 'GATGACA'
  num_bases = len(read_str)
  single_read = test_utils.make_read(
      read_str,
      chrom='chr20',
      start=1,
      cigar=[(num_bases, 'M')],
      quals=[30] * num_bases,
      name='read')
  dbg = debruijn_graph.build(ref_str, [single_read],
                             self.single_k_dbg_options(3))
  # The expected graph holds only the 3-mers of ref_str.
  expected_graph = """\
      digraph G {
      0[label=GAT];
      1[label=ATT];
      2[label=TTA];
      3[label=TAC];
      4[label=ACA];
      0->1 [label=1 color=red];
      1->2 [label=1 color=red];
      2->3 [label=1 color=red];
      3->4 [label=1 color=red];
      }
      """
  self.assertGraphEqual(expected_graph, dbg)
def test_pruning_2(self):
  """Test that pruning removes edges not between source and sink."""
  ref_str = 'GATTACA'
  read_str = 'CCGATGACACC'
  num_bases = len(read_str)
  read = test_utils.make_read(
      read_str,
      chrom='chr20',
      start=1,
      cigar=[(num_bases, 'M')],
      quals=[30] * num_bases,
      name='read')
  # Use two reads so read path doesn't get pruned.
  duplicated_reads = [read, read]
  dbg = debruijn_graph.build(ref_str, duplicated_reads,
                             self.single_k_dbg_options(3))
  expected_graph = """\
      digraph G {
      0[label=GAT];
      1[label=ATT];
      2[label=TTA];
      3[label=TAC];
      4[label=ACA];
      5[label=ATG];
      6[label=TGA];
      7[label=GAC];
      0->1 [label=1 color=red];
      1->2 [label=1 color=red];
      2->3 [label=1 color=red];
      3->4 [label=1 color=red];
      0->5 [label=2];
      5->6 [label=2];
      6->7 [label=2];
      7->4 [label=2];
      }
      """
  self.assertGraphEqual(expected_graph, dbg)
def test_filtering_by_qual(self):
  """Test that we filter out edges containing low-quality basecalls."""
  ref_str = 'GATTACA'
  read_str = 'GATGTACA'
  # Only the fourth base (index 3) carries a low quality of 1.
  base_quals = [30, 30, 30, 1, 30, 30, 30, 30]
  read = test_utils.make_read(
      read_str,
      chrom='chr20',
      start=1,
      cigar=[(len(read_str), 'M')],
      quals=base_quals,
      name='read')
  # Use two reads so read path doesn't get pruned.
  duplicated_reads = [read, read]
  dbg = debruijn_graph.build(ref_str, duplicated_reads,
                             self.single_k_dbg_options(2))
  expected_graph = """\
      digraph G {
      0[label=GA];
      1[label=AT];
      2[label=TT];
      3[label=TA];
      4[label=AC];
      5[label=CA];
      0->1 [label=3 color=red];
      1->2 [label=1 color=red];
      2->3 [label=1 color=red];
      3->4 [label=3 color=red];
      4->5 [label=3 color=red];
      }
      """
  self.assertGraphEqual(expected_graph, dbg)
def test_pruning_2(self):
  """Test that pruning removes edges not between source and sink."""
  ref_str = 'GATTACA'
  read_str = 'CCGATGACACC'
  num_bases = len(read_str)
  read = test_utils.make_read(
      read_str,
      chrom='chr20',
      start=1,
      cigar=[(num_bases, 'M')],
      quals=[30] * num_bases,
      name='read')
  # Use two reads so read path doesn't get pruned.
  duplicated_reads = [read, read]
  dbg = debruijn_graph.build(ref_str, duplicated_reads,
                             self.single_k_dbg_options(3))
  expected_graph = """\
      digraph G {
      0[label=GAT];
      1[label=ATT];
      2[label=TTA];
      3[label=TAC];
      4[label=ACA];
      5[label=ATG];
      6[label=TGA];
      7[label=GAC];
      0->1 [label=1 color=red];
      1->2 [label=1 color=red];
      2->3 [label=1 color=red];
      3->4 [label=1 color=red];
      0->5 [label=2];
      5->6 [label=2];
      6->7 [label=2];
      7->4 [label=2];
      }
      """
  self.assertGraphEqual(expected_graph, dbg)
def test_trim_read(self, window, cigar, start, read_length, expected_cigar,
                   expected_position, expected_read_length, comment):
  """Trims an all-'A' read to `window` and checks cigar, start and lengths."""
  untrimmed = test_utils.make_read(
      'A' * read_length,
      start=start,
      cigar=cigar,
      quals=[30] * read_length)
  region = ranges.parse_literal(window)
  output = realigner.trim_read(untrimmed, region)

  actual_cigar = cigar_utils.format_cigar_units(output.alignment.cigar)
  self.assertEqual(
      expected_cigar,
      actual_cigar,
      msg='Wrong cigar for case: {}'.format(comment))

  # Start position of the alignment.
  self.assertEqual(
      output.alignment.position.position,
      expected_position,
      msg='Wrong position for case: {}'.format(comment))

  # Read sequence.
  self.assertLen(
      output.aligned_sequence,
      expected_read_length,
      msg='Wrong length of aligned_sequence for case: {}'.format(comment))

  # Base quality scores.
  self.assertLen(
      output.aligned_quality,
      expected_read_length,
      msg='Wrong length of aligned_quality for case: {}'.format(comment))
def test_realign_read(self, read_seq, target_seq, expected_align_start,
                      expected_cigar, comment):
  """Test Aligner.realign_read() against a single target sequence."""
  read_len = len(read_seq)
  raw_read = test_utils.make_read(
      read_seq,
      chrom='ref',
      start=0,
      cigar=[(read_len, 'M')],
      quals=[64] * read_len,
      name='read')
  read = aligner.Read(raw_read)
  test_aligner = self.make_test_aligner(ref_seq=target_seq)
  test_aligner.set_targets([target_seq])
  test_aligner.realign_read(read)

  # NOTE(review): this is a truthiness test, so an expected start of 0 is
  # treated the same as "no alignment expected" -- confirm no case needs 0.
  if not expected_align_start:
    self.assertIsNone(read.target, comment)
    self.assertIsNone(read.target_offset, comment)
    self.assertIsNone(read.alignment, comment)
    return

  self.assertEqual(test_aligner.targets[0], read.target, comment)
  actual_start = read.target_offset + read.alignment.target_begin
  self.assertEqual(expected_align_start, actual_start, comment)
  self.assertEqual(expected_cigar, read.alignment.cigar, comment)
def test_select_windows(self):
  """Simple end-to-end test of the high-level select_windows function."""
  # We give it a few reads with a single candidate at 100 and we expect a
  # window back centered at 100. The reads differ only in base quality.
  reads = [
      test_utils.make_read('AGA', start=99, cigar='3M', quals=[qual] * 3)
      for qual in (64, 63, 62)
  ]
  chrom = reads[0].alignment.position.reference_name
  ref_reader = fasta.InMemoryFastaReader([(chrom, 0, 'A' * 300)])
  region = ranges.make_range(chrom, 0, 200)

  windows = window_selector.select_windows(self.config, ref_reader, reads,
                                           region)
  self.assertEqual(windows, [ranges.make_range(chrom, 96, 104)])
def test_no_bad_soft_clipping(self):
  """Checks that a read with one mismatch is realigned as a full match.

  Currently skipped unconditionally; see the skip message below.
  """
  self.skipTest('Enable when b/63143285 global alignment is fixed')
  common = 'CTA'
  read_seq = common + 'GA'
  ref_seq = 'N' + common + 'CA' + 'N'
  alt_seq = 'A' + ref_seq
  targets = [ref_seq, alt_seq]

  region = ranges.make_range('ref', 0, len(ref_seq))
  test_aligner = self.make_test_aligner(ref_seq, region)

  read_len = len(read_seq)
  read = test_utils.make_read(
      read_seq,
      chrom='ref',
      start=0,
      cigar=[(read_len, 'M')],
      quals=[35] * read_len,
      name='read')
  realigned = test_aligner.align_reads(targets, [read])[0]

  # The preferred realignment is 5M as we'd expect for this read:
  # read_seq: -CTAGA-
  # ref_seq : NCTACAN
  # But the current algorithm produces a local alignment of the read against
  # the haplotypes, and the G <=> C mismatch causes the local aligner to
  # simply skip those bases instead of incurring the mismatch penalty for it,
  # resulting in a 3M2S read (GA clipped off) instead of the better 5M result.
  expected_cigar = [_cigar.to_cigar_unit(read_len, 'M')]
  self.assertEqual(expected_cigar, list(realigned.alignment.cigar))
def test_no_bad_soft_clipping(self):
  """Checks that a read with one mismatch is realigned as a full match.

  Currently skipped unconditionally; see the skip message below.
  """
  self.skipTest('Enable when b/63143285 global alignment is fixed')
  common = 'CTA'
  read_seq = common + 'GA'
  ref_seq = 'N' + common + 'CA' + 'N'
  alt_seq = 'A' + ref_seq
  targets = [ref_seq, alt_seq]

  region = ranges.make_range('ref', 0, len(ref_seq))
  test_aligner = self.make_test_aligner(ref_seq, region)

  read_len = len(read_seq)
  read = test_utils.make_read(
      read_seq,
      chrom='ref',
      start=0,
      cigar=[(read_len, 'M')],
      quals=[35] * read_len,
      name='read')
  realigned = test_aligner.align_reads(targets, [read])[0]

  # The preferred realignment is 5M as we'd expect for this read:
  # read_seq: -CTAGA-
  # ref_seq : NCTACAN
  # But the current algorithm produces a local alignment of the read against
  # the haplotypes, and the G <=> C mismatch causes the local aligner to
  # simply skip those bases instead of incurring the mismatch penalty for it,
  # resulting in a 3M2S read (GA clipped off) instead of the better 5M result.
  expected_cigar = [_cigar.to_cigar_unit(read_len, 'M')]
  self.assertEqual(expected_cigar, list(realigned.alignment.cigar))
def test_make_read(self):
  """Checks that make_read copies each argument into the matching field."""
  bases = 'ACG'
  quals = [30, 40, 50]
  cigar = '3M'
  mapq = 42
  chrom = 'chr10'
  start = 123
  name = 'myname'
  read = test_utils.make_read(
      bases,
      quals=quals,
      cigar=cigar,
      mapq=mapq,
      chrom=chrom,
      start=start,
      name=name)

  self.assertEqual(read.aligned_sequence, bases)
  self.assertEqual(read.aligned_quality, quals)

  # '3M' should parse to a single three-base ALIGNMENT_MATCH unit.
  expected_cigar_units = [
      cigar_pb2.CigarUnit(
          operation_length=3,
          operation=cigar_pb2.CigarUnit.ALIGNMENT_MATCH),
  ]
  self.assertEqual(list(read.alignment.cigar), expected_cigar_units)

  self.assertEqual(read.alignment.mapping_quality, mapq)
  self.assertEqual(read.alignment.position.reference_name, chrom)
  self.assertEqual(read.alignment.position.position, start)
  self.assertEqual(read.fragment_name, name)
def test_candidates_from_reads_respects_mapq(self, read_mapq, min_mapq,
                                             expect_read_to_be_included):
  """Checks that config.min_mapq decides whether a read yields candidates."""
  read = test_utils.make_read(
      'AGA', start=10, cigar='3M', quals=[64] * 3, mapq=read_mapq)
  self.config.min_mapq = min_mapq

  # Position 11 is expected as a candidate only when the read is included.
  if expect_read_to_be_included:
    expected = [11]
  else:
    expected = []
  self.assertCandidatesFromReadsEquals(reads=[read], expected=expected)
def test_realign_read(self, read_seq, target_seq, expected_align_start,
                      expected_cigar, comment):
  """Test Aligner.realign_read() against a single target sequence."""
  read_len = len(read_seq)
  raw_read = test_utils.make_read(
      read_seq,
      chrom='ref',
      start=0,
      cigar=[(read_len, 'M')],
      quals=[64] * read_len,
      name='read')
  read = aligner.Read(raw_read)
  test_aligner = self.make_test_aligner(ref_seq=target_seq)
  test_aligner.set_targets([target_seq])
  test_aligner.realign_read(read)

  # NOTE(review): this is a truthiness test, so an expected start of 0 is
  # treated the same as "no alignment expected" -- confirm no case needs 0.
  if not expected_align_start:
    self.assertIsNone(read.target, comment)
    self.assertIsNone(read.target_offset, comment)
    self.assertIsNone(read.alignment, comment)
    return

  self.assertEqual(test_aligner.targets[0], read.target, comment)
  actual_start = read.target_offset + read.alignment.target_begin
  self.assertEqual(expected_align_start, actual_start, comment)
  self.assertEqual(expected_cigar, read.alignment.cigar, comment)
def test_filtering_by_qual(self):
  """Test that we filter out edges containing low-quality basecalls."""
  ref_str = 'GATTACA'
  read_str = 'GATGTACA'
  # Only the fourth base (index 3) carries a low quality of 1.
  base_quals = [30, 30, 30, 1, 30, 30, 30, 30]
  read = test_utils.make_read(
      read_str,
      chrom='chr20',
      start=1,
      cigar=[(len(read_str), 'M')],
      quals=base_quals,
      name='read')
  # Use two reads so read path doesn't get pruned.
  duplicated_reads = [read, read]
  dbg = debruijn_graph.build(ref_str, duplicated_reads,
                             self.single_k_dbg_options(2))
  expected_graph = """\
      digraph G {
      0[label=GA];
      1[label=AT];
      2[label=TT];
      3[label=TA];
      4[label=AC];
      5[label=CA];
      0->1 [label=3 color=red];
      1->2 [label=1 color=red];
      2->3 [label=1 color=red];
      3->4 [label=3 color=red];
      4->5 [label=3 color=red];
      }
      """
  self.assertGraphEqual(expected_graph, dbg)
def test_encode_read_custom_pileup_read_deletion(self):
  """Encodes a deletion read with the 7-channel custom pileup encoder."""
  pie = _make_encoder(
      custom_pileup_image=True,
      num_channels=7,
      insert_base_char='I',
      delete_base_char='D')
  # ref:  AACAG
  # read: AA--G
  start = 2
  read = test_utils.make_read(
      'AAG', start=start, cigar='2M2D1M', quals=range(10, 13), name='read1')
  dv_call = _make_dv_call()
  alt_allele = dv_call.variant.alternate_bases[0]

  # One tuple per channel, one value per reference position.
  channel_rows = [
      # Base. Fills in the whole deletion with 130, starting at the anchor.
      (250, 130, 130, 130, 180),
      # Base quality.
      (63, 69, 0, 0, 76),
      # Mapping quality.
      (211, 211, 0, 0, 211),
      # Strand channel (forward or reverse)
      (70, 70, 0, 0, 70),
      # Supports alt or not.
      (254, 254, 0, 0, 254),
      # Matches ref or not.
      (50, 254, 0, 0, 50),
      # Operation length.
      (0, 2, 2, 2, 0),
  ]
  full_expected = np.dstack(channel_rows).astype(np.uint8)

  encoded_row = pie.encode_read(dv_call, 'AACAG', read, start, alt_allele)
  self.assertImageRowEquals(encoded_row, full_expected)
def test_adding_edges_with_bad_positions(self, bad_position, dropped_edges):
  """Checks edge labels when one read base has low quality or is an 'N'.

  Args:
    bad_position: index of the read base to spoil, or None to spoil nothing.
    dropped_edges: edges written as 'K1->K2' k-mer pairs that are expected
      with label 1; every other edge is expected with label 3.
  """
  ref_str = 'GATTACA'
  read_str = 'GATTACA'
  # Vertex index of each 2-mer, as used in the expected graph below.
  kmer_order = ['GA', 'AT', 'TT', 'TA', 'AC', 'CA']
  kmer_indices = {kmer: index for index, kmer in enumerate(kmer_order)}

  def kmer_to_index_edge(kmer_edge):
    # Translate 'GA->AT' style edges into '0->1' style edges.
    source, sink = kmer_edge.split('->')
    return '{}->{}'.format(kmer_indices[source], kmer_indices[sink])

  dropped_edges = {kmer_to_index_edge(edge) for edge in dropped_edges}
  all_edges = ['0->1', '1->2', '2->3', '3->4', '4->5']

  for bad_type in ['qual', 'base']:
    bases = list(read_str)
    quals = [30] * len(bases)
    cigar = [(len(bases), 'M')]
    if bad_position is not None:
      if bad_type == 'qual':
        quals[bad_position] = 1
      elif bad_type == 'base':
        bases[bad_position] = 'N'
      else:
        raise ValueError('Unexpected base type')

    read = test_utils.make_read(
        ''.join(bases), start=0, cigar=cigar, quals=quals)

    # Use two reads so read path doesn't get pruned.
    dbg = debruijn_graph.build(ref_str, [read, read],
                               self.single_k_dbg_options(2))

    edge_lines = []
    for edge in all_edges:
      label = 1 if edge in dropped_edges else 3
      edge_lines.append('{} [label={} color=red];'.format(edge, label))
    expected_edges = '\n'.join(edge_lines)

    self.assertGraphEqual(
        """\
        digraph G {
        0[label=GA];
        1[label=AT];
        2[label=TT];
        3[label=TA];
        4[label=AC];
        5[label=CA];
        %s
        }
        """ % expected_edges, dbg)
def test_candidates_from_reads_counts_overlapping_events(self):
  """Checks candidates for a read with a mismatch next to an insertion."""
  # This read has a mismatch at position 2 and a 2 bp insertion at position 4,
  # so we need to double count the candidate positions from the mismatch and
  # insertion at position 2.
  bases = 'AAGACCAAA'
  read = test_utils.make_read(
      bases, start=0, cigar='4M2I3M', quals=[64] * 9)
  self.assertCandidatesFromReadsEquals(reads=[read], expected=[2, 3, 4, 5])
def test_read_end(self):
  """Tests reads have their ends calculated correctly."""
  start = 10000001
  # (cigar, expected end): one read with an insertion, one with a deletion.
  cases = (
      ('2M1I3M', start + 5),
      ('2M16D3M', start + 5 + 16),
  )
  for cigar, expected_end in cases:
    read = test_utils.make_read(
        'AAACAG',
        chrom='chrX',
        start=start,
        cigar=cigar,
        quals=range(10, 16),
        name='read1')
    self.assertEqual(expected_end, utils.read_end(read))
def setUp(self):
  """Builds the image creator, four reads and a mocked encoder."""
  self.alt_allele = 'C'
  self.dv_call = _make_dv_call(ref_bases='G', alt_bases=self.alt_allele)
  self.pic = _make_image_creator(
      None, None, width=3, height=4, reference_band_height=2)
  self.ref = 'AGC'
  self.read1 = test_utils.make_read('AGC', start=0, cigar='3M', name='read1')
  self.read2 = test_utils.make_read('AGC', start=1, cigar='3M', name='read2')
  self.read3 = test_utils.make_read('AGC', start=2, cigar='3M', name='read3')
  self.read4 = test_utils.make_read('AGC', start=3, cigar='3M', name='read4')

  num_channels = pileup_image.DEFAULT_NUM_CHANNEL
  row_shape = (1, 3, num_channels)
  ref_row = np.asarray(range(0, 3 * num_channels), np.uint8)
  self.expected_rows = {
      'ref': ref_row.reshape(1, 3, num_channels),
      'empty': np.zeros(row_shape, dtype=np.uint8),
      'read1': np.full(row_shape, 1, dtype=np.uint8),
      'read2': np.full(row_shape, 2, dtype=np.uint8),
      # The mocked encode_read returns None for read3.
      'read3': None,
      'read4': np.full(row_shape, 3, dtype=np.uint8),
  }

  # pylint: disable=unused-argument
  def get_read_row(dv_call, refbases, read, pos, alt_allele):
    # Rows are looked up purely by the read's fragment name.
    return self.expected_rows[read.fragment_name]

  # Setup our shared mocks.
  mock_encoder = mock.Mock(spec=['encode_read', 'encode_reference'])
  mock_encoder.encode_reference.return_value = self.expected_rows['ref']
  mock_encoder.encode_read.side_effect = get_read_row

  self.mock_enc_ref = mock_encoder.encode_reference
  self.mock_enc_read = mock_encoder.encode_read
  self.pic._encoder = mock_encoder
def test_align_to_haplotype(self, read_seq, prefix, suffix, haplotypes,
                            expected_cigars):
  """Aligns one read to every haplotype and checks each resulting cigar."""
  reads = [test_utils.make_read(read_seq, start=1)]
  # Align to each haplotype in turn.
  for index, haplotype in enumerate(haplotypes):
    aligned_reads = self.reads_realigner.align_to_haplotype(
        haplotype, haplotypes, prefix, suffix, reads, 'test', 1)
    self.assertEqual(len(reads), len(aligned_reads))
    self.assertEqual(_get_cigar(aligned_reads[0]), expected_cigars[index])
def test_realigner_doesnt_create_invalid_intervals(self):
  """Tests that read sets don't result in a crash in reference_fai.cc."""
  # `range(30, 35) * 50` only works on Python 2, where range() returns a
  # list; on Python 3 it raises TypeError. Materialize the list first so the
  # repeated quality pattern is built the same way on both versions.
  quals = list(range(30, 35)) * 50

  read = test_utils.make_read(
      'ACCGT' * 50,
      start=63025520 - 250,
      cigar='250M',
      quals=quals,
      name='read1')
  # The same read object is repeated 20 times.
  reads = [read] * 20
  region = ranges.parse_literal('chr20:63,025,320-63,025,520')
  self.reads_realigner.realign_reads(reads, region)

  # These reads are aligned off the edge of the contig.
  read = test_utils.make_read(
      'TTATA' * 50,
      start=63025520 - 200,
      cigar='200M50S',
      quals=list(range(30, 35)) * 50,
      name='read1')
  reads = [read] * 20
  self.reads_realigner.realign_reads(reads, region)
def test_sw_start_offsets(self):
  """Test Aligner._sw_start_offsets()."""
  k = 3
  # Read and target both mix upper- and lower-case bases.
  raw_read = test_utils.make_read(
      'AaGAt', start=0, cigar=[(5, 'M')], quals=[64] * 5, name='read_1')
  read = aligner.Read(raw_read)
  read.set_read_kmers(k)

  target = aligner.Target('TgATCAGATAAG')
  target.build_target_index(k)

  offsets = aligner._sw_start_offsets(target.kmer_index, read.kmers)
  self.assertEqual([-1, 4, 9], offsets)
def test_sw_start_offsets(self):
  """Test Aligner._sw_start_offsets()."""
  k = 3
  # Read and target both mix upper- and lower-case bases.
  raw_read = test_utils.make_read(
      'AaGAt', start=0, cigar=[(5, 'M')], quals=[64] * 5, name='read_1')
  read = aligner.Read(raw_read)
  read.set_read_kmers(k)

  target = aligner.Target('TgATCAGATAAG')
  target.build_target_index(k)

  offsets = aligner._sw_start_offsets(target.kmer_index, read.kmers)
  self.assertEqual([-1, 4, 9], offsets)
def setUp(self):
  """Creates two reads, a two-contig list and a default SAM header."""
  self.read1 = test_utils.make_read(
      bases='ACCGT',
      chrom='chr1',
      start=10,
      cigar='5M',
      mapq=50,
      quals=range(30, 35),
      name='read1')
  # NOTE(review): the cigar is '7M' but only six bases and six quals are
  # supplied -- confirm this mismatch is intentional.
  self.read2 = test_utils.make_read(
      bases='AACCTT',
      chrom='chr2',
      start=15,
      cigar='7M',
      mapq=40,
      quals=range(20, 26),
      name='read2')
  self.contigs = [
      reference_pb2.ContigInfo(name=contig_name)
      for contig_name in ('chr1', 'chr2')
  ]
  self.header = reads_pb2.SamHeader()
def test_process_read(self):
  """Test WindowSelector.process_read()."""
  selector = WindowSelector(self.test_ws_config())
  ref = 'A' * 100

  # Each case: (bases, cigar, name, expected candidate positions).
  # Every base gets a quality of 64.
  cases = [
      ('AAGA', '4M', 'read_1', [12]),
      ('AAGTA', '2M2I1M', 'read_2', [10, 11, 12, 13]),
      ('AAA', '2M2D1M', 'read_3', [12, 13]),
      ('TGATAC', '2S3M1S', 'read_4', [8, 9, 11, 13]),
      ('AAGA', '2M1X1M', 'read_5', [12]),
  ]

  # All reads are created before any of them is processed.
  reads = [
      test_utils.make_read(
          bases, start=10, cigar=cigar, quals=[64] * len(bases), name=name)
      for bases, cigar, name, _ in cases
  ]
  for read, case in zip(reads, cases):
    expected_positions = case[-1]
    self.assertEqual(
        list(selector.process_read(ref, read)), expected_positions)
def check_overlaps(chr1, start1, end1, chr2, start2, end2, expected):
  """Asserts whether a read overlaps a region, via two equivalent calls.

  The read has `end1 - start1` bases, all matches, starting at `start1` on
  `chr1`; the region is made from `(chr2, start2, end2)`. `self` is not a
  parameter here, so it is taken from the enclosing scope.
  """
  read_len = end1 - start1
  all_match_cigar = '{}M'.format(read_len)
  read = test_utils.make_read(
      'A' * read_len, chrom=chr1, start=start1, cigar=all_match_cigar)
  region = ranges.make_range(chr2, start2, end2)

  overlaps = utils.read_overlaps_region(read, region)
  self.assertEqual(overlaps, expected)

  # This check ensures we get the same result calling ranges.ranges_overlap.
  overlaps_via_ranges = ranges.ranges_overlap(region, utils.read_range(read))
  self.assertEqual(overlaps_via_ranges, expected)
def setUp(self):
  """Populates self.read1, self.read2, self.contigs and self.header."""
  first_read = test_utils.make_read(
      bases='ACCGT',
      chrom='chr1',
      start=10,
      cigar='5M',
      mapq=50,
      quals=range(30, 35),
      name='read1')
  second_read = test_utils.make_read(
      bases='AACCTT',
      chrom='chr2',
      start=15,
      cigar='7M',
      mapq=40,
      quals=range(20, 26),
      name='read2')
  self.read1 = first_read
  self.read2 = second_read

  # Contigs are listed in the same order as the reads' chromosomes.
  chr1_contig = reference_pb2.ContigInfo(name='chr1')
  chr2_contig = reference_pb2.ContigInfo(name='chr2')
  self.contigs = [chr1_contig, chr2_contig]

  self.header = reads_pb2.SamHeader()
def test_read_range(self):
  """Tests reads have their ranges calculated correctly.

  Uses assertEqual rather than the deprecated assertEquals alias, which was
  removed from unittest in Python 3.12.
  """
  start = 10000001

  # 2M1I3M: the expected range spans 5 bases, i.e. the inserted base does not
  # extend it.
  read = test_utils.make_read(
      'AAACAG',
      chrom='chrX',
      start=start,
      cigar='2M1I3M',
      quals=range(10, 16),
      name='read1')
  self.assertEqual(
      ranges.make_range('chrX', start, start + 5), utils.read_range(read))

  # 2M16D3M: the expected range grows by the 16 deleted bases.
  read = test_utils.make_read(
      'AAACAG',
      chrom='chrX',
      start=start,
      cigar='2M16D3M',
      quals=range(10, 16),
      name='read1')
  self.assertEqual(
      ranges.make_range('chrX', start, start + 5 + 16),
      utils.read_range(read))
def test_k_exceeds_read_length(self):
  """Regression test for b/64564513 (k larger than the read length)."""
  reference_bases = 'GATTACATG'
  read_bases = 'GATGACA'
  n_bases = len(read_bases)
  short_read = test_utils.make_read(
      read_bases,
      chrom='chr20',
      start=1,
      cigar=[(n_bases, 'M')],
      quals=[30] * n_bases,
      name='read')

  # If k > read length, no edges will go into the graph from this read.
  # This crashed prior to the bugfix.
  graph = debruijn_graph.build(
      reference_bases, [short_read, short_read], self.single_k_dbg_options(8))
  self.assertIsNotNone(graph)
def test_sanity_check_readalignment(self, ref_name, ref_start, ref_end,
                                    read_chrom, read_start, read_len,
                                    read_cigar, exception_msg):
  """Test Aligner.sanity_check_readalignment().

  Args:
    ref_name: contig name of the reference region.
    ref_start: start of the reference region.
    ref_end: end of the reference region; the reference sequence is
      ref_end - ref_start 'A' bases.
    read_chrom: chromosome the read is placed on.
    read_start: start position of the read.
    read_len: number of 'A' bases (and quality values) in the read.
    read_cigar: CIGAR passed to test_utils.make_read.
    exception_msg: regex the raised ValueError must match, or a falsy value
      when the check is expected to pass without raising.
  """
  region = ranges.make_range(ref_name, ref_start, ref_end)
  ref_seq = 'A' * (ref_end - ref_start)
  align_reads = self.make_test_aligner(ref_seq, region)
  read = test_utils.make_read(
      'A' * read_len,
      chrom=read_chrom,
      start=read_start,
      cigar=read_cigar,
      quals=[64] * read_len,
      name='read')

  if exception_msg:
    # assertRaisesRegexp is a deprecated alias that was removed in Python
    # 3.12. Prefer assertRaisesRegex and only fall back to the old name on
    # test base classes that do not provide the new one.
    assert_raises_regex = (
        getattr(self, 'assertRaisesRegex', None) or self.assertRaisesRegexp)
    with assert_raises_regex(ValueError, exception_msg):
      align_reads.sanity_check_readalignment(read)
  else:
    align_reads.sanity_check_readalignment(read)
def test_encode_read_spans2(self, bases_start, bases_end):
  """Checks encode_read() for a read covering only part of the window.

  Args:
    bases_start: start index of the slice of the base/quality templates that
      forms the read.
    bases_end: end index (exclusive) of that slice.
  """
  all_bases = 'AAAACCGTCCC'
  all_quals = [9, 9, 9, 10, 11, 12, 13, 14, 8, 8, 8]
  bases_start_offset = 7
  window_start = 10
  window_size = 5
  window_end = window_start + window_size

  read_bases = all_bases[bases_start:bases_end]
  read_quals = all_quals[bases_start:bases_end]
  read_start = bases_start_offset + bases_start
  read_end = read_start + len(read_bases)

  # Create our expected image row encoding.
  channel_rows = [
      # Base.
      (250, 30, 30, 180, 100),
      # Base quality.
      (63, 69, 76, 82, 88),
      # Mapping quality.
      (211, 211, 211, 211, 211),
      # Strand channel (forward or reverse)
      (70, 70, 70, 70, 70),
      # Supports alt or not.
      (254, 254, 254, 254, 254),
      # Matches ref or not.
      (50, 50, 254, 50, 50)
  ]
  full_row = np.dstack(channel_rows).astype(np.uint8)

  # Columns the read does not cover stay zero; covered columns are copied
  # from the full-coverage row.
  expected = np.zeros(
      (1, window_size, pileup_image.DEFAULT_NUM_CHANNEL), dtype=np.uint8)
  first_col = max(read_start, window_start) - window_start
  end_col = min(read_end, window_end) - window_start
  if first_col < end_col:
    expected[0, first_col:end_col] = full_row[0, first_col:end_col]

  read = test_utils.make_read(
      read_bases,
      start=read_start,
      cigar='{}M'.format(len(read_bases)),
      quals=read_quals,
      name='read1')
  dv_call = _make_dv_call()
  alt_allele = dv_call.variant.alternate_bases[0]
  encoded = _make_encoder().encode_read(
      dv_call, 'ACAGT', read, window_start, alt_allele)
  self.assertImageRowEquals(encoded, expected)
def test_ignores_reads_with_low_quality_bases(self):
  """Checks encode_read() returns None only for a sub-threshold middle base.

  The three-base read starts at position 1, so its middle base sits at the
  variant's start (2). Only the middle quality is varied across the loop.
  """
  call = deepvariant_pb2.DeepVariantCall(
      variant=variants_pb2.Variant(
          reference_name='chr1',
          start=2,
          end=3,
          reference_bases='A',
          alternate_bases=['C']))
  encoder = _make_encoder()

  # Get the threshold the encoder uses.
  threshold = pileup_image.DEFAULT_MIN_BASE_QUALITY
  for middle_qual in range(threshold + 5):
    read = test_utils.make_read(
        'AAA',
        start=1,
        cigar='3M',
        quals=[threshold - 1, middle_qual, threshold + 1])
    encoded = encoder.encode_read(call, 'AACAG', read, 1, 'C')
    if middle_qual >= threshold:
      check = self.assertIsNotNone
    else:
      check = self.assertIsNone
    check(encoded)
def test_basics(self):
  """Basic example.

  Builds a k=3 graph from a 7-base reference and a read that differs from it
  at one base, then checks the candidate haplotypes and the graph itself.
  """
  ref_str = 'GATTACA'
  read_str = 'GATGACA'
  read = test_utils.make_read(
      read_str,
      chrom='chr20',
      start=1,
      cigar=[(len(read_str), 'M')],
      quals=[30] * len(read_str),
      name='read')
  self.assertEqual(self.single_k_dbg_options(3).min_k, 3)
  # Use two reads so read path doesn't get pruned.
  dbg = debruijn_graph.build(ref_str, [read, read],
                             self.single_k_dbg_options(3))
  self.assertItemsEqual([ref_str, read_str], dbg.candidate_haplotypes())
  # NOTE(review): the expected text is written one statement per line with
  # no leading indentation; confirm this matches how assertGraphEqual
  # compares whitespace.
  self.assertGraphEqual("""\
digraph G {
0[label=GAT];
1[label=ATT];
2[label=TTA];
3[label=TAC];
4[label=ACA];
5[label=ATG];
6[label=TGA];
7[label=GAC];
0->1 [label=1 color=red];
1->2 [label=1 color=red];
2->3 [label=1 color=red];
3->4 [label=1 color=red];
0->5 [label=2];
5->6 [label=2];
6->7 [label=2];
7->4 [label=2];
}
""", dbg)
def test_align_read_with_whole_clippd_seq(self):
  """Test Aligner.align_reads() when the whole read sequence is clipped.

  The read is the inserted sequence flanked by 'CCC' on both sides; the
  test expects align_reads() to hand back the read unchanged.
  """
  ref_seq = ('TTTGTTTGTTTGTGTTTGTGTTTTTGTTTGTTTGTGTTTGTGTTTGTTTGTGGTTTGTGT'
             'GTTTGTGTTTGTGTTGGTTTG')
  test_aligner = self.make_test_aligner(ref_seq)

  # Second target: the reference with the insertion spliced in at its middle.
  inserted = 'AAAAAGTGGGGGGGAAGTGGGGAAAAA'
  midpoint = len(ref_seq) // 2
  ref_with_insertion = ref_seq[:midpoint] + inserted + ref_seq[midpoint:]
  targets = [ref_seq, ref_with_insertion]

  flank = 'CCC'
  read_seq = flank + inserted + flank
  n_bases = len(read_seq)
  read = test_utils.make_read(
      read_seq,
      chrom='ref',
      start=10,
      cigar=[(n_bases, 'M')],
      quals=[64] * n_bases,
      name='read')

  realigned = test_aligner.align_reads(targets, [read])
  self.assertEqual(read, realigned[0],
                   'Read should have its original alignment.')
def test_encode_read_matches(self):
  """Checks the encoded row for a 5M read starting at the window start."""
  window_start = 10
  call = _make_dv_call()
  first_alt = call.variant.alternate_bases[0]
  read = test_utils.make_read(
      'ACCGT',
      start=window_start,
      cigar='5M',
      quals=range(10, 15),
      name='read1')

  # One tuple per channel, one value per read base.
  channel_rows = [
      # Base.
      (250, 30, 30, 180, 100),
      # Base quality.
      (63, 69, 76, 82, 88),
      # Mapping quality.
      (211, 211, 211, 211, 211),
      # Strand channel (forward or reverse)
      (70, 70, 70, 70, 70),
      # Supports alt or not.
      (254, 254, 254, 254, 254),
      # Matches ref or not.
      (50, 50, 254, 50, 50)
  ]
  expected_row = np.dstack(channel_rows).astype(np.uint8)

  encoded = _make_encoder().encode_read(
      call, 'ACAGT', read, window_start, first_alt)
  self.assertImageRowEquals(encoded, expected_row)
def test_encode_read_deletion(self):
  """Checks the encoded row for a read with a 2-base deletion (2M2D1M)."""
  # ref:  AACAG
  # read: AA--G
  window_start = 2
  read = test_utils.make_read(
      'AAG',
      start=window_start,
      cigar='2M2D1M',
      quals=range(10, 13),
      name='read1')
  call = _make_dv_call()
  first_alt = call.variant.alternate_bases[0]

  # One tuple per channel; the two deleted columns are expected to be 0.
  channel_rows = [
      # Base. The second A is 0 because it's the anchor of the deletion.
      (250, 0, 0, 0, 180),
      # Base quality.
      (63, 69, 0, 0, 76),
      # Mapping quality.
      (211, 211, 0, 0, 211),
      # Strand channel (forward or reverse)
      (70, 70, 0, 0, 70),
      # Supports alt or not.
      (254, 254, 0, 0, 254),
      # Matches ref or not.
      (50, 254, 0, 0, 50)
  ]
  expected_row = np.dstack(channel_rows).astype(np.uint8)

  encoded = _make_encoder().encode_read(
      call, 'AACAG', read, window_start, first_alt)
  self.assertImageRowEquals(encoded, expected_row)