def test_k_exceeds_ref_length(self):
    """Regression test for b/64564513: k at or past the reference length.

    A k-mer size equal to, or larger than, the reference length is not
    allowed, so `debruijn_graph.build` must return None (it used to crash).
    """
    reference = 'GATTACA'
    # 7 == len(reference); 8 is one past it.
    for kmer_size in (7, 8):
        graph = debruijn_graph.build(
            reference, [], self.single_k_dbg_options(kmer_size))
        self.assertIsNone(graph)
def test_filtering_by_qual(self):
    """Check that edges covering a low-quality basecall are not added."""
    reference = 'GATTACA'
    bases = 'GATGTACA'
    # Only the base at index 3 is given a low quality score.
    base_quals = [30, 30, 30, 1, 30, 30, 30, 30]
    read = test_utils.make_read(
        bases,
        chrom='chr20',
        start=1,
        cigar=[(len(bases), 'M')],
        quals=base_quals,
        name='read')

    expected_graph = """\
        digraph G {
        0[label=GA];
        1[label=AT];
        2[label=TT];
        3[label=TA];
        4[label=AC];
        5[label=CA];
        0->1 [label=3 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=3 color=red];
        4->5 [label=3 color=red];
        }
        """

    # The read is passed twice so that its path is not pruned.
    options = self.single_k_dbg_options(2)
    graph = debruijn_graph.build(reference, [read, read], options)
    self.assertGraphEqual(expected_graph, graph)
def test_pruning_2(self):
    """Check that pruning drops edges that are not between source and sink."""
    reference = 'GATTACA'
    bases = 'CCGATGACACC'
    read = test_utils.make_read(
        bases,
        chrom='chr20',
        start=1,
        cigar=[(len(bases), 'M')],
        quals=[30] * len(bases),
        name='read')

    expected_graph = """\
        digraph G {
        0[label=GAT];
        1[label=ATT];
        2[label=TTA];
        3[label=TAC];
        4[label=ACA];
        5[label=ATG];
        6[label=TGA];
        7[label=GAC];
        0->1 [label=1 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=1 color=red];
        0->5 [label=2];
        5->6 [label=2];
        6->7 [label=2];
        7->4 [label=2];
        }
        """

    # The read is passed twice so that its path is not pruned.
    options = self.single_k_dbg_options(3)
    graph = debruijn_graph.build(reference, [read, read], options)
    self.assertGraphEqual(expected_graph, graph)
def test_pruning_1(self):
    """Check that a path traced by a single read is pruned away."""
    reference = 'GATTACA'
    bases = 'GATGACA'
    read = test_utils.make_read(
        bases,
        chrom='chr20',
        start=1,
        cigar=[(len(bases), 'M')],
        quals=[30] * len(bases),
        name='read')

    # Only the reference path is expected in the resulting graph.
    expected_graph = """\
        digraph G {
        0[label=GAT];
        1[label=ATT];
        2[label=TTA];
        3[label=TAC];
        4[label=ACA];
        0->1 [label=1 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=1 color=red];
        }
        """

    options = self.single_k_dbg_options(3)
    graph = debruijn_graph.build(reference, [read], options)
    self.assertGraphEqual(expected_graph, graph)
def call_debruijn_graph(self, windows, reads):
    """Build a de-Bruijn graph per window and gather candidate haplotypes.

    A window is skipped when its length exceeds
    `self.config.ws_config.max_window_size` or when
    `self.ref_reader.is_valid` rejects it. Graph metrics are logged through
    `self.diagnostic_logger` for every window that is not skipped.

    Args:
      windows: iterable of windows; each one exposes `start` and `end`.
      reads: reads loaded into a `sam.InMemorySamReader` and queried per
        window.

    Returns:
      A list of `realigner_pb2.CandidateHaplotypes`, one per processed window
      whose candidate haplotypes are non-empty and differ from `[ref]`.
    """
    haplotypes_per_window = []
    reader = sam.InMemorySamReader(reads)

    for window in windows:
        too_long = (window.end - window.start >
                    self.config.ws_config.max_window_size)
        if too_long or not self.ref_reader.is_valid(window):
            continue

        ref = self.ref_reader.query(window)
        overlapping = list(reader.query(window))

        with timer.Timer() as build_timer:
            graph = debruijn_graph.build(
                ref, overlapping, self.config.dbg_config)
        elapsed = build_timer.GetDuration()

        # Without a graph, the reference is the only candidate haplotype.
        haplotypes = graph.candidate_haplotypes() if graph else [ref]

        if haplotypes and haplotypes != [ref]:
            haplotypes_per_window.append(
                realigner_pb2.CandidateHaplotypes(
                    span=window, haplotypes=haplotypes))

        self.diagnostic_logger.log_graph_metrics(
            window, graph, haplotypes, elapsed)

    return haplotypes_per_window
def call_debruijn_graph(self, windows, reads):
    """Build a de-Bruijn graph per window and gather candidate haplotypes.

    A window is skipped when its length exceeds
    `self.config.ws_config.max_window_size` or when
    `self.ref_reader.is_valid_interval` rejects it. Graph metrics are logged
    through `self.diagnostic_logger` for every window that is not skipped.

    Args:
      windows: iterable of windows; each one exposes `start` and `end`.
      reads: iterable of reads; it is scanned once per processed window.

    Returns:
      A list of `realigner_pb2.CandidateHaplotypes`, one per processed window
      whose candidate haplotypes are non-empty and differ from `[ref]`.
    """
    haplotypes_per_window = []

    for window in windows:
        too_long = (window.end - window.start >
                    self.config.ws_config.max_window_size)
        if too_long or not self.ref_reader.is_valid_interval(window):
            continue

        ref = self.ref_reader.bases(window)

        # Keep the reads whose range overlaps this window (full scan of
        # `reads` for each window).
        overlapping = []
        for read in reads:
            if ranges.ranges_overlap(window, utils.read_range(read)):
                overlapping.append(read)

        with timer.Timer() as build_timer:
            graph = debruijn_graph.build(
                ref, overlapping, self.config.dbg_config)
        elapsed = build_timer.GetDuration()

        # Without a graph, the reference is the only candidate haplotype.
        haplotypes = graph.candidate_haplotypes() if graph else [ref]

        if haplotypes and haplotypes != [ref]:
            haplotypes_per_window.append(
                realigner_pb2.CandidateHaplotypes(
                    span=window, haplotypes=haplotypes))

        self.diagnostic_logger.log_graph_metrics(
            window, graph, haplotypes, elapsed)

    return haplotypes_per_window
def test_pruning_1(self):
    """Verify that a path supported by only one read gets pruned."""
    reference = 'GATTACA'
    read_bases = 'GATGACA'
    read_kwargs = dict(
        chrom='chr20',
        start=1,
        cigar=[(len(read_bases), 'M')],
        quals=[30] * len(read_bases),
        name='read')
    single_read = test_utils.make_read(read_bases, **read_kwargs)

    graph = debruijn_graph.build(
        reference, [single_read], self.single_k_dbg_options(3))

    # Every expected edge is a reference (red) edge.
    self.assertGraphEqual(
        """\
        digraph G {
        0[label=GAT];
        1[label=ATT];
        2[label=TTA];
        3[label=TAC];
        4[label=ACA];
        0->1 [label=1 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=1 color=red];
        }
        """, graph)
def test_pruning_2(self):
    """Verify that edges lying outside the source-to-sink span are pruned."""
    reference = 'GATTACA'
    read_bases = 'CCGATGACACC'
    read_kwargs = dict(
        chrom='chr20',
        start=1,
        cigar=[(len(read_bases), 'M')],
        quals=[30] * len(read_bases),
        name='read')
    read = test_utils.make_read(read_bases, **read_kwargs)

    # Two copies of the read keep the read path from being pruned.
    graph = debruijn_graph.build(
        reference, [read, read], self.single_k_dbg_options(3))

    self.assertGraphEqual(
        """\
        digraph G {
        0[label=GAT];
        1[label=ATT];
        2[label=TTA];
        3[label=TAC];
        4[label=ACA];
        5[label=ATG];
        6[label=TGA];
        7[label=GAC];
        0->1 [label=1 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=1 color=red];
        0->5 [label=2];
        5->6 [label=2];
        6->7 [label=2];
        7->4 [label=2];
        }
        """, graph)
def test_filtering_by_qual(self):
    """Verify that edges containing low-quality basecalls are filtered out."""
    reference = 'GATTACA'
    read_bases = 'GATGTACA'
    read_kwargs = dict(
        chrom='chr20',
        start=1,
        cigar=[(len(read_bases), 'M')],
        # The fourth base is the only one with a low quality score.
        quals=[30, 30, 30, 1, 30, 30, 30, 30],
        name='read')
    read = test_utils.make_read(read_bases, **read_kwargs)

    # Two copies of the read keep the read path from being pruned.
    graph = debruijn_graph.build(
        reference, [read, read], self.single_k_dbg_options(2))

    self.assertGraphEqual(
        """\
        digraph G {
        0[label=GA];
        1[label=AT];
        2[label=TT];
        3[label=TA];
        4[label=AC];
        5[label=CA];
        0->1 [label=3 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=3 color=red];
        4->5 [label=3 color=red];
        }
        """, graph)
def call_debruijn_graph(self, windows, reads):
    """Run the de-Bruijn graph builder on each window.

    Windows longer than `self.config.ws_config.max_window_size`, and windows
    that `self.ref_reader.is_valid` rejects, are skipped. For each remaining
    window the graph metrics are reported to `self.diagnostic_logger`.

    Args:
      windows: iterable of windows; each one exposes `start` and `end`.
      reads: iterable of reads; it is scanned once per processed window.

    Returns:
      A list of `realigner_pb2.CandidateHaplotypes`, one per processed window
      whose candidate haplotypes are non-empty and differ from `[ref]`.
    """
    collected = []

    for window in windows:
        span_length = window.end - window.start
        if span_length > self.config.ws_config.max_window_size:
            continue
        if not self.ref_reader.is_valid(window):
            continue

        ref = self.ref_reader.query(window)

        # Keep the reads whose range overlaps this window (full scan of
        # `reads` for each window).
        window_reads = []
        for candidate in reads:
            if ranges.ranges_overlap(window, utils.read_range(candidate)):
                window_reads.append(candidate)

        with timer.Timer() as stopwatch:
            graph = debruijn_graph.build(
                ref, window_reads, self.config.dbg_config)
        build_seconds = stopwatch.GetDuration()

        if graph:
            haplotypes = graph.candidate_haplotypes()
        else:
            # No graph was produced: fall back to the reference alone.
            haplotypes = [ref]

        if haplotypes and haplotypes != [ref]:
            collected.append(
                realigner_pb2.CandidateHaplotypes(
                    span=window, haplotypes=haplotypes))

        self.diagnostic_logger.log_graph_metrics(
            window, graph, haplotypes, build_seconds)

    return collected
def test_adding_edges_with_bad_positions(self, bad_position, dropped_edges):
    """Check which edges lose read support when one read position is bad.

    The position is made bad in two ways, each checked separately: by giving
    it a quality of 1, and by replacing its base with 'N'.

    Args:
      bad_position: index into the read to corrupt, or None to leave the read
        untouched.
      dropped_edges: iterable of 'KMER->KMER' strings naming the edges
        expected to carry only the reference count.
    """
    reference = 'GATTACA'
    read_template = 'GATTACA'
    node_ids = {
        'GA': 0,
        'AT': 1,
        'TT': 2,
        'TA': 3,
        'AC': 4,
        'CA': 5,
    }

    def as_index_edge(kmer_edge):
        """Translate 'KMER->KMER' into the matching 'index->index' label."""
        source, target = kmer_edge.split('->')
        return '{}->{}'.format(node_ids[source], node_ids[target])

    dropped = {as_index_edge(kmer_edge) for kmer_edge in dropped_edges}

    # The expectation does not depend on how the position is corrupted, so
    # it is assembled once: dropped edges are labelled 1, all others 3.
    edge_lines = []
    for edge in ['0->1', '1->2', '2->3', '3->4', '4->5']:
        label = 1 if edge in dropped else 3
        edge_lines.append('{} [label={} color=red];'.format(edge, label))
    expected_graph = """\
        digraph G {
        0[label=GA];
        1[label=AT];
        2[label=TT];
        3[label=TA];
        4[label=AC];
        5[label=CA];
        %s
        }
        """ % '\n'.join(edge_lines)

    for bad_type in ['qual', 'base']:
        bases = list(read_template)
        quals = [30] * len(bases)
        cigar = [(len(bases), 'M')]

        if bad_position is not None:
            if bad_type == 'qual':
                quals[bad_position] = 1
            elif bad_type == 'base':
                bases[bad_position] = 'N'
            else:
                raise ValueError('Unexpected base type')

        read = test_utils.make_read(
            ''.join(bases), start=0, cigar=cigar, quals=quals)

        # The read is passed twice so that its path is not pruned.
        graph = debruijn_graph.build(
            reference, [read, read], self.single_k_dbg_options(2))

        self.assertGraphEqual(expected_graph, graph)
def test_straightforward_region(self):
    """With k=30 the only candidate haplotype in this region is the reference."""
    region = ranges.parse_literal('chr20:10,000,000-10,000,100')

    reference = fasta.RefFastaReader(testdata.CHR20_FASTA)
    alignments = sam.SamReader(testdata.CHR20_BAM)
    region_ref = reference.query(region)
    region_reads = list(alignments.query(region))

    graph = debruijn_graph.build(
        region_ref, region_reads, self.single_k_dbg_options(30))

    self.assertIsNotNone(graph)
    self.assertEqual([region_ref], graph.candidate_haplotypes())
def test_straightforward_region(self):
    """With k=30 the only candidate haplotype in this region is the reference."""
    region = ranges.parse_literal('chr20:10,000,000-10,000,100')

    reference = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
    alignments = sam.SamReader(testdata.CHR20_BAM)
    region_ref = reference.query(region)
    region_reads = list(alignments.query(region))

    graph = debruijn_graph.build(
        region_ref, region_reads, self.single_k_dbg_options(30))

    self.assertIsNotNone(graph)
    self.assertEqual([region_ref], graph.candidate_haplotypes())
def test_ref_cycle_detector(self, ref, smallest_good_k):
    """Check that building fails exactly for k below `smallest_good_k`.

    Every k in a band of up to five below and up to five above
    `smallest_good_k` is tried, bounded below by 1 and stopping before
    `len(ref)`.

    Args:
      ref: reference sequence to build the graph from.
      smallest_good_k: smallest k for which the build is expected to succeed.
    """
    first_k = max(smallest_good_k - 5, 1)
    stop_k = min(smallest_good_k + 5, len(ref))

    for kmer_size in range(first_k, stop_k):
        graph = debruijn_graph.build(
            ref, [], self.single_k_dbg_options(kmer_size))
        if kmer_size >= smallest_good_k:
            # A large enough k must produce a real graph instance.
            self.assertIsNotNone(
                graph, 'False cycle detected for k={}'.format(kmer_size))
        else:
            # Too small a k makes the build fail, which is reported as None.
            self.assertIsNone(
                graph, 'Cycle not detected for k={}'.format(kmer_size))
def test_complex_region(self):
    """Check the graph built over a region holding a heterozygous deletion.

    The region "chr20:10,095,379-10,095,500" contains a heterozygous 9 bp
    deletion of a tandem TGA repeat.
    """
    region = ranges.parse_literal('chr20:10,095,379-10,095,500')

    reference = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
    alignments = sam.SamReader(testdata.CHR20_BAM)
    region_ref = reference.query(region)
    region_reads = list(alignments.query(region))

    graph = debruijn_graph.build(region_ref, region_reads, self.dbg_options())

    self.assertIsNotNone(graph)
    self.assertEqual(44, graph.kmer_size)
    # Two haplotypes are expected, and the reference is one of them.
    self.assertEqual(2, len(graph.candidate_haplotypes()))
    self.assertIn(region_ref, graph.candidate_haplotypes())
def test_complex_region(self):
    """Check the graph built over a region holding a heterozygous deletion.

    The region "chr20:10,095,379-10,095,500" contains a heterozygous 9 bp
    deletion of a tandem TGA repeat.
    """
    region = ranges.parse_literal('chr20:10,095,379-10,095,500')

    reference = fasta.RefFastaReader(testdata.CHR20_FASTA)
    alignments = sam.SamReader(testdata.CHR20_BAM)
    region_ref = reference.query(region)
    region_reads = list(alignments.query(region))

    graph = debruijn_graph.build(region_ref, region_reads, self.dbg_options())

    self.assertIsNotNone(graph)
    self.assertEqual(44, graph.kmer_size)
    # Two haplotypes are expected, and the reference is one of them.
    self.assertEqual(2, len(graph.candidate_haplotypes()))
    self.assertIn(region_ref, graph.candidate_haplotypes())
def test_k_exceeds_read_length(self):
    """Regression test for b/64564513: k larger than the read length.

    When k exceeds the read length the read adds no edges to the graph.
    This situation crashed before the bugfix; a graph must now be returned.
    """
    reference = 'GATTACATG'
    read_bases = 'GATGACA'
    read_kwargs = dict(
        chrom='chr20',
        start=1,
        cigar=[(len(read_bases), 'M')],
        quals=[30] * len(read_bases),
        name='read')
    read = test_utils.make_read(read_bases, **read_kwargs)

    # k=8 is longer than the 7-base read but shorter than the reference.
    graph = debruijn_graph.build(
        reference, [read, read], self.single_k_dbg_options(8))
    self.assertIsNotNone(graph)
def test_k_exceeds_read_length(self):
    """Regression test for b/64564513: reads shorter than k.

    A read shorter than k puts no edges into the graph. Building used to
    crash in that case; it must now yield a graph.
    """
    reference = 'GATTACATG'
    bases = 'GATGACA'
    short_read = test_utils.make_read(
        bases,
        chrom='chr20',
        start=1,
        cigar=[(len(bases), 'M')],
        quals=[30] * len(bases),
        name='read')

    # k=8 exceeds the 7-base read while staying below the reference length.
    options = self.single_k_dbg_options(8)
    graph = debruijn_graph.build(reference, [short_read, short_read], options)
    self.assertIsNotNone(graph)
def test_basics(self):
    """Basic example: one reference path plus one read-supported path."""
    reference = 'GATTACA'
    read_bases = 'GATGACA'
    read_kwargs = dict(
        chrom='chr20',
        start=1,
        cigar=[(len(read_bases), 'M')],
        quals=[30] * len(read_bases),
        name='read')
    read = test_utils.make_read(read_bases, **read_kwargs)

    # The options helper must pin min_k to the requested k.
    self.assertEqual(self.single_k_dbg_options(3).min_k, 3)

    # Two copies of the read keep the read path from being pruned.
    graph = debruijn_graph.build(
        reference, [read, read], self.single_k_dbg_options(3))

    self.assertItemsEqual(
        [reference, read_bases], graph.candidate_haplotypes())
    self.assertGraphEqual(
        """\
        digraph G {
        0[label=GAT];
        1[label=ATT];
        2[label=TTA];
        3[label=TAC];
        4[label=ACA];
        5[label=ATG];
        6[label=TGA];
        7[label=GAC];
        0->1 [label=1 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=1 color=red];
        0->5 [label=2];
        5->6 [label=2];
        6->7 [label=2];
        7->4 [label=2];
        }
        """, graph)
def test_basics(self):
    """Basic example: the reference and one alternate read path."""
    reference = 'GATTACA'
    bases = 'GATGACA'
    read = test_utils.make_read(
        bases,
        chrom='chr20',
        start=1,
        cigar=[(len(bases), 'M')],
        quals=[30] * len(bases),
        name='read')

    expected_graph = """\
        digraph G {
        0[label=GAT];
        1[label=ATT];
        2[label=TTA];
        3[label=TAC];
        4[label=ACA];
        5[label=ATG];
        6[label=TGA];
        7[label=GAC];
        0->1 [label=1 color=red];
        1->2 [label=1 color=red];
        2->3 [label=1 color=red];
        3->4 [label=1 color=red];
        0->5 [label=2];
        5->6 [label=2];
        6->7 [label=2];
        7->4 [label=2];
        }
        """

    # The options helper must pin min_k to the requested k.
    self.assertEqual(self.single_k_dbg_options(3).min_k, 3)

    # The read is passed twice so that its path is not pruned.
    options = self.single_k_dbg_options(3)
    graph = debruijn_graph.build(reference, [read, read], options)

    self.assertItemsEqual([reference, bases], graph.candidate_haplotypes())
    self.assertGraphEqual(expected_graph, graph)