def test_read_range(self, update_cached_read_end_first):
  """Checks utils.read_range() on a read with an insertion and one with a deletion.

  Args:
    update_cached_read_end_first: bool. When True, read.cached_end is filled
      in from an uncached utils.read_end() call before read_range() runs.
  """
  start = 10000001
  # (cigar, expected number of reference bases covered by the read).
  cases = [
      ('2M1I3M', 5),
      ('2M16D3M', 5 + 16),
  ]
  for cigar_str, expected_span in cases:
    read = test_utils.make_read(
        'AAACAG',
        chrom='chrX',
        start=start,
        cigar=cigar_str,
        quals=range(10, 16),
        name='read1')
    if update_cached_read_end_first:
      # Explicitly update cached_end.
      read.cached_end = utils.read_end(read, use_cached_read_end=False)
    self.assertEqual(
        ranges.make_range('chrX', start, start + expected_span),
        utils.read_range(read))
def test_candidates_to_windows_min_window_distance(self, distance):
  """Checks how candidates are grouped into windows for a given distance.

  Args:
    distance: int. Value assigned to config.min_windows_distance.
  """
  anchor = 100
  # A candidate far enough below the anchor that it should produce its own
  # window rather than being joined with the anchor.
  isolated = anchor - 2 * distance - 1
  # A candidate exactly `distance` above the anchor; it should be joined with
  # the anchor to produce a single larger window.
  adjacent = anchor + distance
  candidates = [anchor, isolated, adjacent]

  expected = [
      # Window around the isolated candidate (+/- distance for window size).
      ranges.make_range('ref', isolated - distance, isolated + distance),
      # Merged window: anchor - distance up to the adjacent candidate +
      # distance.
      ranges.make_range('ref', anchor - distance, adjacent + distance),
  ]

  self.config.min_windows_distance = distance
  actual = window_selector._candidates_to_windows(self.config, candidates,
                                                  'ref')
  self.assertEqual(actual, expected)
def call_aligner(self, assembled_region):
  """Helper function to call aligner module.

  Builds a reference window covering both the assembled region and the span
  of its reads, padded by _REF_ALIGN_MARGIN on each side and clipped to the
  contig, then realigns the reads against each haplotype embedded in that
  window.

  Args:
    assembled_region: object exposing `reads`, `haplotypes`, `region` and
      `read_span` attributes.

  Returns:
    An empty list when the region has no reads; the original reads when no
    reference suffix can be built after the region; otherwise the result of
    Aligner.align_reads().
  """
  if not assembled_region.reads:
    return []

  region = assembled_region.region
  read_span = assembled_region.read_span
  contig = region.reference_name
  ref_start = max(0,
                  min(read_span.start, region.start) - _REF_ALIGN_MARGIN)
  ref_end = min(
      self.ref_reader.contig(contig).n_bases,
      max(read_span.end, region.end) + _REF_ALIGN_MARGIN)

  # If we can't create the ref suffix then return the original alignments.
  # This is decided before touching the reference so that no query is issued
  # whose result would be thrown away.
  if ref_end <= region.end:
    return assembled_region.reads

  ref_prefix = self.ref_reader.query(
      ranges.make_range(contig, ref_start, region.start))
  ref = self.ref_reader.query(region)
  ref_suffix = self.ref_reader.query(
      ranges.make_range(contig, region.end, ref_end))

  ref_region = ranges.make_range(contig, ref_start, ref_end)
  ref_seq = ref_prefix + ref + ref_suffix
  reads_aligner = aligner.Aligner(self.config.aln_config, ref_region, ref_seq)
  # Each haplotype is padded with the same flanks as the reference window.
  targets = [
      ref_prefix + target + ref_suffix
      for target in assembled_region.haplotypes
  ]
  return reads_aligner.align_reads(targets, assembled_region.reads)
def test_parse_literal_one_bp(self):
  """Single-position literals should parse to one-base ranges."""
  # (literal, expected start, expected end).
  cases = [
      ('1:10', 9, 10),
      ('1:100', 99, 100),
      ('1:1,000', 999, 1000),
  ]
  for literal, start, end in cases:
    self.assertEqual(
        ranges.parse_literal(literal), ranges.make_range('1', start, end))
def test_envelops(self):
  """Exercises RangeSet.envelops() on two ranges of the same contig."""
  first_start, first_end = 5, 10
  second_start, second_end = first_end + 1, first_end + 5
  range_set = ranges.RangeSet([
      ranges.make_range('chr1', first_start, first_end),
      ranges.make_range('chr1', second_start, second_end),
  ])

  # Nothing starting before the first range is enveloped.
  for begin in range(first_start):
    self.assertFalse(range_set.envelops('chr1', begin, first_start + 1))

  # Every sub-interval of a single range is enveloped.
  for begin in range(first_start, first_end):
    for stop in range(begin, first_end + 1):
      self.assertTrue(
          range_set.envelops('chr1', begin, stop),
          'chr1 {} {} not enveloped'.format(begin, stop))

  # An interval bridging the two ranges is not enveloped.
  for begin in range(first_start, first_end):
    for stop in range(second_start, second_end + 1):
      self.assertFalse(range_set.envelops('chr1', begin, stop))

  # A different chromosome is never enveloped.
  self.assertFalse(range_set.envelops('chr2', first_start, first_start + 1))
def test_from_regions_not_empty(self):
  """RangeSet.from_regions() expands a contig name and a 1-based literal."""
  literals = ['chr1', 'chr2:10-20']
  expected = [
      ranges.make_range('chr1', 0, 10),
      ranges.make_range('chr2', 9, 20),
  ]
  # assertCountEqual is the Python 3 name of the order-insensitive
  # comparison previously spelled assertItemsEqual.
  self.assertCountEqual(
      expected,
      ranges.RangeSet.from_regions(literals,
                                   ranges.contigs_dict(_TEST_CONTIGS)))
def test_detector_ranges(self):
  """Checks truthiness, length and per-position overlaps() of a RangeSet."""
  range_set = ranges.RangeSet([
      ranges.make_range('chr1', 0, 5),
      ranges.make_range('chr1', 8, 10),
      ranges.make_range('chr1', 12, 13),
      ranges.make_range('chr2', 2, 5),
  ])
  self.assertEqual(bool(range_set), True)
  self.assertEqual(len(range_set), 4)

  # (contig, position, expected overlaps() result).
  expectations = [
      ('chr1', 0, True),
      ('chr1', 1, True),
      ('chr1', 2, True),
      ('chr1', 3, True),
      ('chr1', 4, True),
      ('chr1', 5, False),
      ('chr1', 6, False),
      ('chr1', 7, False),
      ('chr1', 8, True),
      ('chr1', 9, True),
      ('chr1', 10, False),
      ('chr1', 11, False),
      ('chr1', 12, True),
      ('chr1', 13, False),
      ('chr1', 100, False),
      ('chr1', 1000, False),
      ('chr2', 0, False),
      ('chr2', 1, False),
      ('chr2', 2, True),
      ('chr2', 3, True),
      ('chr2', 4, True),
      ('chr2', 5, False),
      ('chr2', 6, False),
      ('chr3', 3, False),
  ]
  for contig, position, expected in expectations:
    self.assertEqual(range_set.overlaps(contig, position), expected)
def test_partitions(self, interval_size, expected):
  """Partitions three whole contigs and compares against `expected`.

  Args:
    interval_size: int. Maximum partition size passed to partition().
    expected: iterable of argument tuples for ranges.make_range().
  """
  contig_lengths = [('chrM', 100), ('chr1', 76), ('chr2', 121)]
  rangeset = ranges.RangeSet(
      [ranges.make_range(name, 0, length) for name, length in contig_lengths])
  expected_ranges = [ranges.make_range(*args) for args in expected]
  self.assertCountEqual(expected_ranges, rangeset.partition(interval_size))
def test_partition_of_multiple_intervals(self, interval_size, expected):
  """Partitions several disjoint intervals on one contig.

  Args:
    interval_size: int. Maximum partition size passed to partition().
    expected: iterable of argument tuples for ranges.make_range().
  """
  intervals = [(0, 10), (20, 40), (45, 50)]
  rangeset = ranges.RangeSet(
      [ranges.make_range('1', begin, stop) for begin, stop in intervals])
  expected_ranges = [ranges.make_range(*args) for args in expected]
  self.assertCountEqual(expected_ranges, rangeset.partition(interval_size))
def test_query_edge_cases(self):
  """Queries the first base, the last base and the whole of 'ACGT'."""
  reader = fasta.InMemoryRefReader([('1', 0, 'ACGT')])
  # (start, end, expected bases): first base, last base, entire sequence.
  cases = [
      (0, 1, 'A'),
      (3, 4, 'T'),
      (0, 4, 'ACGT'),
  ]
  for start, end, expected_bases in cases:
    self.assertEqual(
        reader.query(ranges.make_range('1', start, end)), expected_bases)
def test_from_bed(self, bed_filename):
  """RangeSet.from_bed() yields the four intervals stored in the test file.

  Args:
    bed_filename: str. Name of the BED file under the core testdata directory.
  """
  source = test_utils.genomics_core_testdata(bed_filename)
  intervals = [
      ('chr1', 1, 10),
      ('chr2', 20, 30),
      ('chr2', 40, 60),
      ('chr3', 80, 90),
  ]
  expected = [ranges.make_range(*interval) for interval in intervals]
  self.assertCountEqual(expected, ranges.RangeSet.from_bed(source))
def test_from_contigs(self):
  """RangeSet.from_contigs() covers each contig from 0 to n_bases."""
  sizes = [('chr1', 10), ('chr2', 5)]
  contigs = [
      reference_pb2.ContigInfo(name=name, n_bases=n_bases)
      for name, n_bases in sizes
  ]
  expected = [ranges.make_range(name, 0, n_bases) for name, n_bases in sizes]
  self.assertCountEqual(expected, ranges.RangeSet.from_contigs(contigs))
def test_find_max_overlapping_returns_least_index(self):
  """Expects index 0 for two search ranges, in either order."""
  query_range = ranges.make_range('1', 0, 10)
  forward = [
      ranges.make_range('1', 0, 5),
      ranges.make_range('1', 5, 10),
  ]
  backward = forward[::-1]
  for to_search in (forward, backward):
    self.assertEqual(0, ranges.find_max_overlapping(query_range, to_search))
def test_partitions_bad_interval_size_raises(self):
  """partition() must raise ValueError mentioning max_size for sizes <= 0."""
  for bad_size in (-10, 0):
    rangeset = ranges.RangeSet([ranges.make_range('chrM', 0, 100)])
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias.
    with self.assertRaisesRegex(ValueError, 'max_size'):
      # list() is necessary to force the generator to execute.
      list(rangeset.partition(bad_size))
def test_from_regions_not_empty(self):
  """RangeSet.from_regions() resolves literals against an explicit contig map."""
  literals = ['chr1', 'chr2:10-20']
  contig_map = {
      'chr1': reference_pb2.ContigInfo(name='chr1', n_bases=10),
      'chr2': reference_pb2.ContigInfo(name='chr2', n_bases=100),
  }
  expected = [
      ranges.make_range('chr1', 0, 10),
      ranges.make_range('chr2', 9, 20),
  ]
  # assertCountEqual is the Python 3 name of the order-insensitive
  # comparison previously spelled assertItemsEqual.
  self.assertCountEqual(expected,
                        ranges.RangeSet.from_regions(literals, contig_map))
def test_dispatching_reader(self):
  """FastaReader should support query() only for the indexed test file."""
  indexed_path = test_utils.genomics_core_testdata('test.fasta')
  with fasta.FastaReader(indexed_path) as reader:
    # The reader is an instance of IndexedFastaReader which supports query().
    bases = reader.query(ranges.make_range('chrM', 1, 6))
    self.assertEqual(bases, 'ATCAC')

  unindexed_path = test_utils.genomics_core_testdata('unindexed.fasta')
  with fasta.FastaReader(unindexed_path) as reader:
    # The reader is an instance of UnindexedFastaReader which doesn't support
    # query().
    with self.assertRaises(NotImplementedError):
      reader.query(ranges.make_range('chrM', 1, 5))
def test_bed_parser(self):
  """bed_parser() returns one range per BED line, in file order."""
  intervals = [
      ('chr20', 61724611, 61725646),
      ('chr20', 61304163, 61305182),
      ('chr20', 61286467, 61286789),
  ]
  # One tab-separated "contig<TAB>start<TAB>end" line per interval.
  contents = '\n'.join('{}\t{}\t{}'.format(*interval) for interval in intervals)
  test_bed_path = test_utils.test_tmpfile('test_bed_parser.bed', contents)
  parsed = list(ranges.bed_parser(test_bed_path))
  self.assertEqual(parsed,
                   [ranges.make_range(*interval) for interval in intervals])
def test_bedpe_parser_skips_cross_chr_events(self):
  """parse_lines(..., 'bedpe') drops the record whose two ends differ in contig."""
  # pylint: disable=line-too-long
  cross_chromosome = 'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION'
  same_chromosome = [
      'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
      'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
  ]
  # pylint: enable=line-too-long
  data = [cross_chromosome] + same_chromosome
  parsed = list(ranges.parse_lines(data, 'bedpe'))
  expected = [
      ranges.make_range('chr20', 25972820, 26045538),
      ranges.make_range('chr20', 23719873, 23796523),
  ]
  self.assertEqual(parsed, expected)
def setUp(self):
  """Creates the GFF writer under test and loads the expected file content."""
  out_fname = test_utils.test_tmpfile('output.gff')
  self.writer = gff_writer.GffWriter.to_file(out_fname, gff_pb2.GffHeader(),
                                             gff_pb2.GffWriterOptions())
  expected_path = test_utils.genomics_core_testdata('test_features.gff')
  # Read through a context manager so the file handle is closed right away
  # instead of being left for the garbage collector.
  with open(expected_path) as expected_file:
    self.expected_gff_content = expected_file.readlines()
  self.header = gff_pb2.GffHeader(
      sequence_regions=[ranges.make_range('ctg123', 0, 1497228)])
  self.record = gff_pb2.GffRecord(
      range=ranges.make_range('ctg123', 1000, 1100))
def test_bed_parser(self):
  """parse_lines(..., 'bed') returns one range per BED line, in input order."""
  intervals = [
      ('chr20', 61724611, 61725646),
      ('chr20', 61304163, 61305182),
      ('chr20', 61286467, 61286789),
  ]
  # One tab-separated "contig<TAB>start<TAB>end" line per interval.
  data = ['{}\t{}\t{}'.format(*interval) for interval in intervals]
  parsed = list(ranges.parse_lines(data, 'bed'))
  self.assertEqual(parsed,
                   [ranges.make_range(*interval) for interval in intervals])
def test_variant_position_and_range(self):
  """Checks variant_position, variant_range and variant_range_tuple."""
  snp = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=10)
  deletion = test_utils.make_variant(
      chrom='1', alleles=['AGCT', 'C'], start=10)
  single_base = ranges.make_range('1', 10, 11)
  four_bases = ranges.make_range('1', 10, 14)

  # The position is the single start base regardless of the reference allele.
  for variant in (snp, deletion):
    self.assertEqual(single_base, variant_utils.variant_position(variant))

  self.assertEqual(single_base, variant_utils.variant_range(snp))
  self.assertEqual(four_bases, variant_utils.variant_range(deletion))
  self.assertEqual(('1', 10, 11), variant_utils.variant_range_tuple(snp))
  self.assertEqual(('1', 10, 14), variant_utils.variant_range_tuple(deletion))
def test_expand_raises_with_missing_contig_in_map(self):
  """expand() raises KeyError when contig '1' is absent from contig_map."""
  contig_maps = [
      # Empty contig_map should raise.
      {},
      # Missing '1' from the contig map should raise.
      {'2': reference_pb2.ContigInfo(name='2', n_bases=50)},
  ]
  for contig_map in contig_maps:
    with self.assertRaises(KeyError):
      ranges.expand(
          ranges.make_range('1', 10, 20), 1, contig_map=contig_map)
def test_bedpe_parser_skips_cross_chr_events(self):
  """bedpe_parser() drops the record whose two ends differ in contig."""
  # pylint: disable=line-too-long
  records = [
      'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION',
      'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
      'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
  ]
  # pylint: enable=line-too-long
  test_bedpe_path = test_utils.test_tmpfile('test_bedpe_parser2.bedpe',
                                            '\n'.join(records))
  parsed = list(ranges.bedpe_parser(test_bedpe_path))
  # Only the two chr20/chr20 records are expected, in file order.
  expected = [
      ranges.make_range('chr20', 25972820, 26045538),
      ranges.make_range('chr20', 23719873, 23796523),
  ]
  self.assertEqual(parsed, expected)
def test_find_max_overlapping_allows_unordered_search_ranges(self):
  """The first search range must be chosen under every ordering of the list."""
  query_range = ranges.make_range('1', 4, 12)
  search_ranges = [
      ranges.make_range('1', 0, 10),
      ranges.make_range('1', 10, 20),
      ranges.make_range('1', 12, 20),
  ]
  best = search_ranges[0]
  for ordering in itertools.permutations(search_ranges):
    # Wherever `best` lands in this ordering is the index we expect back.
    expected_index = ordering.index(best)
    self.assertEqual(expected_index,
                     ranges.find_max_overlapping(query_range, ordering))
def read_span(self):
  """Returns the range spanned by all reads, computing it on first use.

  The result is stored in self._read_span. When there are no reads nothing is
  computed and the current value of self._read_span is returned as is.
  """
  if self._read_span is not None or not self.reads:
    return self._read_span

  read_ranges = [utils.read_range(read) for read in self.reads]
  leftmost = min(span.start for span in read_ranges)
  rightmost = max(span.end for span in read_ranges)
  # The contig name is taken from the first read's range.
  self._read_span = ranges.make_range(read_ranges[0].reference_name, leftmost,
                                      rightmost)
  return self._read_span
def test_no_bad_soft_clipping(self):
  """Expects a read with a single mismatch to realign as one full-length match.

  Currently skipped unconditionally; see the skip message.
  """
  self.skipTest('Enable when b/63143285 global alignment is fixed')
  shared = 'CTA'
  read_seq = shared + 'GA'
  ref_seq = 'N' + shared + 'CA' + 'N'
  targets = [ref_seq, 'A' + ref_seq]
  n_read_bases = len(read_seq)

  region = ranges.make_range('ref', 0, len(ref_seq))
  align_reads = self.make_test_aligner(ref_seq, region)
  read = test_utils.make_read(
      read_seq,
      chrom='ref',
      start=0,
      cigar=[(n_read_bases, 'M')],
      quals=[35] * n_read_bases,
      name='read')
  realigned = align_reads.align_reads(targets, [read])[0]

  # We'd expect a 5M alignment for this read:
  #   read_seq: -CTAGA-
  #   ref_seq : NCTACAN
  # But the current algorithm produces a local alignment of the read against
  # the haplotypes, and the G <=> C mismatch causes the local aligner to
  # simply skip those bases instead of incurring the mismatch penalty for it,
  # resulting in a 3M2S read (GA clipped off) instead of the better 5M result.
  expected_cigar = [_cigar.to_cigar_unit(n_read_bases, 'M')]
  self.assertEqual(expected_cigar, list(realigned.alignment.cigar))
def test_call_from_allele_counter(self):
  """Runs the caller over real chr20 reads and checks the candidate types."""
  size = 1000
  region = ranges.make_range('chr20', 10000000, 10000000 + size)
  ref = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
  sam_reader = sam.SamReader(testdata.CHR20_BAM)

  counter_options = deepvariant_pb2.AlleleCounterOptions(partition_size=size)
  allele_counter = _allelecounter.AlleleCounter(ref.c_reader, region,
                                                counter_options)
  caller_options = deepvariant_pb2.VariantCallerOptions(
      min_count_snps=2,
      min_count_indels=2,
      min_fraction_snps=0.12,
      min_fraction_indels=0.12,
      sample_name='sample_name',
      p_error=0.001,
      max_gq=50,
      gq_resolution=1,
      ploidy=2)
  caller = variant_calling.VariantCaller(caller_options)

  # Feed every read of the region into the allele counter.
  reads = list(sam_reader.query(region))
  self.assertNotEmpty(reads)
  for read in reads:
    allele_counter.add(read)

  # Candidates for the whole region: there must be some, and each one must be
  # a DeepVariantCall.
  candidates = caller.calls_from_allele_counter(allele_counter)
  self.assertNotEmpty(candidates)
  for candidate in candidates:
    self.assertIsInstance(candidate, deepvariant_pb2.DeepVariantCall)
def setUp(self):
  """Loads the GffRecord fixtures and builds the header used by the tests."""
  tfrecord_file = test_utils.genomics_core_testdata(
      'test_features.gff.tfrecord')
  record_iter = io_utils.read_tfrecords(tfrecord_file, proto=gff_pb2.GffRecord)
  self.records = list(record_iter)
  sequence_region = ranges.make_range('ctg123', 0, 1497228)
  self.header = gff_pb2.GffHeader(sequence_regions=[sequence_region])
def test_overlaps_variant_with_ranges(self):
  """variant_overlaps() should delegate to overlaps(contig, variant.start)."""
  range_set = ranges.RangeSet([ranges.make_range('chr1', 0, 5)])
  variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11)
  with mock.patch.object(range_set, 'overlaps') as mock_overlaps:
    # Force a True answer so the result can only come from the mocked call.
    mock_overlaps.return_value = True
    result = range_set.variant_overlaps(variant)
    self.assertEqual(result, True)
    mock_overlaps.assert_called_once_with('chr2', 10)
def test_good_query(self):
  """The in-memory reader must agree with the FASTA reader on every sub-range."""
  for contig in self.fasta_reader.header.contigs:
    n_bases = contig.n_bases
    for start in range(n_bases):
      for end in range(start, n_bases):
        region = ranges.make_range(contig.name, start, end)
        from_memory = self.in_mem.query(region)
        from_fasta = self.fasta_reader.query(region)
        self.assertEqual(from_memory, from_fasta)
def test_good_query(self):
  """Compares in-memory and FASTA query results over all start <= end pairs."""
  # Lazily enumerate every (start, end) range with start <= end < n_bases.
  regions = (
      ranges.make_range(contig.name, start, end)
      for contig in self.fasta_reader.header.contigs
      for start in range(contig.n_bases)
      for end in range(start, contig.n_bases))
  for region in regions:
    self.assertEqual(
        self.in_mem.query(region), self.fasta_reader.query(region))
def test_label_variants(self,
                        candidate,
                        expected_confident,
                        expected_truth,
                        expected_label=None,
                        variant_alt_alleles_indices=None):
  """Checks _match() and label_variants() for a single candidate.

  Args:
    candidate: variant passed to the labeler.
    expected_confident: expected confidence flag for the candidate.
    expected_truth: expected truth variant returned by _match(), or None.
    expected_label: expected class label. When None and expected_truth is
      given, it is looked up from the class string stored in the truth
      variant's info field.
    variant_alt_alleles_indices: alt allele indices passed to
      label_for_alt_alleles(). Defaults to [0].
  """
  label_cls = customized_classes_labeler.CustomizedClassesVariantLabel
  if variant_alt_alleles_indices is None:
    variant_alt_alleles_indices = [0]

  confident = ranges.RangeSet(
      [ranges.make_range(self.snp_class1.reference_name, 10, 100)])
  labeler = self._make_labeler(self.variants, confident)

  # Call _match so we can compare our expected truth with the actual one.
  is_confident, truth_variant = labeler._match(candidate)
  self.assertEqual(expected_truth, truth_variant)
  self.assertEqual(is_confident, expected_confident)

  # Now call label_variants to exercise the higher-level API.
  classes_dict = label_cls.classes_dict
  if expected_label is None and expected_truth is not None:
    info_values = expected_truth.info[label_cls.info_field_name].values
    expected_label = classes_dict[info_values[0].string_value]

  labels = list(labeler.label_variants([candidate]))
  self.assertEqual(len(labels), 1)
  label = labels[0]
  self.assertEqual(candidate, label.variant)
  self.assertEqual(expected_confident, label.is_confident)
  self.assertEqual(expected_label,
                   label.label_for_alt_alleles(variant_alt_alleles_indices))
def make_labeler_ref(self, candidate_variants, true_variants, bufsize=20):
  """Builds the ReferenceRegion covering all candidate and truth variants.

  Args:
    candidate_variants: list of variants with reference_name, start and end.
    true_variants: list of variants with reference_name, start and end.
    bufsize: int. Number of extra bases queried past the largest variant end.

  Returns:
    A ReferenceRegion holding the queried bases, anchored at the start of the
    queried region.
  """
  all_variants = candidate_variants + true_variants
  # All variants are assumed to be on the contig of the first one.
  contig = all_variants[0].reference_name
  start = min(x.start for x in all_variants)
  end = max(x.end for x in all_variants)
  # One base of leading context is requested, but never before position 0:
  # a variant starting at the first base of the contig would otherwise
  # produce a negative range start.
  region = ranges.make_range(contig, max(start - 1, 0), end + bufsize)
  ref_bases = self._ref_reader.query(region)
  return ReferenceRegion(ref_bases, start=region.start)
def test_get_reference_bases_good_region(self):
  """get_reference_bases() validates and queries the expected region."""
  self.dv_call.variant.start = 10
  expected_region = ranges.make_range(self.variant.reference_name, 8, 13)

  bases = self.pic.get_reference_bases(self.variant)

  self.assertEqual('ACGT', bases)
  # Both the validity check and the query must see exactly this region, once.
  for mocked_method in (self.mock_ref_reader.is_valid,
                        self.mock_ref_reader.query):
    mocked_method.assert_called_once_with(expected_region)
def setUpClass(cls):
  """Opens the FASTA test file and mirrors every contig into an in-memory reader."""
  cls.fasta_reader = fasta.RefFastaReader(
      test_utils.genomics_core_testdata('test.fasta'))
  chromosomes = []
  for contig in cls.fasta_reader.header.contigs:
    whole_contig = ranges.make_range(contig.name, 0, contig.n_bases)
    # (name, start offset, bases) triple for the in-memory reader.
    chromosomes.append((contig.name, 0, cls.fasta_reader.query(whole_contig)))
  cls.in_mem = fasta.InMemoryRefReader(chromosomes)
def variant_range(variant):
  """Builds a Range spanning the whole of `variant`.

  Args:
    variant: third_party.nucleus.protos.Variant.

  Returns:
    A new Range whose reference_name, start and end are copied from variant.
  """
  contig, begin, stop = variant.reference_name, variant.start, variant.end
  return ranges.make_range(contig, begin, stop)
def test_non_zero_start_query(self):
  """Checks all of the ways we can construct an InMemoryRefReader."""
  bases = 'ACGTAACCGGTT'
  total = len(bases)
  for start in range(total):
    # The reader only holds the bases from `start` onwards.
    reader = fasta.InMemoryRefReader([('1', start, bases[start:])])
    contig = reader.header.contigs[0]
    self.assertEqual(contig.name, '1')
    self.assertEqual(contig.n_bases, total)

    # Check that our query operation works as expected with a start position.
    for end in range(start, total):
      queried = reader.query(ranges.make_range('1', start, end))
      self.assertEqual(queried, bases[start:end])
def read_range(read):
  """Builds the Range proto covered by the alignment of `read`.

  Args:
    read: the read to calculate range

  Returns:
    A third_party.nucleus.protos.Range for read, starting at the alignment
    position and extending by cigar.alignment_length() of the read's cigar.
  """
  alignment = read.alignment
  position = alignment.position
  begin = position.position
  span_length = cigar.alignment_length(alignment.cigar)
  return ranges.make_range(position.reference_name, begin,
                           begin + span_length)
def test_read_range(self):
  """Tests reads have their ranges calculated correctly."""
  start = 10000001
  # (cigar, expected number of reference bases covered by the read).
  cases = [
      ('2M1I3M', 5),
      ('2M16D3M', 5 + 16),
  ]
  for cigar_str, expected_span in cases:
    read = test_utils.make_read(
        'AAACAG',
        chrom='chrX',
        start=start,
        cigar=cigar_str,
        quals=range(10, 16),
        name='read1')
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        ranges.make_range('chrX', start, start + expected_span),
        utils.read_range(read))
def variant_position(variant):
  """Builds a one-base Range at the start of `variant`.

  Args:
    variant: third_party.nucleus.protos.Variant.

  Returns:
    A new Range with variant's reference_name, starting at variant.start and
    ending at variant.start + 1, i.e. the single basepair at which the variant
    begins (hence "position").
  """
  first_base = variant.start
  return ranges.make_range(variant.reference_name, first_base, first_base + 1)
def test_wrap(self):
  """The wrapped AlleleCounter returns one count per base of the region."""
  size = 100
  region = ranges.make_range('chr20', 10000000, 10000000 + size)
  ref = fasta.RefFastaReader(testdata.CHR20_FASTA)
  sam_reader = sam.SamReader(testdata.CHR20_BAM)
  allele_counter = _allelecounter.AlleleCounter(
      ref.get_c_reader(), region,
      deepvariant_pb2.AlleleCounterOptions(partition_size=size))

  reads = list(sam_reader.query(region))
  self.assertGreater(len(reads), 0)
  for read in reads:
    allele_counter.add(read)

  self.assertEqual(len(allele_counter.counts()), size)
def get_reads(self, variant):
  """Fetches the reads used to build the pileup image around variant.

  The query region is the variant's span widened on both sides by
  self._options.read_overlap_buffer_bp.

  Args:
    variant: A third_party.nucleus.protos.Variant proto describing the variant
      we are creating the pileup image of.

  Returns:
    A list of third_party.nucleus.protos.Read protos.
  """
  buffer_bp = self._options.read_overlap_buffer_bp
  region = ranges.make_range(variant.reference_name,
                             variant.start - buffer_bp,
                             variant.end + buffer_bp)
  return list(self._sam_reader.query(region))
def test_wrap(self, fasta_filename):
  """Exercises the GenomeReferenceFai wrapper on a three-contig test FASTA.

  Args:
    fasta_filename: str. FASTA file name under the core testdata directory;
      its index is expected alongside with a '.fai' suffix.
  """
  expected_names = ['chrM', 'chr1', 'chr2']
  expected_lengths = [100, 76, 121]
  fasta_path = test_utils.genomics_core_testdata(fasta_filename)
  fai_path = test_utils.genomics_core_testdata(fasta_filename + '.fai')
  with reference_fai.GenomeReferenceFai.from_file(fasta_path, fai_path) as ref:
    # Summary properties.
    self.assertEqual(ref.n_contigs, 3)
    self.assertIn(fasta_path, ref.fasta_path)
    self.assertIn('GenomeReference backed by htslib FAI index', str(ref))
    self.assertEqual(ref.contig_names, expected_names)
    self.assertEqual(ref.n_bp, sum(expected_lengths))

    # Base lookup and interval validation.
    self.assertEqual(ref.bases(ranges.make_range('chrM', 1, 10)), 'ATCACAGGT')
    self.assertTrue(ref.is_valid_interval(ranges.make_range('chrM', 1, 10)))
    self.assertFalse(
        ref.is_valid_interval(ranges.make_range('chrM', 1, 100000)))

    # Per-contig metadata.
    self.assertEqual(len(ref.contigs), 3)
    self.assertEqual([c.name for c in ref.contigs], expected_names)
    self.assertEqual([c.n_bases for c in ref.contigs], expected_lengths)
    for contig in ref.contigs:
      name = contig.name
      self.assertEqual(ref.contig(name), contig)
      self.assertTrue(ref.has_contig(name))
      self.assertFalse(ref.has_contig(name + '.unknown'))
def test_make_labeler_ref(self, candidates, truths, expected_start,
                          expected_end, bufsize):
  """make_labeler_ref() should query exactly [expected_start, expected_end).

  Args:
    candidates: candidate variants passed to make_labeler_ref().
    truths: truth variants passed to make_labeler_ref().
    expected_start: int. Expected start of the queried reference region.
    expected_end: int. Expected end of the queried reference region.
    bufsize: int. Buffer size forwarded to make_labeler_ref().
  """
  labeler = _make_labeler()
  expected_bases = 'A' * (expected_end - expected_start)
  query_mock = labeler._ref_reader.query
  query_mock.return_value = expected_bases

  labeler_ref = labeler.make_labeler_ref(candidates, truths, bufsize=bufsize)

  query_mock.assert_called_once_with(
      ranges.make_range('20', expected_start, expected_end))
  self.assertEqual(labeler_ref.start, expected_start)
  self.assertEqual(labeler_ref.end, expected_end)
  self.assertEqual(
      labeler_ref.bases(expected_start, expected_end), expected_bases)
def test_align_reads_simple(self, read_seq, expected_align_pos,
                            expected_cigar, comment):
  """Test Aligner.align_reads(). Simple tests.

  Targets consist of
    - original reference sequence.
    - a sequence with 'AA' insertion at position 14 and
    -   'T' deletion at position 19.

  The read is aligned twice: once with a plain all-match cigar and once
  wrapped in hard clips, which are expected to be kept around the new cigar.

  Args:
    read_seq: str, read sequence.
    expected_align_pos: int, expected aligned position
    expected_cigar: [(int, str)], expected cigar information.
    comment: str, test comment.
  """
  ref_seq = 'AAAAAAAAAAAAATGCATGGGGGATTTTTTTTTTT'
  region = ranges.make_range('ref', 10, 10 + len(ref_seq))
  align_reads = self.make_test_aligner(ref_seq, region)
  targets = [ref_seq, 'AAAAAAAAAAAAATAAGCAGGGGGATTTTTTTTTTT']
  n_read_bases = len(read_seq)

  def realign(cigar):
    """Builds a read with `cigar` and returns its realigned version."""
    read = test_utils.make_read(
        read_seq,
        chrom='ref',
        start=0,
        cigar=cigar,
        quals=[64] * n_read_bases,
        name='read')
    return align_reads.align_reads(targets, [read])[0]

  # Plain read: both the new position and the new cigar are checked.
  aligned = realign([(n_read_bases, 'M')])
  self.assertEqual(expected_align_pos, aligned.alignment.position.position,
                   comment)
  self.assertEqual(
      _cigar.to_cigar_units(expected_cigar), list(aligned.alignment.cigar),
      comment)

  # Hard-clipped read: the clips must surround the expected cigar.
  aligned = realign([(2, 'H'), (n_read_bases, 'M'), (1, 'H')])
  expected_cigar_w_hard_clip = [(2, 'H')] + expected_cigar + [(1, 'H')]
  self.assertEqual(
      _cigar.to_cigar_units(expected_cigar_w_hard_clip),
      list(aligned.alignment.cigar), comment)
def _make_labeler(truth_variants=None, confident_regions=None, **kwargs):
  """Creates a HaplotypeLabeler backed by a mocked reference reader.

  Args:
    truth_variants: optional list of truth variants served through an
      InMemoryVcfReader. Treated as empty when falsy.
    confident_regions: optional RangeSet. When None, one giant region is
      created on the contig of the first truth variant, or on a placeholder
      contig if there are no truth variants.
    **kwargs: forwarded to the HaplotypeLabeler constructor.

  Returns:
    A haplotype_labeler.HaplotypeLabeler.
  """
  if confident_regions is None:
    # Use the reference of the truth variants if possible, otherwise just use
    # a dummy placeholder value for the contig name.
    if truth_variants:
      contig = truth_variants[0].reference_name
    else:
      contig = 'dummy'
    confident_regions = ranges.RangeSet(
        [ranges.make_range(contig, 0, 1000000000)])

  truth_reader = vcf.InMemoryVcfReader(truth_variants or [])
  return haplotype_labeler.HaplotypeLabeler(
      truth_vcf_reader=truth_reader,
      ref_reader=mock.MagicMock(),
      confident_regions=confident_regions,
      **kwargs)
def get_reference_bases(self, variant):
  """Fetches the reference bases backing the pileup image around variant.

  The window starts self.half_width bases before variant.start and is
  self._options.width bases long.

  Args:
    variant: A third_party.nucleus.protos.Variant proto describing the variant
      we are creating the pileup image of.

  Returns:
    A string of reference bases, or None when the reference reader reports
    the window as not valid.
  """
  window_start = variant.start - self.half_width
  region = ranges.make_range(variant.reference_name, window_start,
                             window_start + self._options.width)
  if not self._ref_reader.is_valid(region):
    return None
  return self._ref_reader.query(region)
def test_match_selects_variant_by_start(self):
  """_match() picks the truth variant sharing the candidate's start.

  This must hold even though that truth variant has different alleles and
  another overlapping truth variant has the candidate's alleles.
  """
  # (start, alleles, genotype) of three overlapping truth variants.
  truth_specs = [
      (20, ['CC', 'A'], [1, 1]),
      (21, ['AAA', 'A'], [0, 1]),
      (22, ['AA', 'A'], [1, 1]),
  ]
  overlapping = [
      test_utils.make_variant(start=start, alleles=alleles, gt=gt)
      for start, alleles, gt in truth_specs
  ]
  candidate = test_utils.make_variant(start=21, alleles=['CC', 'A'])

  confident = ranges.RangeSet(
      [ranges.make_range(overlapping[0].reference_name, 0, 100)])
  labeler = self._make_labeler(overlapping, confident)

  is_confident, truth_variant = labeler._match(candidate)
  self.assertEqual(is_confident, True)
  self.assertEqual(truth_variant, overlapping[1])
def test_sanity_check_readalignment(self, ref_name, ref_start, ref_end,
                                    read_chrom, read_start, read_len,
                                    read_cigar, exception_msg):
  """Test Aligner.sanity_check_readalignment().

  Args:
    ref_name: str. Contig name of the aligner's reference region.
    ref_start: int. Start of the aligner's reference region.
    ref_end: int. End of the aligner's reference region.
    read_chrom: str. Contig name of the read.
    read_start: int. Start position of the read.
    read_len: int. Number of bases in the read.
    read_cigar: cigar of the read.
    exception_msg: str or None. When truthy, a ValueError matching this
      pattern is expected; otherwise the check must pass silently.
  """
  region = ranges.make_range(ref_name, ref_start, ref_end)
  ref_seq = 'A' * (ref_end - ref_start)
  align_reads = self.make_test_aligner(ref_seq, region)
  read = test_utils.make_read(
      'A' * read_len,
      chrom=read_chrom,
      start=read_start,
      cigar=read_cigar,
      quals=[64] * read_len,
      name='read')
  if exception_msg:
    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias.
    with self.assertRaisesRegex(ValueError, exception_msg):
      align_reads.sanity_check_readalignment(read)
  else:
    align_reads.sanity_check_readalignment(read)
def test_label_variants(self,
                        candidate,
                        expected_confident,
                        expected_truth,
                        expected_genotype=None):
  """Checks _match() and label_variants() for a single candidate.

  Args:
    candidate: variant passed to the labeler.
    expected_confident: expected confidence flag for the candidate.
    expected_truth: expected truth variant returned by _match(), or None.
    expected_genotype: expected genotype tuple. When None and expected_truth
      is given, the genotype of the truth variant's first call is used.
  """
  confident = ranges.RangeSet(
      [ranges.make_range(self.snp.reference_name, 10, 100)])
  labeler = self._make_labeler(self.variants, confident)

  # Call _match so we can compare our expected truth with the actual one.
  is_confident, truth_variant = labeler._match(candidate)
  self.assertEqual(expected_truth, truth_variant)
  self.assertEqual(is_confident, expected_confident)

  # Now call label_variants to exercise the higher-level API.
  if expected_genotype is None and expected_truth is not None:
    expected_genotype = tuple(expected_truth.calls[0].genotype)
  labels = list(labeler.label_variants([candidate]))
  self.assertEqual(len(labels), 1)
  label = labels[0]
  self.assertEqual(candidate, label.variant)
  self.assertEqual(expected_confident, label.is_confident)
  self.assertEqual(expected_genotype, label.genotype)
def _create_record_from_template(template, start, end, fasta_reader): """Returns a copy of the template variant with the new start and end. Updates to the start position cause a different reference base to be set. Args: template: third_party.nucleus.protos.Variant. The template variant whose non-location and reference base information to use. start: int. The desired new start location. end: int. The desired new end location. fasta_reader: GenomeReferenceFai object. The reader used to determine the correct start base to use for the updated variant. Returns: An updated third_party.nucleus.protos.Variant with the proper start, end, and reference base set and all other fields inherited from the template. """ retval = copy.deepcopy(template) retval.start = start retval.end = end if start != template.start: retval.reference_bases = fasta_reader.query( ranges.make_range(retval.reference_name, start, start + 1)) return retval
def test_call_from_allele_counter(self):
  """Calls candidates from an AlleleCounter filled with real chr20 reads."""
  size = 1000
  region = ranges.make_range('chr20', 10000000, 10000000 + size)
  ref = fasta.RefFastaReader(testdata.CHR20_FASTA)
  sam_reader = sam.SamReader(testdata.CHR20_BAM)

  allele_counter = _allelecounter.AlleleCounter(
      ref.get_c_reader(), region,
      deepvariant_pb2.AlleleCounterOptions(partition_size=size))
  caller_options = deepvariant_pb2.VariantCallerOptions(
      min_count_snps=2,
      min_count_indels=2,
      min_fraction_snps=0.12,
      min_fraction_indels=0.12,
      sample_name='sample_name',
      p_error=0.001,
      max_gq=50,
      gq_resolution=1,
      ploidy=2)
  caller = variant_calling.VariantCaller(caller_options)

  # Grab all of the reads in our region and add them to the allele_counter.
  region_reads = list(sam_reader.query(region))
  self.assertNotEmpty(region_reads)
  for region_read in region_reads:
    allele_counter.add(region_read)

  # The whole region must yield at least one candidate, and every candidate
  # must be a DeepVariantCall.
  candidates = caller.calls_from_allele_counter(allele_counter)
  self.assertNotEmpty(candidates)
  for candidate in candidates:
    self.assertIsInstance(candidate, deepvariant_pb2.DeepVariantCall)
def test_make_ref_reader_default(self, fasta_filename):
  """A RefFastaReader with default options returns the expected chrM bases.

  Args:
    fasta_filename: str. FASTA file name under the core testdata directory.
  """
  fasta_path = test_utils.genomics_core_testdata(fasta_filename)
  with fasta.RefFastaReader(fasta_path) as reader:
    bases = reader.query(ranges.make_range('chrM', 1, 6))
    self.assertEqual(bases, 'ATCAC')
def bases(self, start, end):
  """Returns self.query() for [start, end) on the dummy chromosome name."""
  region = ranges.make_range(self._DUMMY_CHROM_NAME, start, end)
  return self.query(region)
def test_bad_query_with_start(self, start, end):
  """Querying [start, end) on a reader holding 'ACGT' at offset 10 must fail.

  Args:
    start: int. Start of the queried range.
    end: int. End of the queried range.
  """
  reader = fasta.InMemoryRefReader([('1', 10, 'ACGT')])
  # Callable form of assertRaises: the range is built and queried inside it.
  self.assertRaises(
      ValueError, lambda: reader.query(ranges.make_range('1', start, end)))
def test_make_ref_reader_cache_specified(self, fasta_filename):
  """A RefFastaReader built with cache_size=10 returns the expected chrM bases.

  Args:
    fasta_filename: str. FASTA file name under the core testdata directory.
  """
  fasta_path = test_utils.genomics_core_testdata(fasta_filename)
  with fasta.RefFastaReader(fasta_path, cache_size=10) as reader:
    bases = reader.query(ranges.make_range('chrM', 1, 5))
    self.assertEqual(bases, 'ATCA')
def make_test_aligner(self, ref_seq=None, region=None):
  """Builds an Aligner with fixed scoring options for use in tests.

  Args:
    ref_seq: optional str. Reference sequence; 'AAAAAAA' when falsy.
    region: optional range. When falsy, a range on contig 'ref' starting at
      10 and as long as ref_seq is used.

  Returns:
    An aligner.Aligner for the given (or default) region and sequence.
  """
  if not ref_seq:
    ref_seq = 'AAAAAAA'
  if not region:
    region = ranges.make_range('ref', 10, 10 + len(ref_seq))
  config = realigner_pb2.RealignerOptions.AlignerOptions(
      match=1, mismatch=1, gap_open=2, gap_extend=1, k=3, error_rate=.02)
  return aligner.Aligner(config, region, ref_seq)