def test_query_raises_with_bad_range(self):
  """Querying an unknown contig or a start > end interval raises ValueError."""
  bad_queries = (
      ('XXX:1-10', 'Unknown reference_name'),
      ('chr20:10-5', 'unknown reference interval'),
  )
  with sam_reader.SamReader.from_file(self.bam,
                                      self.indexed_options) as reader:
    for literal, error_regex in bad_queries:
      # parse_literal stays inside the assertion context, as in the original.
      with self.assertRaisesRegexp(ValueError, error_regex):
        reader.query(ranges.parse_literal(literal))
def test_bam_query(self):
  """Checks type and length of the iterable returned by indexed queries."""
  reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
  cases = [
      (ranges.parse_literal('chr20:10,000,000-10,000,100'), 106),
      (ranges.parse_literal('chr20:10,000,000-10,000,000'), 45),
  ]
  with reader:
    for region, read_count in cases:
      with reader.query(region) as reads:
        self.assertIsInstance(reads, clif_postproc.WrappedCppIterable)
        self.assertEqual(test_utils.iterable_len(reads), read_count)
def test_sam_query(self):
  """Checks how many reads each query over test.bam yields."""
  bam_path = test_utils.genomics_core_testdata('test.bam')
  reader = sam.SamReader(bam_path)
  cases = [
      (ranges.parse_literal('chr20:10,000,000-10,000,100'), 106),
      (ranges.parse_literal('chr20:10,000,000-10,000,000'), 45),
  ]
  with reader:
    for region, read_count in cases:
      with reader.query(region) as reads:
        self.assertEqual(test_utils.iterable_len(reads), read_count)
def test_query_raises_with_bad_range(self):
  """Unknown contigs and malformed regions are rejected with ValueError."""
  cases = (
      ('XXX:1-10', 'Unknown reference_name'),
      ('chr1:0-5', 'Malformed region'),
      ('chr1:6-5', 'Malformed region'),
      ('chr1:10-5', 'Malformed region'),
  )
  for literal, error_regex in cases:
    with self.assertRaisesRegexp(ValueError, error_regex):
      self.samples_reader.query(ranges.parse_literal(literal))
def test_query_on_unindexed_reader_raises(self):
  """query() on a reader opened from test_samples.vcf raises ValueError."""
  region = ranges.parse_literal('chr1:10,000,000-10,000,100')
  vcf_path = test_utils.genomics_core_testdata('test_samples.vcf')
  with vcf_reader.VcfReader.from_file(vcf_path, self.options) as reader:
    with self.assertRaisesRegexp(ValueError, 'Cannot query without an index'):
      reader.query(region)
def test_realigner_diagnostics(self, enabled, emit_reads):
  """Checks which diagnostic files the realigner writes for each setting.

  Args:
    enabled: value assigned to config.diagnostics.enabled.
    emit_reads: value assigned to config.diagnostics.emit_realigned_reads.
  """
  dx_dir = test_utils.test_tmpfile('dx_enabled{}_emitreads_{}'.format(
      enabled, emit_reads))

  region_str = 'chr20:10046178-10046188'
  region = ranges.parse_literal(region_str)
  assembled_region_str = 'chr20:10046096-10046267'
  reads, header = _get_reads_and_header(region)
  self.config = realigner.realigner_config(FLAGS)
  self.config.diagnostics.enabled = enabled
  self.config.diagnostics.output_root = dx_dir
  self.config.diagnostics.emit_realigned_reads = emit_reads
  self.reads_realigner = realigner.Realigner(self.config, self.ref_reader,
                                             header)
  _, _ = self.reads_realigner.realign_reads(reads, region)
  # Force close all resources.
  self.reads_realigner.diagnostic_logger.close()

  if not enabled:
    # Make sure our diagnostic output isn't emitted.
    self.assertFalse(tf.io.gfile.exists(dx_dir))
    return

  dx_logger = self.reads_realigner.diagnostic_logger

  # Our root directory exists.
  self.assertTrue(tf.io.gfile.isdir(dx_dir))

  # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
  metrics_file = os.path.join(dx_dir, dx_logger.metrics_filename)
  self.assertTrue(tf.io.gfile.exists(metrics_file))
  with tf.io.gfile.GFile(metrics_file) as fin:
    rows = list(csv.DictReader(fin))
  self.assertLen(rows, 1)
  row = rows[0]
  self.assertEqual(set(row.keys()), {'window', 'k', 'n_haplotypes', 'time'})
  self.assertEqual(row['window'], assembled_region_str)
  self.assertEqual(int(row['k']), 25)
  # assertEqual, not assertTrue: assertTrue(x, 2) treats 2 as the failure
  # message and would pass for any non-zero haplotype count.
  self.assertEqual(int(row['n_haplotypes']), 2)
  # Check that our runtime is reasonable (greater than 0, less than 10 s).
  self.assertTrue(0.0 < float(row['time']) < 10.0)

  # As does the subdirectory for this region.
  region_subdir = os.path.join(dx_dir, assembled_region_str)
  self.assertTrue(tf.io.gfile.isdir(region_subdir))

  # We always have a graph.dot
  self.assertTrue(
      tf.io.gfile.exists(os.path.join(region_subdir,
                                      dx_logger.graph_filename)))

  reads_file = os.path.join(dx_dir, region_str,
                            dx_logger.realigned_reads_filename)
  # if emit_reads=False then file should not exist and vice versa.
  self.assertEqual(emit_reads, tf.io.gfile.exists(reads_file))
def test_ops_on_closed_reader_raise(self):
  """iterate() and query() raise ValueError after the context has exited."""
  closed_reader = sam_reader.SamReader.from_file(self.bam,
                                                 self.indexed_options)
  # Entering and immediately leaving the context leaves the reader closed.
  with closed_reader:
    pass

  with self.assertRaisesRegexp(ValueError, 'Cannot Iterate a closed'):
    closed_reader.iterate()
  with self.assertRaisesRegexp(ValueError, 'Cannot Query a closed'):
    closed_reader.query(ranges.parse_literal('chr20:10,000,000-10,000,100'))
def test_ops_on_closed_reader_raise(self):
  """Operations on a reader whose context has exited raise ValueError."""
  bam_reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
  with bam_reader:
    # Nothing to do here; leaving the block is what closes the reader.
    pass

  with self.assertRaisesRegexp(ValueError, 'Cannot Iterate a closed'):
    bam_reader.iterate()

  with self.assertRaisesRegexp(ValueError, 'Cannot Query a closed'):
    bam_reader.query(ranges.parse_literal('chr20:10,000,000-10,000,100'))
def test_fail_multiple_concurrent_iterations(self):
  """next() on a second query made after draining the first must raise."""
  region = ranges.parse_literal('chr3:100,000-500,000')
  first_query = self.samples_reader.query(region)
  # Consume the first query completely; the records themselves are unused.
  for _ in first_query:
    pass

  second_query = self.samples_reader.query(region)
  with self.assertRaisesRegexp(ValueError, 'No underlying iterable. This '):
    next(second_query)
def test_vcf_query(self):
  """Input and tabix-indexed output VCFs return equal query results."""
  tabix.build_index(self.output_file)
  self.input_reader = vcf.VcfReader(self.input_file)
  self.output_reader = vcf.VcfReader(self.output_file)

  region = ranges.parse_literal('chr3:100,000-500,000')
  from_input = list(self.input_reader.query(region))
  from_output = list(self.output_reader.query(region))
  self.assertEqual(from_input, from_output)
def test_make_examples_with_allele_frequency(self, mode):
  """Runs make_examples with population VCFs and checks the extra channel.

  Args:
    mode: 'one vcf' or 'two vcfs'; selects how FLAGS.population_vcfs is
      populated. Any other value raises ValueError.
  """
  FLAGS.mode = 'calling'
  FLAGS.ref = testdata.GRCH38_FASTA
  FLAGS.reads = testdata.GRCH38_CHR20_AND_21_BAM
  num_shards = 1
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  region = ranges.parse_literal('chr20:61001-62000')
  FLAGS.use_allele_frequency = True
  FLAGS.regions = [ranges.to_literal(region)]

  if mode not in ('one vcf', 'two vcfs'):
    raise ValueError('Invalid mode for parameterized test.')
  if mode == 'one vcf':
    FLAGS.population_vcfs = testdata.AF_VCF_CHR20_AND_21
  else:
    FLAGS.population_vcfs = ' '.join(
        [testdata.AF_VCF_CHR20, testdata.AF_VCF_CHR21])

  options = make_examples.default_options(add_flags=True)
  # Run make_examples with the flags above.
  make_examples_core.make_examples_runner(options)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=False)

  # Pileup images should have one extra channel.
  extra_channel = dv_constants.PILEUP_NUM_CHANNELS
  self.assertEqual([100, 221, extra_channel + 1],
                   decode_example(examples[0])['image/shape'])

  # Test there is something in the added channel.
  # Values capture whether each loci has been seen in the observed examples.
  population_matched_loci = {
      'chr20:61539_A': False,
      'chr20:61634_G': False,
      'chr20:61644_G': False
  }

  for example in examples:
    locus_id = vis.locus_id_from_variant(vis.variant_from_example(example))
    if locus_id not in population_matched_loci:
      continue
    channels = vis.channels_from_example(example)
    self.assertGreater(
        np.sum(channels[dv_constants.PILEUP_NUM_CHANNELS]),
        0,
        msg='There should be '
        'something in the %s-th channel for variant '
        '%s' % (dv_constants.PILEUP_NUM_CHANNELS + 1, locus_id))
    population_matched_loci[locus_id] = True

  self.assertTrue(
      all(population_matched_loci.values()),
      msg='Check that all '
      '3 sample loci appeared in the examples.')

  # Check against the golden file (same for both modes).
  golden_file = _sharded(testdata.GOLDEN_ALLELE_FREQUENCY_EXAMPLES)
  examples_from_golden = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples_from_golden, examples)
def test_query_without_index_raises(self, unindexed_file_name):
  """query() on a reader for the given testdata file raises ValueError.

  Args:
    unindexed_file_name: name of a file under the genomics core testdata.
  """
  reads_path = test_utils.genomics_core_testdata(unindexed_file_name)
  region = ranges.parse_literal('chr20:10,000,000-10,000,100')
  with sam_reader.SamReader.from_file(
      reads_path=reads_path, ref_path='', options=self.options) as reader:
    with self.assertRaisesRegexp(ValueError, 'Cannot query without an index'):
      reader.query(region)
def test_ops_on_closed_reader_raise(self):
  """iterate() and query() raise once self.samples_reader's context exits."""
  # Leaving the (empty) context leaves the reader closed.
  with self.samples_reader:
    pass

  with self.assertRaisesRegexp(ValueError, 'Cannot Iterate a closed'):
    self.samples_reader.iterate()

  with self.assertRaisesRegexp(ValueError, 'Cannot Query a closed'):
    self.samples_reader.query(
        ranges.parse_literal('chr1:10,000,000-10,000,100'))
def test_context_manager(self):
  """Test that we can use context manager to do two queries in sequence."""
  reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
  region = ranges.parse_literal('chr20:10,000,000-10,000,100')
  with reader:
    # First query: opened and closed before the second one starts.
    with reader.query(region) as first:
      self.assertIsNotNone(first)
      self.assertIsInstance(first, clif_postproc.WrappedCppIterable)
    # Second query on the same, still open, reader.
    with reader.query(region) as second:
      self.assertIsNotNone(second)
      self.assertIsInstance(second, clif_postproc.WrappedCppIterable)
def test_straightforward_region(self):
  """A k=30 graph over this region yields only the reference haplotype."""
  ref_reader = fasta.RefFastaReader(testdata.CHR20_FASTA)
  bam_reader = sam.SamReader(testdata.CHR20_BAM)
  region = ranges.parse_literal('chr20:10,000,000-10,000,100')
  ref_seq = ref_reader.query(region)
  reads_in_region = list(bam_reader.query(region))

  options = self.single_k_dbg_options(30)
  graph = debruijn_graph.build(ref_seq, reads_in_region, options)

  self.assertIsNotNone(graph)
  self.assertEqual([ref_seq], graph.candidate_haplotypes())
def test_straightforward_region(self):
  """A k=30 graph over this region yields only the reference haplotype."""
  ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
  bam_reader = sam.SamReader(testdata.CHR20_BAM)
  region = ranges.parse_literal('chr20:10,000,000-10,000,100')
  ref_seq = ref_reader.query(region)
  reads_in_region = list(bam_reader.query(region))

  options = self.single_k_dbg_options(30)
  graph = debruijn_graph.build(ref_seq, reads_in_region, options)

  self.assertIsNotNone(graph)
  self.assertEqual([ref_seq], graph.candidate_haplotypes())
def model_evaluation_runner(truth_variants, reads, ref, input_model_pckl,
                            eval_region, output_report_csv):
  """Outputs precision-recall for a sklearn model using AlleleCount features.

  Loads the pickled model, collects the truth indels in eval_region, computes
  precision and recall per entry of _THRESHOLDS, and writes one CSV row per
  threshold with columns threshold, precision and recall.

  Args:
    truth_variants: path to the VCF.
    reads: path to the reads BAM.
    ref: path to the reference FASTA.
    input_model_pckl: path to read the LogisticRegression pickle from.
    eval_region: str, region to evaluate on in the 'chr:start-end',
      'chr:position' or 'chr' format.
    output_report_csv: path to the output report csv.

  Raises:
    ValueError: if eval_region cannot be parsed.
  """
  sam_reader = sam.SamReader(reads)
  ref_reader = fasta.IndexedFastaReader(ref)

  read_reqs = reads_pb2.ReadRequirements(
      min_base_quality=10,
      min_mapping_quality=10,
      min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
  allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
      partition_size=1, read_requirements=read_reqs)

  # NOTE(review): joblib.load unpickles the file, so only point this at
  # trusted model files.
  model = joblib.load(input_model_pckl)

  with vcf.VcfReader(truth_variants) as truth_reader:
    contig_map = ranges.contigs_dict(ref_reader.header.contigs)
    region = ranges.parse_literal(eval_region, contig_map=contig_map)
    # Materialize the indels so they stay usable once the reader is closed.
    true_indels = list(
        filter(variant_utils.is_indel, truth_reader.query(region)))

  precisions = compute_precision(model, true_indels, sam_reader, ref_reader,
                                 allele_counter_options, _THRESHOLDS, region)
  recalls = compute_effective_recall(model, true_indels, sam_reader,
                                     ref_reader, allele_counter_options,
                                     _THRESHOLDS)

  with tf.gfile.GFile(output_report_csv, 'w') as csvfile:
    writer = csv.DictWriter(
        csvfile, fieldnames=['threshold', 'precision', 'recall'])
    writer.writeheader()
    writer.writerows({
        'threshold': threshold,
        'precision': precisions[threshold],
        'recall': recalls[threshold]
    } for threshold in _THRESHOLDS)
def test_context_manager(self):
  """Test that we can use context manager to do two queries in sequence."""
  region = None  # Assigned below, after the reader is constructed.
  reader = sam_reader.SamReader.from_file(
      reads_path=self.bam, ref_path='', options=self.options)
  region = ranges.parse_literal('chr20:10,000,000-10,000,100')
  with reader:
    # First query: opened and closed before the second one starts.
    with reader.query(region) as first:
      self.assertIsNotNone(first)
      self.assertIsInstance(first, clif_postproc.WrappedCppIterable)
    # Second query on the same, still open, reader.
    with reader.query(region) as second:
      self.assertIsNotNone(second)
      self.assertIsInstance(second, clif_postproc.WrappedCppIterable)
def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
  """Checks the number of reads produced by a downsampling reader.

  Args:
    method: 'iterate' or 'query'; which reader API produces the reads. Any
      other value fails the test.
    maybe_range: region literal passed to ranges.parse_literal; only used
      when method is 'query'.
    fraction: value passed as downsample_fraction to the reader.
    expected_n_reads: expected length of the resulting iterable.
  """
  reader = sam.SamReader(
      test_utils.genomics_core_testdata('test.bam'),
      downsample_fraction=fraction,
      random_seed=12345)
  with reader:
    if method == 'iterate':
      reads_iter = reader.iterate()
    elif method == 'query':
      reads_iter = reader.query(ranges.parse_literal(maybe_range))
    else:
      # fail() accepts a single message argument; passing the method as a
      # second positional argument would raise TypeError instead.
      self.fail('Unexpected method ' + str(method))
    self.assertEqual(test_utils.iterable_len(reads_iter), expected_n_reads)
def test_complex_region(self):
  """Expects k=44 and two haplotypes, one of them the reference."""
  # There is a heterozygous 9 bp deletion of tandem TGA repeat.
  # "chr20:10,095,379-10,095,500"
  ref_reader = fasta.RefFastaReader(testdata.CHR20_FASTA)
  bam_reader = sam.SamReader(testdata.CHR20_BAM)
  region = ranges.parse_literal('chr20:10,095,379-10,095,500')
  ref_seq = ref_reader.query(region)
  reads_in_region = list(bam_reader.query(region))

  graph = debruijn_graph.build(ref_seq, reads_in_region, self.dbg_options())

  self.assertIsNotNone(graph)
  self.assertEqual(44, graph.kmer_size)
  self.assertEqual(2, len(graph.candidate_haplotypes()))
  self.assertIn(ref_seq, graph.candidate_haplotypes())
def test_complex_region(self):
  """Expects k=44 and two haplotypes, one of them the reference."""
  # There is a heterozygous 9 bp deletion of tandem TGA repeat.
  # "chr20:10,095,379-10,095,500"
  ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
  bam_reader = sam.SamReader(testdata.CHR20_BAM)
  region = ranges.parse_literal('chr20:10,095,379-10,095,500')
  ref_seq = ref_reader.query(region)
  reads_in_region = list(bam_reader.query(region))

  graph = debruijn_graph.build(ref_seq, reads_in_region, self.dbg_options())

  self.assertIsNotNone(graph)
  self.assertEqual(44, graph.kmer_size)
  self.assertEqual(2, len(graph.candidate_haplotypes()))
  self.assertIn(ref_seq, graph.candidate_haplotypes())
def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
  """Checks the number of reads produced by a downsampling reader.

  Args:
    method: 'iterate' or 'query'; which reader API produces the reads. Any
      other value fails the test.
    maybe_range: region literal passed to ranges.parse_literal; only used
      when method is 'query'.
    fraction: value passed as downsample_fraction to the reader.
    expected_n_reads: expected length of the resulting iterable.
  """
  bam_path = test_utils.genomics_core_testdata('test.bam')
  reader = sam.SamReader(
      bam_path, downsample_fraction=fraction, random_seed=12345)
  with reader:
    if method not in ('iterate', 'query'):
      self.fail('Unexpected method ' + str(method))
    if method == 'iterate':
      reads_iter = reader.iterate()
    else:
      reads_iter = reader.query(ranges.parse_literal(maybe_range))
    self.assertEqual(test_utils.iterable_len(reads_iter), expected_n_reads)
def setUp(self):
  """Builds a TRAINING-mode RegionProcessor with '_initialize' mocked."""
  self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')
  FLAGS.reads = ''

  options = make_examples.default_options(add_flags=False)
  self.options = options
  options.reference_filename = testdata.CHR20_FASTA
  options.reads_filename = testdata.CHR20_BAM
  options.truth_variants_filename = testdata.TRUTH_VARIANTS_VCF
  options.mode = deepvariant_pb2.DeepVariantOptions.TRAINING

  self.processor = make_examples.RegionProcessor(self.options)
  self.mock_init = self.add_mock('_initialize')
  self.default_shape = [5, 5, 7]
  self.default_format = 'raw'
def test_align_to_all_haplotypes(self, window_width):
  """Checks the outputs of align_to_all_haplotypes for a real deletion.

  Args:
    window_width: value assigned to the mocked pic.width.
  """
  # align_to_all_haplotypes() will pull from the reference, so choose a
  # real variant.
  region = ranges.parse_literal('chr20:10,046,000-10,046,400')
  nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
  nist_variants = list(nist_reader.query(region))
  # We picked this region to have exactly one known variant:
  # reference_bases: "AAGAAAGAAAG"
  # alternate_bases: "A", a deletion of 10 bp
  # start: 10046177
  # end: 10046188
  # reference_name: "chr20"
  variant = nist_variants[0]

  processor = self.processor
  processor.pic = mock.Mock()
  processor.pic.width = window_width
  processor.pic.half_width = int((processor.pic.width - 1) / 2)

  processor.realigner = mock.Mock()
  # Using a real ref_reader to test that the reference allele matches
  # between the variant and the reference at the variant's coordinates.
  processor.realigner.ref_reader = self.ref_reader

  read = test_utils.make_read(
      'A' * 101, start=10046100, cigar='101M', quals=[30] * 101)

  processor.realigner.align_to_haplotype = mock.Mock()
  alt_info = processor.align_to_all_haplotypes(variant, [read])
  hap_alignments = alt_info['alt_alignments']
  hap_sequences = alt_info['alt_sequences']

  # Both outputs are keyed by alt allele.
  self.assertCountEqual(hap_alignments.keys(), ['A'])
  self.assertCountEqual(hap_sequences.keys(), ['A'])

  # Sequence must be the length of the window.
  self.assertLen(hap_sequences['A'], processor.pic.width)

  # align_to_haplotype should be called once for each alt (1 alt here).
  processor.realigner.align_to_haplotype.assert_called_once()

  # If variant reference_bases are wrong, it should raise a ValueError.
  variant.reference_bases = 'G'
  with six.assertRaisesRegex(self, ValueError,
                             'does not match the bases in the reference'):
    processor.align_to_all_haplotypes(variant, [read])
def test_make_examples_end2end_failed_on_cram(self):
  """make_examples_runner raises ValueError for CRAM input with these flags."""
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')

  FLAGS.use_ref_for_cram = False
  FLAGS.write_run_info = True
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_CRAM
  candidates_name = _sharded('failed.vsc.tfrecord')
  FLAGS.candidates = test_utils.test_tmpfile(candidates_name)
  examples_name = _sharded('failed.examples.tfrecord')
  FLAGS.examples = test_utils.test_tmpfile(examples_name)
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = 'calling'
  FLAGS.gvcf_gq_binsize = 5

  options = make_examples.default_options(add_flags=True)
  with six.assertRaisesRegex(self, ValueError,
                             'Failed to parse BAM/CRAM file.'):
    make_examples_core.make_examples_runner(options)
def test_realigner_doesnt_create_invalid_intervals(self):
  """Tests that read sets don't result in a crash in reference_fai.cc."""
  # quals uses list(range(...)) because a bare range object cannot be
  # repeated with `*` on Python 3 (TypeError).
  read = test_utils.make_read(
      'ACCGT' * 50,
      start=63025520 - 250,
      cigar='250M',
      quals=list(range(30, 35)) * 50,
      name='read1')
  reads = [read] * 20
  region = ranges.parse_literal('chr20:63,025,320-63,025,520')
  self.reads_realigner.realign_reads(reads, region)

  # These reads are aligned off the edge of the contig.
  read = test_utils.make_read(
      'TTATA' * 50,
      start=63025520 - 200,
      cigar='200M50S',
      quals=list(range(30, 35)) * 50,
      name='read1')
  reads = [read] * 20
  self.reads_realigner.realign_reads(reads, region)
def test_make_examples_end2end_confirm_downsample_fraction_used(self):
  """Setting downsample_fraction to 0.01 must yield fewer examples."""

  def _run_make_examples(downsample_fraction=None):
    """Runs make_examples, optionally setting FLAGS.downsample_fraction."""
    if downsample_fraction is not None:
      FLAGS.downsample_fraction = downsample_fraction
    options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(options)
    return self.verify_examples(
        FLAGS.examples, region, options, verify_labels=False)

  region = ranges.parse_literal('chr20:10,000,000-10,004,000')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
  FLAGS.mode = 'calling'

  all_examples = _run_make_examples()
  downsampled_examples = _run_make_examples(0.01)
  self.assertLess(len(downsampled_examples), len(all_examples))
def test_catches_bad_flags(self):
  """main() logs an error and exits when confident_regions is empty."""
  # Set all of the requested flag values.
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile('vsc.tfrecord')
  FLAGS.examples = test_utils.test_tmpfile('examples.tfrecord')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = 'training'
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  # This is the bad flag.
  FLAGS.confident_regions = ''

  with mock.patch.object(logging, 'error') as mock_logging:
    with mock.patch.object(sys, 'exit') as mock_exit:
      make_examples.main(['make_examples.py'])

  mock_logging.assert_called_once_with(
      'confident_regions is required when in training mode.')
  mock_exit.assert_called_once_with(errno.ENOENT)
def test_make_examples_with_variant_selection(self,
                                              select_types,
                                              expected_count,
                                              keep_legacy_behavior=False):
  """Checks the candidate count produced for a variant-type selection.

  Args:
    select_types: value for FLAGS.select_variant_types; the flag is left
      untouched when this is None.
    expected_count: expected number of candidate records written.
    keep_legacy_behavior: value for
      FLAGS.keep_legacy_allele_counter_behavior.
  """
  if select_types is not None:
    FLAGS.select_variant_types = select_types

  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
  FLAGS.partition_size = 1000
  FLAGS.mode = 'calling'
  FLAGS.keep_legacy_allele_counter_behavior = keep_legacy_behavior

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)

  written_candidates = list(tfrecord.read_tfrecords(FLAGS.candidates))
  self.assertLen(written_candidates, expected_count)
def test_catches_bad_flags(self):
  """main() logs an error and exits when confident_regions is empty."""
  # Set all of the requested flag values.
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile('vsc.tfrecord')
  FLAGS.examples = test_utils.test_tmpfile('examples.tfrecord')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = 'training'
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
  # This is the bad flag.
  FLAGS.confident_regions = ''

  error_patch = mock.patch.object(logging, 'error')
  exit_patch = mock.patch.object(sys, 'exit')
  with error_patch as mock_logging, exit_patch as mock_exit:
    make_examples.main(['make_examples.py'])

  mock_logging.assert_called_once_with(
      'confident_regions is required when in training mode.')
  mock_exit.assert_called_once_with(errno.ENOENT)
def test_make_examples_end2end_failed_on_mismatched_multi_bam(self):
  """make_examples_runner raises ValueError for CHR20_BAM + NOCHR20_BAM."""
  region = ranges.parse_literal('chr20:10,000,000-10,010,000')

  FLAGS.write_run_info = True
  FLAGS.ref = testdata.CHR20_FASTA
  bam_paths = [testdata.CHR20_BAM, testdata.NOCHR20_BAM]
  FLAGS.reads = ','.join(bam_paths)
  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('mismatched_multi_bam.vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('mismatched_multi_bam.examples.tfrecord'))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = 'calling'
  FLAGS.gvcf_gq_binsize = 5

  options = make_examples.default_options(add_flags=True)
  # This shows an example of what the error message looks like:
  # redacted
  expected_error = ('NOT_FOUND: Unknown reference_name '
                    'reference_name: "chr20" start: 9999999 end: 10000999')
  with six.assertRaisesRegex(self, ValueError, expected_error):
    make_examples_core.make_examples_runner(options)
def test_realigner_doesnt_create_invalid_intervals(self):
  """Tests that read sets don't result in a crash in reference_fai.cc."""
  region = ranges.parse_literal('chr20:63,025,320-63,025,520')

  # quals uses list(range(...)) because a bare range object cannot be
  # repeated with `*` on Python 3 (TypeError).
  reads = [
      test_utils.make_read(
          'ACCGT' * 50,
          start=63025520 - 250,
          cigar='250M',
          quals=list(range(30, 35)) * 50) for _ in range(20)
  ]
  self.reads_realigner.realign_reads(reads, region)

  # These reads are aligned off the edge of the contig. Note that the
  # reference bases in this interval are all Ns as well.
  reads = [
      test_utils.make_read(
          'TTATA' * 50,
          start=63025520 - 200,
          cigar='200M50S',
          quals=list(range(30, 35)) * 50) for _ in range(20)
  ]
  self.reads_realigner.realign_reads(reads, region)
def test_get_truth_variants(self):
  """Checks which truth variants _get_truth_variants returns per query."""
  v1 = test_utils.make_variant(chrom='1', start=10)
  v2 = test_utils.make_variant(chrom='1', start=20)
  v3_filtered = test_utils.make_variant(chrom='1', start=30, filters=['FAIL'])
  v4_del = test_utils.make_variant(chrom='1', start=40, alleles=['AAAA', 'A'])
  v5_non_confident = test_utils.make_variant(chrom='1', start=150)

  variants = [v1, v2, v3_filtered, v4_del, v5_non_confident]
  reader = vcf.InMemoryVcfReader(variants=variants)
  confident_regions = ranges.RangeSet([ranges.make_range('1', 1, 100)])
  labeler = PlaceholderVariantLabeler(
      truth_vcf_reader=reader, confident_regions=confident_regions)

  def truth_in(literal):
    """Returns the labeler's truth variants for a region literal, as a list."""
    return list(labeler._get_truth_variants(ranges.parse_literal(literal)))

  # Check that we get v1 and v2 specifically when only they are covered by the
  # query.
  self.assertEqual(truth_in('1:1-15'), [v1])
  self.assertEqual(truth_in('1:15-25'), [v2])

  # We don't include filtered variants.
  self.assertEqual(truth_in('1:25-35'), [])

  # Check that we get all overlapping variants of our query.
  for del_query in ['1:35-45', '1:42-43', '1:38-42', '1:42-50']:
    self.assertEqual(truth_in(del_query), [v4_del])

  # Checks that a simple query gets all our non-filtered variants.
  self.assertEqual(truth_in('1:1-100'), [v1, v2, v4_del])

  # Even through our query covers v5, it's not confident, so we don't get it.
  self.assertEqual(truth_in('1:1-1000'), [v1, v2, v4_del])
def setUp(self):
  """Builds a TRAINING-mode RegionProcessor with mocked readers."""
  super(RegionProcessorTest, self).setUp()
  self._saved_flags = flagsaver.save_flag_values()
  self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')
  FLAGS.reads = ''

  options = make_examples.default_options(add_flags=False)
  self.options = options
  options.reference_filename = testdata.CHR20_FASTA

  main_sample = options.sample_options[0]
  # Only add the BAM when the default options did not already provide reads.
  if not main_sample.reads_filenames:
    main_sample.reads_filenames.append(testdata.CHR20_BAM)
  main_sample.variant_caller_options.sample_name = 'sample_id'
  main_sample.name = 'sample_id'

  options.truth_variants_filename = testdata.TRUTH_VARIANTS_VCF
  options.mode = deepvariant_pb2.MakeExamplesOptions.TRAINING

  self.processor = make_examples_core.RegionProcessor(self.options)
  self.ref_reader = fasta.IndexedFastaReader(self.options.reference_filename)
  self.mock_init = self.add_mock('initialize')
  for sample in self.processor.samples:
    sample.in_memory_sam_reader = mock.Mock()
  self.default_shape = [5, 5, 7]
  self.default_format = 'raw'
def test_make_examples_training_end2end_with_customized_classes_labeler(self):
  """Training run with the customized classes labeler matches golden data."""
  FLAGS.labeler_algorithm = 'customized_classes_labeler'
  FLAGS.customized_classes_labeler_classes_list = 'ref,class1,class2'
  FLAGS.customized_classes_labeler_info_field_name = 'type'

  region = ranges.parse_literal('chr20:10,000,000-10,004,000')
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
  FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
  FLAGS.partition_size = 1000
  FLAGS.mode = 'training'
  FLAGS.gvcf_gq_binsize = 5
  FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF_WITH_TYPES
  FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  options = make_examples.default_options(add_flags=True)
  make_examples_core.make_examples_runner(options)
  golden_file = _sharded(testdata.CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=True)
  golden_examples = list(tfrecord.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples, golden_examples)
def test_vcf_query(self):
  """The chr3:100,000-500,000 query yields four records."""
  region = ranges.parse_literal('chr3:100,000-500,000')
  records = self.samples_reader.query(region)
  record_count = test_utils.iterable_len(records)
  self.assertEqual(record_count, 4)
def test_parse_literal_bad(self, bad_literal):
  """parse_literal raises ValueError for bad_literal.

  Args:
    bad_literal: region literal that parse_literal is expected to reject.
  """
  self.assertRaises(ValueError, ranges.parse_literal, bad_literal)
def test_query(self, query, expected_variant_indices):
  """Checks that a query returns the expected subset of self.variants.

  Args:
    query: region literal, parsed with the header's contigs.
    expected_variant_indices: indices into self.variants that the query
      should return, in order.
  """
  contig_map = ranges.contigs_dict(self.header.contigs)
  region = ranges.parse_literal(query, contig_map)
  actual = list(self.reader.query(region))
  expected = [self.variants[index] for index in expected_variant_indices]
  self.assertEqual(actual, expected)
def test_query(self, query, expected_variant_indices):
  """Checks that a query returns the expected subset of self.variants.

  Args:
    query: region literal, parsed with the header's contigs.
    expected_variant_indices: indices into self.variants that the query
      should return, in order.
  """
  contig_map = ranges.contigs_dict(self.header.contigs)
  region = ranges.parse_literal(query, contig_map)
  found = list(self.reader.query(region))
  wanted = [self.variants[index] for index in expected_variant_indices]
  self.assertEqual(found, wanted)
def test_query_on_unindexed_reader_raises(self):
  """query() on a reader opened with unindexed_options raises ValueError."""
  with vcf_reader.VcfReader.from_file(self.samples_vcf,
                                      self.unindexed_options) as reader:
    expected_error = 'Cannot query without an index'
    with self.assertRaisesRegexp(ValueError, expected_error):
      reader.query(ranges.parse_literal('chr1:10,000,000-10,000,100'))
def test_construction(self):
  """A new assembled region exposes its span, haplotypes and no reads."""
  assembled = _test_assembled_region('chr1:1-5', haplotypes=['A', 'C'])
  expected_region = ranges.parse_literal('chr1:1-5')
  self.assertEqual(assembled.region, expected_region)
  self.assertEqual(assembled.haplotypes, ['A', 'C'])
  self.assertEqual(assembled.reads, [])
def _test_assembled_region(region_str, haplotypes=None):
  """Builds a realigner.AssemblyRegion for tests.

  Args:
    region_str: region literal used as the span of the candidate haplotypes.
    haplotypes: optional haplotype list; a falsy value becomes an empty list.

  Returns:
    A realigner.AssemblyRegion wrapping a CandidateHaplotypes proto.
  """
  span = ranges.parse_literal(region_str)
  if not haplotypes:
    haplotypes = []
  candidates = realigner_pb2.CandidateHaplotypes(
      span=span, haplotypes=haplotypes)
  return realigner.AssemblyRegion(candidates)
def test_make_examples_end2end(self, mode, num_shards,
                               labeler_algorithm=None):
  """Runs make_examples end to end and compares outputs to golden files.

  Args:
    mode: 'calling' or 'training'.
    num_shards: shard count passed to _sharded; the runner is invoked once
      per task in range(max(num_shards, 1)).
    labeler_algorithm: optional value for FLAGS.labeler_algorithm; the flag
      is left untouched when this is None.
  """
  self.maxDiff = None
  self.assertIn(mode, {'calling', 'training'})
  is_calling = mode == 'calling'

  region = ranges.parse_literal('chr20:10,000,000-10,010,000')
  FLAGS.ref = testdata.CHR20_FASTA
  FLAGS.reads = testdata.CHR20_BAM
  FLAGS.candidates = test_utils.test_tmpfile(
      _sharded('vsc.tfrecord', num_shards))
  FLAGS.examples = test_utils.test_tmpfile(
      _sharded('examples.tfrecord', num_shards))
  FLAGS.regions = [ranges.to_literal(region)]
  FLAGS.partition_size = 1000
  FLAGS.mode = mode
  FLAGS.gvcf_gq_binsize = 5
  if labeler_algorithm is not None:
    FLAGS.labeler_algorithm = labeler_algorithm

  if is_calling:
    FLAGS.gvcf = test_utils.test_tmpfile(
        _sharded('gvcf.tfrecord', num_shards))
  else:
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

  # One runner invocation per task; the options of the last task are reused
  # by the verification helpers below.
  for task_id in range(max(num_shards, 1)):
    FLAGS.task = task_id
    options = make_examples.default_options(add_flags=True)
    make_examples.make_examples_runner(options)

  # Test that our candidates are reasonable, calling specific helper functions
  # to check lots of properties of the output.
  candidates = sorted(
      io_utils.read_tfrecords(
          FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
      key=lambda c: variant_utils.variant_range_tuple(c.variant))
  self.verify_deepvariant_calls(candidates, options)
  candidate_variants = [call.variant for call in candidates]
  self.verify_variants(candidate_variants, region, options, is_gvcf=False)

  # Verify that the variants in the examples are all good.
  examples = self.verify_examples(
      FLAGS.examples, region, options, verify_labels=mode == 'training')
  example_variants = [tf_utils.example_variant(ex) for ex in examples]
  self.verify_variants(example_variants, region, options, is_gvcf=False)

  # Verify the integrity of the examples and then check that they match our
  # golden labeled examples. Note we expect the order for both training and
  # calling modes to produce deterministic order because we fix the random
  # seed.
  if is_calling:
    golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
  else:
    golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
  golden_examples = list(io_utils.read_tfrecords(golden_file))
  self.assertDeepVariantExamplesEqual(examples, golden_examples)

  if not is_calling:
    return

  nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
  nist_variants = list(nist_reader.query(region))
  self.verify_nist_concordance(example_variants, nist_variants)

  # Check the quality of our generated gvcf file.
  gvcfs = variant_utils.sorted_variants(
      io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
  self.verify_variants(gvcfs, region, options, is_gvcf=True)
  self.verify_contiguity(gvcfs, region)
  gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                              num_shards)
  expected_gvcfs = list(
      io_utils.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
  self.assertItemsEqual(gvcfs, expected_gvcfs)
def test_query_on_unindexed_reader_raises(self):
  """query() on a reader opened with self.options raises ValueError."""
  with sam_reader.SamReader.from_file(self.bam, self.options) as reader:
    expected_error = 'Cannot query without an index'
    with self.assertRaisesRegexp(ValueError, expected_error):
      reader.query(ranges.parse_literal('chr20:10,000,000-10,000,100'))