def test_label_variant_raises_for_non_confident_variant(self):
  """Checks that add_label_to_example rejects a non-confident label.

  Builds a VariantLabel with is_confident=False and expects the processor to
  raise a ValueError whose message matches the non-confident-label text.
  """
  het_variant = test_utils.make_variant(start=10, alleles=['A', 'C'])
  non_confident = variant_labeler.VariantLabel(
      is_confident=False, variant=het_variant, genotype=(0, 1))
  unlabeled = self._example_for_variant(non_confident.variant)
  expected_error = 'Cannot add a non-confident label to an example'
  with six.assertRaisesRegex(self, ValueError, expected_error):
    self.processor.add_label_to_example(unlabeled, non_confident)
def _label_grouped_variants(self, variants):
  """Yields one VariantLabel per variant in a group of candidate variants.

  Truth variants are fetched from the span of `variants`, expanded by
  _TRUTH_VARIANTS_QUERY_REGION_EXPANSION_IN_BP. When more truth variants come
  back than self.max_group_size, labeling is skipped: warnings are logged and
  every input variant is yielded as non-confident with genotype (-1, -1).

  Args:
    variants: The candidate variants to label together.

  Yields:
    variant_labeler.VariantLabel objects.

  Raises:
    ValueError: If the module-level label_variants() returns a falsy result.
      Because this is a generator, the error surfaces on iteration.
  """
  candidate_span = ranges.span(
      [variant_utils.variant_range(candidate) for candidate in variants])
  query_region = ranges.expand(
      candidate_span, _TRUTH_VARIANTS_QUERY_REGION_EXPANSION_IN_BP)
  truths = list(self._get_truth_variants(query_region))

  if len(truths) > self.max_group_size:
    logging.warning(
        ('Found a large number of variants to label (n_candidates=%d, '
         'n_truth=%d) relative to candidate cap of %d. This may make the '
         'algorithm very slow.'),
        len(variants), len(truths), self.max_group_size)
    logging.warning('Returning all variants with not-confident markers.')
    for unlabeled in variants:
      yield variant_labeler.VariantLabel(
          is_confident=False, genotype=(-1, -1), variant=unlabeled)
    return

  ref = self.make_labeler_ref(variants, truths)
  labeled_variants = label_variants(variants, truths, ref)
  if not labeled_variants:
    raise ValueError('Failed to assign labels for variants', variants)

  for labeled in labeled_variants:
    # Confidence is decided purely by overlap with the confident regions;
    # the genotype comes from the first call of the labeled variant.
    overlaps_confident = self._confident_regions.variant_overlaps(labeled)
    assigned_genotype = tuple(labeled.calls[0].genotype)
    yield variant_labeler.VariantLabel(
        is_confident=overlaps_confident,
        genotype=assigned_genotype,
        variant=labeled)
def label_variants(self, variants):
  """Yields a VariantLabel for each of `variants`.

  Each variant is passed to self._match, which returns a confidence flag and
  a truth variant (or None). The genotype stays None unless a truth variant
  was returned, in which case it comes from _genotype_from_matched_truth.

  Args:
    variants: Iterable of candidate variants to label.

  Yields:
    variant_labeler.VariantLabel objects, one per input variant, in order.
  """
  for candidate in variants:
    is_confident, truth_variant = self._match(candidate)
    genotype = (
        None if truth_variant is None else _genotype_from_matched_truth(
            candidate, truth_variant))
    yield variant_labeler.VariantLabel(
        is_confident=is_confident, variant=candidate, genotype=genotype)
def label_variants(self, variants, region=None):
  """Yields a VariantLabel for each of `variants`.

  Matching is done on the result of variant_utils.unphase_all_genotypes for
  each variant, while the yielded label and the genotype computation use the
  variant as it was passed in.

  Args:
    variants: Iterable of candidate variants to label.
    region: Not used by this implementation.

  Yields:
    variant_labeler.VariantLabel objects, one per input variant, in order.
  """
  for candidate in variants:
    unphased = variant_utils.unphase_all_genotypes(candidate)
    is_confident, truth_variant = self._match(unphased)
    if truth_variant is None:
      genotype = None
    else:
      genotype = _genotype_from_matched_truth(candidate, truth_variant)
    yield variant_labeler.VariantLabel(
        is_confident=is_confident, variant=candidate, genotype=genotype)
def label_variants(self, variants, region):
  """Yields VariantLabels for `variants` using haplotype-based matching.

  Truth variants for `region` are fetched, those whose genotype sums to zero
  are dropped, and candidates and truths are split into groups by
  group_variants(). Each group is labeled with
  find_best_matching_haplotypes().

  Args:
    variants: Iterable of candidate variants to label.
    region: Region passed to self._get_truth_variants.

  Yields:
    variant_labeler.VariantLabel objects for the labeled candidates.

  Raises:
    ValueError: If find_best_matching_haplotypes() returns None for a group.
  """
  truths = list(self._get_truth_variants(region))
  if truths:
    # Keep only truths with at least one non-zero genotype entry, which
    # filters out homozygous reference labels.
    truths = [
        truth
        for genotype, truth in zip(_variant_genotypes(truths), truths)
        if sum(genotype) > 0
    ]

  grouped = group_variants(
      candidates=list(variants),
      truths=truths,
      max_group_size=self.max_group_size,
      max_separation=self.max_separation,
      max_gt_options_product=self.max_gt_options_product)

  for candidates_group, truth_group in grouped:
    assert len(candidates_group) <= self.max_group_size
    assert len(truth_group) <= self.max_group_size

    ref = self.make_labeler_ref(candidates_group, truth_group)
    labeling = find_best_matching_haplotypes(candidates_group, truth_group,
                                             ref)
    # Must compare against None explicitly: per the original note, an empty
    # labeling result is legitimate and must not be treated as a failure.
    if labeling is None:
      raise ValueError('Failed to assign labels for variants',
                       candidates_group, truth_group, ref)

    self._update_metrics(labeling)

    # Confidence is currently decided only by overlap with the confident
    # regions, which is the baseline behavior. That is questionable for
    # haplotype-based labeling: an event may lie within the confident
    # regions yet be shifted outside them by representational differences.
    # An alternative would be to treat a variant as confident if it has a
    # non-ref genotype (only confident truth variants are considered) or if
    # it overlaps the confident regions.
    for labeled in labeling.candidates_with_assigned_genotypes():
      overlaps_confident = self._confident_regions.variant_overlaps(labeled)
      assigned_genotype = tuple(labeled.calls[0].genotype)
      yield variant_labeler.VariantLabel(
          is_confident=overlaps_confident,
          genotype=assigned_genotype,
          variant=labeled)
def test_genotype_from_matched_truth(self, variant_alleles, alt_alleles,
                                     truth_alleles, truth_gt,
                                     expected_genotype, expected_label):
  """Checks the genotype derived from a truth variant and the resulting label.

  Args:
    variant_alleles: Alleles of the candidate variant built at start=10.
    alt_alleles: Alleles whose label is requested; each must appear in
      variant_alleles.
    truth_alleles: Alleles of the truth variant built at start=10.
    truth_gt: Genotype given to the truth variant.
    expected_genotype: Expected result of _genotype_from_matched_truth.
    expected_label: Expected result of label_for_alt_alleles.
  """
  candidate = test_utils.make_variant(start=10, alleles=variant_alleles)
  truth = test_utils.make_variant(
      start=10, alleles=truth_alleles, gt=truth_gt)

  actual_genotype = variant_labeler._genotype_from_matched_truth(
      candidate, truth)
  self.assertEqual(expected_genotype, actual_genotype)

  label = variant_labeler.VariantLabel(
      is_confident=True, variant=candidate, genotype=expected_genotype)
  # Alt indices are positions in variant_alleles shifted down by one
  # (presumably because entry 0 is the reference allele).
  alt_indices = [variant_alleles.index(allele) - 1 for allele in alt_alleles]
  self.assertEqual(label.label_for_alt_alleles(alt_indices), expected_label)
class RegionProcessorTest(parameterized.TestCase):
  """Tests for make_examples_core.RegionProcessor.

  Most tests replace methods on the processor with autospec'd mocks (see
  add_mock) so that each step of RegionProcessor can be checked in isolation.
  """

  def setUp(self):
    """Builds a TRAINING-mode RegionProcessor on chr20 test data.

    Flag values are saved here and restored in tearDown. The processor's
    `initialize` method is mocked, and every sample's in-memory SAM reader is
    replaced with a Mock.
    """
    super(RegionProcessorTest, self).setUp()
    self._saved_flags = flagsaver.save_flag_values()
    self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')

    FLAGS.reads = ''
    self.options = make_examples.default_options(add_flags=False)
    self.options.reference_filename = testdata.CHR20_FASTA
    main_sample = self.options.sample_options[0]
    if not main_sample.reads_filenames:
      main_sample.reads_filenames.append(testdata.CHR20_BAM)
    main_sample.variant_caller_options.sample_name = 'sample_id'
    main_sample.name = 'sample_id'
    self.options.truth_variants_filename = testdata.TRUTH_VARIANTS_VCF
    self.options.mode = deepvariant_pb2.MakeExamplesOptions.TRAINING
    self.processor = make_examples_core.RegionProcessor(self.options)
    self.ref_reader = fasta.IndexedFastaReader(
        self.options.reference_filename)
    self.mock_init = self.add_mock('initialize')
    for sample in self.processor.samples:
      sample.in_memory_sam_reader = mock.Mock()
    self.default_shape = [5, 5, 7]
    self.default_format = 'raw'

  def tearDown(self):
    """Restores the flag values saved in setUp."""
    super(RegionProcessorTest, self).tearDown()
    flagsaver.restore_flag_values(self._saved_flags)

  def add_mock(self, name, retval='dontadd', side_effect='dontadd'):
    """Patches self.processor.<name> with an autospec'd mock for this test.

    Args:
      name: Name of the attribute on self.processor to patch.
      retval: Return value for the mock. The sentinel string 'dontadd' means
        no return value is configured.
      side_effect: Side effect for the mock. The sentinel string 'dontadd'
        means no side effect is configured.

    Returns:
      The started mock. The patch is stopped automatically at test cleanup.
    """
    patcher = mock.patch.object(self.processor, name, autospec=True)
    self.addCleanup(patcher.stop)
    mocked = patcher.start()
    if retval != 'dontadd':
      mocked.return_value = retval
    if side_effect != 'dontadd':
      mocked.side_effect = side_effect
    return mocked

  def test_on_demand_initialization_called_if_not_initialized(self):
    """process() calls initialize() when the processor is not initialized."""
    candidates = ['Candidates']
    self.assertFalse(self.processor.initialized)
    main_sample = self.processor.samples[0]
    mock_rr = self.add_mock('region_reads', retval=[])
    mock_cir = self.add_mock(
        'candidates_in_region',
        retval=({
            'main_sample': candidates
        }, {
            'main_sample': []
        }))
    mock_lc = self.add_mock('label_candidates', retval=[])
    self.processor.process(self.region)
    test_utils.assert_called_once_workaround(self.mock_init)
    mock_rr.assert_called_once_with(
        region=self.region,
        sam_readers=None,
        reads_filenames=main_sample.options.reads_filenames)
    main_sample.in_memory_sam_reader.replace_reads.assert_called_once_with(
        [])
    mock_cir.assert_called_once_with(self.region)
    mock_lc.assert_called_once_with(candidates, self.region)

  def test_on_demand_initialization_not_called_if_initialized(self):
    """process() skips initialize() when the processor is initialized."""
    self.processor.initialized = True
    self.assertTrue(self.processor.initialized)
    main_sample = self.processor.samples[0]
    mock_rr = self.add_mock('region_reads', retval=[])
    mock_cir = self.add_mock(
        'candidates_in_region',
        retval=({
            'main_sample': []
        }, {
            'main_sample': []
        }))
    mock_lc = self.add_mock('label_candidates', retval=[])
    self.processor.process(self.region)
    test_utils.assert_not_called_workaround(self.mock_init)
    mock_rr.assert_called_once_with(
        region=self.region,
        sam_readers=None,
        reads_filenames=main_sample.options.reads_filenames)
    main_sample.in_memory_sam_reader.replace_reads.assert_called_once_with(
        [])
    mock_cir.assert_called_once_with(self.region)
    test_utils.assert_called_once_workaround(mock_lc)

  def test_process_calls_no_candidates(self):
    """With no candidates, process() returns empty outputs and no pileups."""
    main_sample = self.processor.samples[0]
    mock_rr = self.add_mock('region_reads', retval=[])
    mock_cir = self.add_mock(
        'candidates_in_region',
        retval=({
            'main_sample': []
        }, {
            'main_sample': []
        }))
    mock_cpe = self.add_mock('create_pileup_examples', retval=[])
    mock_lc = self.add_mock('label_candidates')
    candidates, examples, gvcfs, runtimes = self.processor.process(
        self.region)
    self.assertEmpty(candidates['main_sample'])
    self.assertEmpty(examples['main_sample'])
    self.assertEmpty(gvcfs['main_sample'])
    self.assertIsInstance(runtimes, dict)
    mock_rr.assert_called_once_with(
        region=self.region,
        sam_readers=None,
        reads_filenames=main_sample.options.reads_filenames)
    main_sample.in_memory_sam_reader.replace_reads.assert_called_once_with(
        [])
    mock_cir.assert_called_once_with(self.region)
    test_utils.assert_not_called_workaround(mock_cpe)
    mock_lc.assert_called_once_with([], self.region)

  @parameterized.parameters([
      deepvariant_pb2.MakeExamplesOptions.TRAINING,
      deepvariant_pb2.MakeExamplesOptions.CALLING
  ])
  def test_process_calls_with_candidates(self, mode):
    """process() wires reads, candidates, pileups and labels per mode."""
    self.processor.options.mode = mode

    main_sample = self.processor.samples[0]
    mock_read = mock.MagicMock()
    mock_candidate = mock.MagicMock()
    mock_example = mock.MagicMock()
    mock_label = mock.MagicMock()
    mock_rr = self.add_mock('region_reads', retval=[mock_read])
    mock_cir = self.add_mock(
        'candidates_in_region',
        retval=({
            'main_sample': [mock_candidate]
        }, {
            'main_sample': []
        }))
    mock_cpe = self.add_mock('create_pileup_examples', retval=[mock_example])
    mock_lc = self.add_mock(
        'label_candidates', retval=[(mock_candidate, mock_label)])
    mock_alte = self.add_mock('add_label_to_example', retval=mock_example)
    candidates, examples, gvcfs, runtimes = self.processor.process(
        self.region)
    self.assertEqual(candidates['main_sample'], [mock_candidate])
    self.assertEqual(examples['main_sample'], [mock_example])
    self.assertEmpty(gvcfs['main_sample'])
    self.assertIsInstance(runtimes, dict)
    mock_rr.assert_called_once_with(
        region=self.region,
        sam_readers=None,
        reads_filenames=main_sample.options.reads_filenames)
    main_sample.in_memory_sam_reader.replace_reads.assert_called_once_with(
        [mock_read])
    mock_cir.assert_called_once_with(self.region)
    mock_cpe.assert_called_once_with(mock_candidate, sample_order=[0])

    if mode == deepvariant_pb2.MakeExamplesOptions.TRAINING:
      mock_lc.assert_called_once_with([mock_candidate], self.region)
      mock_alte.assert_called_once_with(mock_example, mock_label)
    else:
      # In calling mode we don't label our candidates.
      test_utils.assert_not_called_workaround(mock_lc)
      test_utils.assert_not_called_workaround(mock_alte)

  @parameterized.parameters([
      deepvariant_pb2.MakeExamplesOptions.TRAINING,
      deepvariant_pb2.MakeExamplesOptions.CALLING
  ])
  def test_process_keeps_ordering_of_candidates_and_examples(self, mode):
    """process() returns candidates and examples in candidate order."""
    self.processor.options.mode = mode

    r1, r2 = mock.Mock(), mock.Mock()
    c1, c2 = mock.Mock(), mock.Mock()
    l1, l2 = mock.Mock(), mock.Mock()
    e1, e2, e3 = mock.Mock(), mock.Mock(), mock.Mock()
    main_sample = self.processor.samples[0]
    self.add_mock('region_reads', retval=[r1, r2])
    self.add_mock(
        'candidates_in_region',
        retval=({
            'main_sample': [c1, c2]
        }, {
            'main_sample': []
        }))
    # c1 produces one example and c2 produces two.
    mock_cpe = self.add_mock(
        'create_pileup_examples', side_effect=[[e1], [e2, e3]])
    mock_lc = self.add_mock('label_candidates', retval=[(c1, l1), (c2, l2)])
    mock_alte = self.add_mock(
        'add_label_to_example', side_effect=[e1, e2, e3])
    candidates, examples, gvcfs, runtimes = self.processor.process(
        self.region)
    self.assertEqual(candidates['main_sample'], [c1, c2])
    self.assertEqual(examples['main_sample'], [e1, e2, e3])
    self.assertEmpty(gvcfs['main_sample'])
    self.assertIsInstance(runtimes, dict)
    main_sample.in_memory_sam_reader.replace_reads.assert_called_once_with(
        [r1, r2])
    # Pileup examples are created once per candidate, in candidate order,
    # regardless of mode.
    self.assertEqual(
        [mock.call(c1, sample_order=[0]),
         mock.call(c2, sample_order=[0])], mock_cpe.call_args_list)

    if mode == deepvariant_pb2.MakeExamplesOptions.CALLING:
      # In calling mode, we never try to label.
      test_utils.assert_not_called_workaround(mock_lc)
      test_utils.assert_not_called_workaround(mock_alte)
    else:
      mock_lc.assert_called_once_with([c1, c2], self.region)
      # Both examples derived from c2 receive c2's label.
      self.assertEqual([
          mock.call(e1, l1),
          mock.call(e2, l2),
          mock.call(e3, l2),
      ], mock_alte.call_args_list)

  def test_process_with_realigner(self):
    """With the realigner enabled, reads are queried and then realigned."""
    self.processor.options.mode = deepvariant_pb2.MakeExamplesOptions.CALLING
    self.processor.options.realigner_enabled = True
    self.processor.options.realigner_options.CopyFrom(
        realigner_pb2.RealignerOptions())
    self.processor.realigner = mock.Mock()
    self.processor.realigner.realign_reads.return_value = [], []

    main_sample = self.processor.samples[0]
    main_sample.sam_readers = [mock.Mock()]
    main_sample.sam_readers[0].query.return_value = []

    c1, c2 = mock.Mock(), mock.Mock()
    e1, e2, e3 = mock.Mock(), mock.Mock(), mock.Mock()
    self.add_mock(
        'candidates_in_region',
        retval=({
            'main_sample': [c1, c2]
        }, {
            'main_sample': []
        }))
    mock_cpe = self.add_mock(
        'create_pileup_examples', side_effect=[[e1], [e2, e3]])
    mock_lc = self.add_mock('label_candidates')

    candidates, examples, gvcfs, runtimes = self.processor.process(
        self.region)
    self.assertEqual(candidates['main_sample'], [c1, c2])
    self.assertEqual(examples['main_sample'], [e1, e2, e3])
    self.assertEmpty(gvcfs['main_sample'])
    self.assertIsInstance(runtimes, dict)
    main_sample.sam_readers[0].query.assert_called_once_with(self.region)
    self.processor.realigner.realign_reads.assert_called_once_with(
        [], self.region)
    main_sample.in_memory_sam_reader.replace_reads.assert_called_once_with(
        [])
    self.assertEqual(
        [mock.call(c1, sample_order=[0]),
         mock.call(c2, sample_order=[0])], mock_cpe.call_args_list)
    test_utils.assert_not_called_workaround(mock_lc)

  def test_candidates_in_region_no_reads(self):
    """candidates_in_region() returns ({}, {}) when the region has no reads."""
    main_sample = self.processor.samples[0]
    main_sample.in_memory_sam_reader.query.return_value = []

    mock_ac = self.add_mock('_make_allele_counter_for_region')

    self.assertEqual(({}, {}),
                     self.processor.candidates_in_region(self.region))

    main_sample.in_memory_sam_reader.query.assert_called_once_with(
        self.region)
    # A region with no reads should return out without making an
    # AlleleCounter.
    test_utils.assert_not_called_workaround(mock_ac)

  @parameterized.parameters(True, False)
  def test_candidates_in_region(self, include_gvcfs):
    """candidates_in_region() feeds reads to the counter and calls variants."""
    self.options.gvcf_filename = 'foo.vcf' if include_gvcfs else ''
    main_sample = self.processor.samples[0]
    reads = ['read1', 'read2']
    main_sample.in_memory_sam_reader.query.return_value = reads

    # Setup our make_allele_counter and other mocks.
    mock_ac = mock.Mock()
    mock_make_ac = self.add_mock(
        '_make_allele_counter_for_region', retval=mock_ac)
    # Setup our make_variant_caller and downstream mocks.
    mock_vc = mock.Mock()
    mock_vc.calls_and_gvcfs.return_value = ([
        'variant'
    ], ['gvcf'] if include_gvcfs else [])
    main_sample.variant_caller = mock_vc

    actual = self.processor.candidates_in_region(self.region)

    # Make sure we're getting our reads for the region.
    main_sample.in_memory_sam_reader.query.assert_called_once_with(
        self.region)

    # Make sure we're creating an AlleleCounter once and adding each of our
    # reads to it.
    mock_make_ac.assert_called_once_with(self.region, [])
    self.assertEqual([mock.call(r, 'sample_id') for r in reads],
                     mock_ac.add.call_args_list)

    # Make sure the variant caller is invoked exactly once with the allele
    # counter keyed by sample name.
    include_med_dp = False
    mock_vc.calls_and_gvcfs.assert_called_once_with(
        allele_counters={'sample_id': mock_ac},
        target_sample='sample_id',
        include_gvcfs=include_gvcfs,
        include_med_dp=include_med_dp)

    # Finally, our actual result should be the single 'variant' and
    # potentially the gvcf records, each organized by sample.
    expected_output = ({
        'main_sample': ['variant']
    }, {
        'main_sample': ['gvcf'] if include_gvcfs else []
    })
    self.assertEqual(expected_output, actual)

  def test_create_pileup_examples_handles_none(self):
    """create_pileup_examples() returns [] when no pileup images are made."""
    self.processor.pic = mock.Mock()
    self.processor.pic.get_reads.return_value = []
    dv_call = mock.Mock()
    self.processor.pic.create_pileup_images.return_value = None
    self.assertEqual([], self.processor.create_pileup_examples(dv_call))
    self.processor.pic.create_pileup_images.assert_called_once_with(
        dv_call=dv_call,
        reads_for_samples=[[]],
        haplotype_alignments_for_samples=None,
        haplotype_sequences=None,
        sample_order=None)

  def test_create_pileup_examples(self):
    """create_pileup_examples() emits one example per (alt, image) pair."""
    self.processor.pic = mock.Mock()
    self.processor.pic.get_reads.return_value = []
    self.add_mock(
        '_encode_tensor',
        side_effect=[(six.b('tensor1'), self.default_shape,
                      self.default_format),
                     (six.b('tensor2'), self.default_shape,
                      self.default_format)])
    dv_call = mock.Mock()
    dv_call.variant = test_utils.make_variant(
        start=10, alleles=['A', 'C', 'G'])
    alt1, alt2 = ['C'], ['G']
    self.processor.pic.create_pileup_images.return_value = [
        (alt1, six.b('tensor1')), (alt2, six.b('tensor2'))
    ]

    actual = self.processor.create_pileup_examples(dv_call)

    self.processor.pic.create_pileup_images.assert_called_once_with(
        dv_call=dv_call,
        reads_for_samples=[[]],
        haplotype_alignments_for_samples=None,
        haplotype_sequences=None,
        sample_order=None)

    self.assertLen(actual, 2)
    for ex, (alt, img) in zip(actual, [(alt1, six.b('tensor1')),
                                       (alt2, six.b('tensor2'))]):
      self.assertEqual(tf_utils.example_alt_alleles(ex), alt)
      self.assertEqual(tf_utils.example_variant(ex), dv_call.variant)
      self.assertEqual(tf_utils.example_encoded_image(ex), img)
      self.assertEqual(tf_utils.example_image_shape(ex), self.default_shape)
      self.assertEqual(
          tf_utils.example_image_format(ex), six.b(self.default_format))

  @parameterized.parameters(
      # Test that a het variant gets a label value of 1 assigned to the
      # example.
      dict(
          label=variant_labeler.VariantLabel(
              is_confident=True,
              variant=test_utils.make_variant(start=10, alleles=['A', 'C']),
              genotype=(0, 1)),
          expected_label_value=1,
      ),
      # Test that a reference variant gets a label value of 0 in the example.
      dict(
          label=variant_labeler.VariantLabel(
              is_confident=True,
              variant=test_utils.make_variant(start=10, alleles=['A', '.']),
              genotype=(0, 0)),
          expected_label_value=0,
      ),
  )
  def test_add_label_to_example(self, label, expected_label_value):
    """add_label_to_example() sets the label and genotype on the example."""
    example = self._example_for_variant(label.variant)
    labeled = copy.deepcopy(example)
    actual = self.processor.add_label_to_example(labeled, label)

    # The add_label_to_example command modifies labeled and returns it.
    self.assertIs(actual, labeled)

    # Check that all keys from example are present in labeled.
    for key, value in example.features.feature.items():
      if key != 'variant/encoded':  # Special case tested below.
        self.assertEqual(value, labeled.features.feature[key])

    # The genotype of our example_variant should be set to the true genotype
    # according to our label.
    self.assertEqual(expected_label_value, tf_utils.example_label(labeled))
    labeled_variant = tf_utils.example_variant(labeled)
    call = variant_utils.only_call(labeled_variant)
    self.assertEqual(tuple(call.genotype), label.genotype)

    # The original variant and labeled_variant from our tf.Example should be
    # equal except for the genotype field, since this is set by
    # add_label_to_example. Clear both genotypes before comparing.
    label.variant.calls[0].genotype[:] = []
    call.genotype[:] = []
    self.assertEqual(label.variant, labeled_variant)

  def test_label_variant_raises_for_non_confident_variant(self):
    """add_label_to_example() raises ValueError for a non-confident label."""
    label = variant_labeler.VariantLabel(
        is_confident=False,
        variant=test_utils.make_variant(start=10, alleles=['A', 'C']),
        genotype=(0, 1))
    example = self._example_for_variant(label.variant)
    with six.assertRaisesRegex(
        self, ValueError, 'Cannot add a non-confident label to an example'):
      self.processor.add_label_to_example(example, label)

  def _example_for_variant(self, variant):
    """Returns a tf.Example for `variant` with a placeholder image 'foo'."""
    return tf_utils.make_example(variant, list(variant.alternate_bases),
                                 six.b('foo'), self.default_shape,
                                 self.default_format)

  @parameterized.parameters('sort_by_haplotypes',
                            'use_original_quality_scores')
  def test_flags_strictly_needs_sam_aux_fields(
      self, flags_strictly_needs_sam_aux_fields):
    """default_options() raises if the flag is set without aux-field parsing."""
    FLAGS.mode = 'calling'
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.examples = 'examples.tfrecord'
    FLAGS[flags_strictly_needs_sam_aux_fields].value = True
    FLAGS.parse_sam_aux_fields = False

    with six.assertRaisesRegex(
        self, Exception,
        'If --{} is set then --parse_sam_aux_fields must be set too.'.format(
            flags_strictly_needs_sam_aux_fields)):
      make_examples.default_options(add_flags=True)

  @parameterized.parameters(
      ('add_hp_channel', True, None),
      ('add_hp_channel', False,
       'Note that --{} is set but --parse_sam_aux_fields is not set.'),
      ('add_hp_channel', None,
       'Because --{}=true, --parse_sam_aux_fields is set to true to enable '
       'reading auxiliary fields from reads.'),
  )
  def test_flag_optionally_needs_sam_aux_fields_with_different_parse_sam_aux_fields(
      self, flag_optionally_needs_sam_aux_fields, parse_sam_aux_fields,
      expected_message):
    """Checks the log message emitted about --parse_sam_aux_fields.

    Args:
      flag_optionally_needs_sam_aux_fields: Name of the flag set to True.
      parse_sam_aux_fields: Value assigned to FLAGS.parse_sam_aux_fields.
      expected_message: Pattern (with a '{}' slot for the flag name) matched
        against the first log line mentioning --parse_sam_aux_fields, or None.
    """
    FLAGS.mode = 'calling'
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.examples = 'examples.tfrecord'
    FLAGS[flag_optionally_needs_sam_aux_fields].value = True
    FLAGS.parse_sam_aux_fields = parse_sam_aux_fields

    with self.assertLogs() as logs:
      make_examples.default_options(add_flags=True)
    aux_fields_log_messages = [
        x for x in logs.output if '--parse_sam_aux_fields' in x
    ]
    # NOTE(review): the else branch below is trivially true, so a case that
    # expects a message still passes when nothing was logged. Confirm
    # whether that leniency is intended before tightening it.
    if aux_fields_log_messages:
      self.assertRegex(
          aux_fields_log_messages[0],
          expected_message.format(flag_optionally_needs_sam_aux_fields))
    else:
      self.assertEmpty(aux_fields_log_messages)

  @parameterized.parameters(
      [
          dict(window_width=221),
          dict(window_width=1001),
      ],)
  def test_align_to_all_haplotypes(self, window_width):
    """align_to_all_haplotypes() returns window-sized sequences per alt."""
    # align_to_all_haplotypes() will pull from the reference, so choose a
    # real variant.
    region = ranges.parse_literal('chr20:10,046,000-10,046,400')
    nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
    nist_variants = list(nist_reader.query(region))
    # We picked this region to have exactly one known variant:
    # reference_bases: "AAGAAAGAAAG"
    # alternate_bases: "A", a deletion of 10 bp
    # start: 10046177
    # end: 10046188
    # reference_name: "chr20"
    variant = nist_variants[0]

    self.processor.pic = mock.Mock()
    self.processor.pic.width = window_width
    self.processor.pic.half_width = int((self.processor.pic.width - 1) / 2)

    self.processor.realigner = mock.Mock()
    # Using a real ref_reader to test that the reference allele matches
    # between the variant and the reference at the variant's coordinates.
    self.processor.realigner.ref_reader = self.ref_reader

    read = test_utils.make_read(
        'A' * 101, start=10046100, cigar='101M', quals=[30] * 101)

    self.processor.realigner.align_to_haplotype = mock.Mock()
    alt_info = self.processor.align_to_all_haplotypes(variant, [read])
    hap_alignments = alt_info['alt_alignments']
    hap_sequences = alt_info['alt_sequences']
    # Both outputs are keyed by alt allele.
    self.assertCountEqual(hap_alignments.keys(), ['A'])
    self.assertCountEqual(hap_sequences.keys(), ['A'])

    # Sequence must be the length of the window.
    self.assertLen(hap_sequences['A'], self.processor.pic.width)

    # align_to_haplotype should be called once for each alt (1 alt here).
    self.processor.realigner.align_to_haplotype.assert_called_once()

    # If variant reference_bases are wrong, it should raise a ValueError.
    variant.reference_bases = 'G'
    with six.assertRaisesRegex(self, ValueError,
                               'does not match the bases in the reference'):
      self.processor.align_to_all_haplotypes(variant, [read])