def testExampleSetTruthVariant(self):
    """Checks that a truth variant round-trips through an example.

    The variant is first stored with simplify=False and must come back
    unchanged; it is then re-stored with simplify=True and must come back equal
    to a proto that carries only the site fields plus each call's
    call_set_name, genotype and info.
    """
    example = tf_utils.make_example(
        self.variant, self.alts, self.encoded_image, self.default_shape,
        self.default_format)

    # Site-level fields carried by both the full and the simplified proto.
    site_fields = dict(
        reference_name='1',
        start=10,
        end=11,
        reference_bases='C',
        alternate_bases=['A'],
        filter=['PASS'],
        quality=1234.5)

    full_call = variants_pb2.VariantCall(
        call_set_id='call_set_id',
        call_set_name='call_set_name',
        genotype=[0, 1],
        phaseset='phaseset',
        genotype_likelihood=[0.1, 0.2, 0.3])
    full_tvariant = variants_pb2.Variant(
        variant_set_id='variant_set_id',
        id='id',
        names=['name1'],
        created=1234,
        calls=[full_call],
        **site_fields)
    test_utils.set_list_values(full_tvariant.info['key'], [1])
    test_utils.set_list_values(full_tvariant.calls[0].info['key'], [2])

    simple_call = variants_pb2.VariantCall(
        call_set_name='call_set_name', genotype=[0, 1])
    simple_tvariant = variants_pb2.Variant(calls=[simple_call], **site_fields)
    test_utils.set_list_values(simple_tvariant.calls[0].info['key'], [2])

    # Nothing has been stored under the truth variant key yet.
    self.assertIsNotAFeature('truth_variant/encoded', example)

    tf_utils.example_set_truth_variant(example, full_tvariant, simplify=False)
    stored_unsimplified = tf_utils.example_truth_variant(example)
    self.assertEqual(full_tvariant, stored_unsimplified)

    # Re-encoding with simplify=True must yield the simplified proto.
    tf_utils.example_set_truth_variant(example, full_tvariant, simplify=True)
    stored_simplified = tf_utils.example_truth_variant(example)
    self.assertEqual(simple_tvariant, stored_simplified)
def _create_record_from_template(template, start, end):
    """Copies `template` into a fresh Variant and overrides its start and end.

    Args:
      template: Variant proto to copy; it is not modified.
      start: value assigned to the copy's start field.
      end: value assigned to the copy's end field.

    Returns:
      A new Variant proto.
    """
    record = variants_pb2.Variant()
    record.CopyFrom(template)
    record.start, record.end = start, end
    return record
def test_add_call_to_variant(self, probs, expected):
    """Checks add_call_to_variant fills in quality, filter and call fields.

    A bare variant is built from the site fields of `expected` with a single
    call for the default sample, then passed through add_call_to_variant with
    `probs` as predictions. The result must match `expected` field by field.

    Args:
      probs: predictions handed to add_call_to_variant.
      expected: Variant proto holding the expected result; it must contain
        exactly one call.
    """
    raw_variant = variants_pb2.Variant(
        reference_name=expected.reference_name,
        reference_bases=expected.reference_bases,
        alternate_bases=expected.alternate_bases,
        start=expected.start,
        end=expected.end,
        calls=[
            variants_pb2.VariantCall(call_set_name=_DEFAULT_SAMPLE_NAME)
        ])
    variant = postprocess_variants.add_call_to_variant(
        variant=raw_variant,
        predictions=probs,
        sample_name=_DEFAULT_SAMPLE_NAME)

    self.assertEqual(variant.reference_bases, expected.reference_bases)
    self.assertEqual(variant.alternate_bases, expected.alternate_bases)
    self.assertEqual(variant.reference_name, expected.reference_name)
    self.assertEqual(variant.start, expected.start)
    self.assertEqual(variant.end, expected.end)
    # assertAlmostEquals is a deprecated alias that newer Python versions no
    # longer provide; assertAlmostEqual is the supported spelling.
    self.assertAlmostEqual(variant.quality, expected.quality, places=6)
    self.assertEqual(variant.filter, expected.filter)
    self.assertEqual(len(variant.calls), 1)
    self.assertEqual(len(expected.calls), 1)
    self.assertEqual(variant.calls[0].genotype, expected.calls[0].genotype)
    self.assertEqual(variant.calls[0].info['GQ'], expected.calls[0].info['GQ'])

    actual_gls = variant.calls[0].genotype_likelihood
    expected_gls = expected.calls[0].genotype_likelihood
    # zip() stops at the shorter sequence, so compare lengths first; otherwise
    # a missing or truncated likelihood vector would pass unnoticed.
    self.assertEqual(len(actual_gls), len(expected_gls))
    for gl, expected_gl in zip(actual_gls, expected_gls):
        self.assertAlmostEqual(gl, expected_gl, places=6)
def test_compute_filter_fields(self):
    """Checks compute_filter_fields over a grid of qualities and thresholds.

    For every (qual, min_qual) pair in 0..99 x 0..99 the filter is checked for
    a variant with no call, with a hom-ref call, and with a het call.
    """
    # Written as a loop because this many cases is too many for a
    # parameterized test.
    for qual, min_qual in itertools.product(range(100), range(100)):
        if qual >= min_qual:
            qual_dependent = [postprocess_variants.DEEP_VARIANT_PASS]
        else:
            qual_dependent = [postprocess_variants.DEEP_VARIANT_QUAL_FILTER]

        # No call at all: only the quality threshold decides the filter.
        variant = variants_pb2.Variant()
        variant.quality = qual
        self.assertEqual(
            postprocess_variants.compute_filter_fields(variant, min_qual),
            qual_dependent)

        # Hom-ref genotype: quality must not affect the filter field.
        del variant.filter[:]
        variant.calls.add(genotype=[0, 0])
        self.assertEqual(
            postprocess_variants.compute_filter_fields(variant, min_qual),
            [postprocess_variants.DEEP_VARIANT_REF_FILTER])

        # Variant genotype: the quality threshold matters again.
        del variant.filter[:]
        del variant.calls[:]
        variant.calls.add(genotype=[0, 1])
        self.assertEqual(
            postprocess_variants.compute_filter_fields(variant, min_qual),
            qual_dependent)
def test_read_support_is_respected(self, read_name, read_number, alt_allele,
                                   read_base, supports_alt):
    """supports_alt is encoded as the 5th channel out of the 7 channels."""
    site = variants_pb2.Variant(
        reference_name='chr1',
        start=10,
        end=11,
        reference_bases='A',
        alternate_bases=[alt_allele])
    support_by_allele = {
        'C': _supporting_reads('read1/1', 'read3/2'),
        'G': _supporting_reads('read2/1', 'read2/2'),
    }
    dv_call = deepvariant_pb2.DeepVariantCall(
        variant=site, allele_support=support_by_allele)

    # A single-base read placed exactly on the variant position.
    read = test_utils.make_read(
        read_base,
        start=dv_call.variant.start,
        cigar='1M',
        quals=[50],
        name=read_name)
    read.read_number = read_number

    encoder = _make_encoder()
    actual = encoder.encode_read(
        dv_call, 'TAT', read, dv_call.variant.start - 1, alt_allele)

    base_channel = {'C': 30, 'G': 180}[read_base]
    # Indexed by supports_alt: 152 when it is falsy, 254 when it is truthy.
    supports_alt_channel = [152, 254][supports_alt]
    expected = [base_channel, 254, 211, 70, supports_alt_channel, 254, 1]
    self.assertEqual(list(actual[0, 1]), expected)
def test_overlaps_variant_with_ranges(self):
    """variant_overlaps must call overlaps with the variant's contig and start."""
    range_set = ranges.RangeSet([ranges.make_range('chr1', 0, 5)])
    variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11)
    # overlaps() is stubbed to return True, so the result below reflects only
    # what variant_overlaps passes through.
    with mock.patch.object(
        range_set, 'overlaps', return_value=True) as mock_overlaps:
        result = range_set.variant_overlaps(variant)
        self.assertEqual(result, True)
        mock_overlaps.assert_called_once_with('chr2', 10)
def _create_variant_with_alleles(ref=None, alts=None, start=0):
    """Builds a Variant with the given alleles and one default-sample call.

    Args:
      ref: value for reference_bases.
      alts: value for alternate_bases.
      start: value for start. Defaults to 0.

    Returns:
      A Variant proto whose only call is named _DEFAULT_SAMPLE_NAME.
    """
    sample_call = variants_pb2.VariantCall(call_set_name=_DEFAULT_SAMPLE_NAME)
    return variants_pb2.Variant(
        reference_bases=ref,
        alternate_bases=alts,
        start=start,
        calls=[sample_call])
def _make_dv_call(ref_bases='A', alt_bases='C'):
    """Builds a DeepVariantCall at chr1:10-11 with two reads supporting 'C'.

    Args:
      ref_bases: reference bases of the variant. Defaults to 'A'.
      alt_bases: the single alternate allele of the variant. Defaults to 'C'.

    Returns:
      A DeepVariantCall proto. allele_support always lists 'C', whatever
      alt_bases is.
    """
    site = variants_pb2.Variant(
        reference_name='chr1',
        start=10,
        end=11,
        reference_bases=ref_bases,
        alternate_bases=[alt_bases])
    support = {'C': _supporting_reads('read1/1', 'read2/1')}
    return deepvariant_pb2.DeepVariantCall(variant=site, allele_support=support)
def test_exception_extract_single_variant_name(self, names):
    """_extract_single_sample_name must raise ValueError for these call names.

    Args:
      names: call_set_name values, one per VariantCall placed on the variant.
    """
    variant = variants_pb2.Variant()
    for name in names:
        variant.calls.add(call_set_name=name)
    record = deepvariant_pb2.CallVariantsOutput(variant=variant)
    with self.assertRaisesRegexp(ValueError, 'Error extracting name:'):
        postprocess_variants._extract_single_sample_name(record)
def test_alt_combinations_no_het_alt(self, ref, alts, expected):
    """Checks _alt_allele_combinations under NO_HET_ALT_IMAGES mode.

    Args:
      ref: reference bases of the variant.
      alts: alternate alleles of the variant.
      expected: the list of combinations the creator should produce.
    """
    no_het_alt_options = pileup_image.default_options()
    no_het_alt_options.multi_allelic_mode = (
        deepvariant_pb2.PileupImageOptions.NO_HET_ALT_IMAGES)
    creator = pileup_image.PileupImageCreator(
        no_het_alt_options, self.mock_ref_reader, self.mock_sam_reader)
    variant = variants_pb2.Variant(reference_bases=ref, alternate_bases=alts)
    combinations = list(creator._alt_allele_combinations(variant))
    self.assertEqual(expected, combinations)
def setUp(self):
    """Creates the variant, alleles and image settings stored on the test case."""
    self.encoded_image = 'encoded_image_data'
    self.default_shape = [5, 5, 7]
    self.default_format = 'raw'
    self.alts = ['A']
    # The fixture variant uses self.alts as its alternate alleles.
    self.variant = variants_pb2.Variant(
        reference_name='1',
        start=10,
        end=11,
        reference_bases='C',
        alternate_bases=self.alts)
def test_transform_to_gvcf_no_allele_addition(self, alts, gls, vaf):
    """_transform_to_gvcf_record must return these variants unchanged.

    Args:
      alts: alternate alleles of the input variant.
      gls: genotype likelihoods of the input variant's call.
      vaf: numbers stored as the call's 'VAF' info values.
    """
    variant = _create_variant(
        ref_name='chr1',
        start=10,
        ref_base='A',
        alt_bases=alts,
        qual=40,
        filter_field='PASS',
        genotype=[0, 1],
        gq=None,
        likelihoods=gls)
    vaf_field = variant.calls[0].info['VAF']
    vaf_field.values.extend(
        [struct_pb2.Value(number_value=fraction) for fraction in vaf])

    # Snapshot the input before the call so the comparison does not depend on
    # whether the transform modifies its argument.
    expected = variants_pb2.Variant()
    expected.CopyFrom(variant)

    actual = postprocess_variants._transform_to_gvcf_record(variant)
    self.assertEqual(actual, expected)
def _simplify_variant(variant):
    """Returns a new Variant holding only the basic fields of `variant`.

    The site fields kept are reference_name, start, end, reference_bases,
    alternate_bases, filter and quality. Each call keeps only call_set_name,
    genotype and info.

    Args:
      variant: the Variant proto to simplify; it is not modified.

    Returns:
      A new Variant proto.
    """
    basic_calls = [
        variants_pb2.VariantCall(
            call_set_name=original_call.call_set_name,
            genotype=original_call.genotype,
            # dict() is necessary to actually set info.
            info=dict(original_call.info))
        for original_call in variant.calls
    ]
    return variants_pb2.Variant(
        reference_name=variant.reference_name,
        start=variant.start,
        end=variant.end,
        reference_bases=variant.reference_bases,
        alternate_bases=variant.alternate_bases,
        filter=variant.filter,
        quality=variant.quality,
        calls=basic_calls)
def _make_synthetic_hom_ref(self, variant):
    """Builds a hom-ref counterpart of `variant`.

    Args:
      variant: the candidate Variant proto whose position and alleles are
        copied.

    Returns:
      A new Variant with the same reference_name, start, end, reference_bases
      and alternate_bases as `variant`, and a single call whose genotype is
      [0, 0].
    """
    hom_ref_call = variants_pb2.VariantCall(genotype=[0, 0])
    return variants_pb2.Variant(
        reference_name=variant.reference_name,
        start=variant.start,
        end=variant.end,
        reference_bases=variant.reference_bases,
        alternate_bases=variant.alternate_bases,
        calls=[hom_ref_call])
def test_ignores_reads_with_low_quality_bases(self):
    """encode_read returns None only when the middle base is below threshold.

    The three-base read starts at position 1 and the variant at position 2, so
    the middle base is the one at the variant's start. Only that base's
    quality is varied; its neighbours stay at min_qual - 1 and min_qual + 1.
    """
    site = variants_pb2.Variant(
        reference_name='chr1',
        start=2,
        end=3,
        reference_bases='A',
        alternate_bases=['C'])
    dv_call = deepvariant_pb2.DeepVariantCall(variant=site)
    encoder = _make_encoder()

    # The threshold the encoder uses.
    threshold = pileup_image.DEFAULT_MIN_BASE_QUALITY
    for middle_qual in range(threshold + 5):
        read = test_utils.make_read(
            'AAA',
            start=1,
            cigar='3M',
            quals=[threshold - 1, middle_qual, threshold + 1])
        encoded = encoder.encode_read(dv_call, 'AACAG', read, 1, 'C')
        check = (
            self.assertIsNone
            if middle_qual < threshold else self.assertIsNotNone)
        check(encoded)
def prune_alleles(variant, alt_alleles_to_remove):
    """Removes the alt alleles in alt_alleles_to_remove from variant.

    Args:
      variant: variants_pb2.Variant.
      alt_alleles_to_remove: iterable of str. Alt alleles to remove from
        variant.

    Returns:
      variants_pb2.Variant with the alt alleles removed from alternate_bases.
      When alt_alleles_to_remove is empty the input object itself is returned,
      not a copy; otherwise a new proto is returned and the input is left
      untouched.
    """
    if alt_alleles_to_remove:
        pruned = variants_pb2.Variant()
        pruned.CopyFrom(variant)
        remapper = AlleleRemapper(variant.alternate_bases, alt_alleles_to_remove)
        # Clean up the VariantCall.info fields that are indexed by alt allele
        # before the allele list itself is rewritten.
        remapper.reindex_allele_indexed_fields(
            pruned, _ALT_ALLELE_INDEXED_FORMAT_FIELDS)
        pruned.alternate_bases[:] = remapper.retained_alt_alleles()
        return pruned

    # Nothing to remove: hand back the unmodified variant.
    return variant
def test_alt_combinations(self, ref, alts, expected):
    """Checks _alt_allele_combinations on the test case's self.pic creator.

    Args:
      ref: reference bases of the variant.
      alts: alternate alleles of the variant.
      expected: the list of combinations self.pic should produce.
    """
    variant = variants_pb2.Variant(reference_bases=ref, alternate_bases=alts)
    combinations = list(self.pic._alt_allele_combinations(variant))
    self.assertEqual(expected, combinations)
def make_variant(chrom='chr1',
                 start=10,
                 alleles=None,
                 end=None,
                 filters=None,
                 qual=None,
                 gt=None,
                 gq=None,
                 sample_name=None,
                 gls=None):
    """Creates a new Variant proto from args.

    Args:
      chrom: str. The reference_name for this variant. Defaults to 'chr1'.
      start: int. The starting position of this variant. Defaults to 10.
      alleles: list of str with at least one element. alleles[0] is the
        reference bases and alleles[1:] will be set to alternate_bases of
        variant. If None, defaults to ['A', 'C'].
      end: int or None. If not None, the variant's end will be set to this
        value. If None (or 0), will be set to the start +
        len(reference_bases).
      filters: str, list of str, or None. Sets the filters field of the variant
        to this value if not None. If filters is a string `value`, this is
        equivalent to an argument [`value`]. If None, no value will be assigned
        to the filters field.
      qual: int or None. The quality score for this variant. If None, no
        quality score will be written in the Variant.
      gt: A list of ints, or None. If a non-empty list, creates a VariantCall
        in Variant with genotype field set to this value. If None (the
        default) or empty, no VariantCall is added and gq, sample_name and gls
        are ignored.
      gq: int or None. If not None and a VariantCall was created, this GQ value
        is added to the VariantCall. A GQ of 0 is written like any other
        value.
      sample_name: str or None. If non-empty and a VariantCall was created,
        sets the call_set_name of our VariantCall to this value.
      gls: array-list of float, or None. If not None and a VariantCall was
        created, sets the genotype_likelihoods of our VariantCall to this
        value.

    Returns:
      learning.genomics.deepvariant.core.genomics.Variant proto.
    """
    if alleles is None:
        alleles = ['A', 'C']

    if not end:
        end = start + len(alleles[0])

    variant = variants_pb2.Variant(
        reference_name=chrom,
        start=start,
        end=end,
        reference_bases=alleles[0],
        alternate_bases=alleles[1:],
        quality=qual,
    )

    if filters is not None:
        # Accept a bare string as shorthand for a one-element list.
        if not isinstance(filters, (list, tuple)):
            filters = [filters]
        variant.filter[:] = filters

    if gt:
        call = variant.calls.add(genotype=gt)

        if sample_name:
            call.call_set_name = sample_name

        # Compare against None rather than relying on truthiness: GQ == 0 is a
        # legitimate value and must not be dropped.
        if gq is not None:
            set_list_values(call.info['GQ'], [gq])

        # Same for likelihoods: truthiness of an array-like is ambiguous (and
        # raises for numpy arrays), whereas extending with an empty sequence
        # is a harmless no-op.
        if gls is not None:
            call.genotype_likelihood.extend(gls)

    return variant
def test_allele_indices_with_num_alts(self, alt_bases, num_alts, expected):
    """Checks allele_indices_with_num_alts for a diploid variant.

    Args:
      alt_bases: alternate alleles placed on the variant.
      num_alts: the num_alts argument passed to the function under test.
      expected: the value the function should return.
    """
    variant = variants_pb2.Variant(alternate_bases=alt_bases)
    self.assertEqual(
        variantutils.allele_indices_with_num_alts(variant, num_alts, ploidy=2),
        expected)
def test_invalid_allele_indices_with_num_alts(self, alt_bases, num_alts,
                                              ploidy):
    """allele_indices_with_num_alts must reject these argument combinations.

    Either NotImplementedError or ValueError is accepted.

    Args:
      alt_bases: alternate alleles placed on the variant.
      num_alts: the num_alts argument passed to the function under test.
      ploidy: the ploidy argument passed to the function under test.
    """
    accepted_errors = (NotImplementedError, ValueError)
    variant = variants_pb2.Variant(alternate_bases=alt_bases)
    with self.assertRaises(accepted_errors):
        variantutils.allele_indices_with_num_alts(variant, num_alts, ploidy)
def make_gvcfs(self, allele_count_summaries):
    """Primary interface function for computing gVCF confidence at a site.

    Walks the provided AlleleCountSummary protos and yields Variant protos
    holding gVCF reference blocks. Consecutive sites with the same quantized
    GQ are merged into one block.

    Each yielded Variant has reference_name, reference_bases and start taken
    from the first site of its block, and end set to one past the position of
    the block's last site. Its single alternate allele is
    variantutils.GVCF_ALT_ALLELE. It contains one VariantCall with
    call_set_name of options.sample_name, genotype 0/0 (diploid reference),
    the genotype likelihoods of the block's first site, and a GQ equal to the
    minimum raw GQ over the block.

    Every summary must have either a canonical DNA reference base or a code
    from EXTENDED_IUPAC_CODES; sites with the latter produce no record.

    Args:
      allele_count_summaries: iterable of AlleleCountSummary protos in
        coordinate-sorted order. Each proto supplies the read counts for the
        reference allele, the reference position, and the reference base.

    Yields:
      third_party.nucleus.protos.Variant proto in coordinate-sorted order
      containing gVCF records.
    """

    def with_gq_and_likelihoods(summary_counts):
        """Returns summary_counts along with GQ and genotype likelihoods.

        Args:
          summary_counts: A single AlleleCountSummary.

        Returns:
          A tuple of summary_counts, quantized GQ, raw GQ, and genotype
          likelihoods. Raw GQ and likelihoods come from
          self.reference_confidence. All three are None when the reference
          base is not canonical but is in EXTENDED_IUPAC_CODES.

        Raises:
          ValueError: The reference base is not a valid DNA or IUPAC base.
        """
        ref_base = summary_counts.ref_base
        if ref_base in CANONICAL_DNA_BASES:
            raw_gq, likelihoods = self.reference_confidence(
                summary_counts.ref_supporting_read_count,
                summary_counts.total_read_count)
            quantized_gq = _quantize_gq(raw_gq, self.options.gq_resolution)
            return summary_counts, quantized_gq, raw_gq, likelihoods

        if ref_base in EXTENDED_IUPAC_CODES:
            # Ambiguous reference base: skip the GQ and likelihood
            # calculation.
            return summary_counts, None, None, None

        raise ValueError(
            'Invalid reference base={} found during gvcf '
            'calculation'.format(summary_counts.ref_base))

    # Combine contiguous, compatible single-bp blocks into larger gVCF blocks.
    # Compatible currently means the same non-None quantized GQ. The generator
    # keeps evaluation lazy, so sites are only examined as blocks are
    # consumed.
    annotated_sites = (
        with_gq_and_likelihoods(sc) for sc in allele_count_summaries)
    for quantized_gq, group in itertools.groupby(
        annotated_sites, key=operator.itemgetter(1)):
        if quantized_gq is None:
            # A None key marks a run of sites with a non-DNA reference base;
            # no record is emitted for them.
            continue

        block = list(group)
        first_summary, _, _, first_likelihoods = block[0]
        last_summary = block[-1][0]
        lowest_raw_gq = min(raw_gq for _, _, raw_gq, _ in block)

        block_call = variants_pb2.VariantCall(
            call_set_name=self.options.sample_name,
            genotype=[0, 0],
            genotype_likelihood=first_likelihoods)
        variantutils.set_variantcall_gq(block_call, lowest_raw_gq)

        yield variants_pb2.Variant(
            reference_name=first_summary.reference_name,
            reference_bases=first_summary.ref_base,
            alternate_bases=[variantutils.GVCF_ALT_ALLELE],
            start=first_summary.position,
            end=last_summary.position + 1,
            calls=[block_call])
def test_overlaps_variant_empty_range(self):
    """An empty RangeSet must return the caller's empty_set_return_value."""
    empty_set = ranges.RangeSet()
    variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11)
    result = empty_set.variant_overlaps(variant, empty_set_return_value='foo')
    self.assertEqual(result, 'foo')
def _resolve_overlapping_variants(overlapping_variants):
    """Yields variants with compatible haplotypes, if possible.

    The input variants are yielded unmodified when any of the following holds:
      * there is only one variant;
      * the existing genotype calls are already compatible;
      * there are more than _MAX_OVERLAPPING_VARIANTS_TO_RESOLVE variants;
      * the joint-likelihood and marginal-likelihood genotype assignments
        disagree.
    Otherwise copies of the variants are yielded whose first call carries the
    most likely compatible genotype and rescaled genotype likelihoods.

    Args:
      overlapping_variants: list(Variant). A non-empty list of Variant protos
        in coordinate-sorted order that overlap on the reference genome and
        are predicted to contain alternate allele genotypes.

    Yields:
      Variant protos in coordinate-sorted order that try to resolve
      incompatible haplotypes.
    """
    # Short circuit the simplest case: A single variant in a region is
    # compatible with itself by definition.
    if len(overlapping_variants) == 1:
        yield overlapping_variants[0]
        return

    # If the actual genotype calls are compatible, we can safely return those
    # since they would be the most likely configuration also when restricting
    # to only valid configurations of genotype calls.
    calculator = _VariantCompatibilityCalculator(overlapping_variants)
    nonref_counts = [_nonref_genotype_count(v) for v in overlapping_variants]
    if calculator.all_variants_compatible(nonref_counts):
        logging.info('Overlapping variants are naturally compatible: %s',
                     overlapping_variants)
        for variant in overlapping_variants:
            yield variant
        return

    # The actual genotype calls produce an inconsistent haplotype. If the
    # number of affected variants is "too large", avoid processing since this
    # is an exponential process.
    if len(overlapping_variants) > _MAX_OVERLAPPING_VARIANTS_TO_RESOLVE:
        logging.warning(
            'Overlapping variants are not naturally compatible, and there are too '
            'many to exhaustively search (%s). Returning variants without '
            'modification, beginning with %s.', len(overlapping_variants),
            overlapping_variants[0])
        for variant in overlapping_variants:
            yield variant
        return

    # Otherwise, the actual genotype calls are incompatible. Since the genotype
    # likelihoods are generally well-calibrated, we examine all configurations
    # of genotypes that create compatible haplotypes and retain the single
    # configuration with the highest joint likelihood across all variants as
    # the proposed genotype assignment. Separately, we rescale the likelihood
    # of each individual variant using only the valid genotype configurations.
    # If the results are concordant (i.e., the genotype predicted by the
    # marginal likelihood for each variant is the same as the genotype
    # predicted when maximizing the joint likelihood across all variants), we
    # return variants with those calls and the rescaled likelihoods. Otherwise,
    # we log a warning and emit the original (incompatible) variants.
    #
    # For example, a biallelic deletion with probabilities of homref, het,
    # homalt = 0.01, 0.9, 0.09 and inside it a biallelic SNP with probs 0.02,
    # 0.48, 0.5. Naively this would be called as a heterozygous indel and a
    # homozygous SNP, which is impossible as there are three total alternate
    # genotypes. The algorithm does the following:
    #
    # Indel    SNP    Joint prob
    # 0/0      0/0    0.01 * 0.02 = 0.0002
    # 0/0      0/1    0.01 * 0.48 = 0.0048
    # 0/0      1/1    0.01 * 0.50 = 0.0050
    # 0/1      0/0    0.90 * 0.02 = 0.0180
    # 0/1      0/1    0.90 * 0.48 = 0.4320*
    # 0/1      1/1    <invalid>   = 0
    # 1/1      0/0    0.09 * 0.02 = 0.0018
    # 1/1      0/1    <invalid>   = 0
    # 1/1      1/1    <invalid>   = 0
    #
    # So using the highest joint likelihood, we predict het indel and het SNP.
    #
    # The marginal probability of each genotype for the indel is:
    # 0/0:  0.0002 + 0.0048 + 0.0050 = 0.01
    # 0/1:  0.0180 + 0.4320          = 0.45
    # 1/1:  0.0018                   = 0.0018
    #
    # which after normalizing to sum to 1 is roughly 0.022, 0.974, 0.004.
    # The marginal probability for the SNP, after performing similar
    # calculations, is 0.043, 0.946, 0.011. So the marginals also predict a het
    # indel and a het SNP. Since the two calculations agree, we use this
    # genotype call and modified likelihoods.
    #
    # First, we find all non-reference count configurations that are
    # compatible. This represents each variant solely based on its number of
    # non-reference genotypes, and assumes that variants are compatible if the
    # total number of non-reference genotypes at a single position is at most
    # two. By using non-reference counts, we avoid testing multiple allele
    # configurations that will return the same result (e.g. a variant with two
    # possible alternate alleles has three allele configurations that are
    # homozygous alternate [1/1, 1/2, 2/2] and either all or none of them will
    # be valid depending on the variants it interacts with).
    valid_nonref_count_configurations = [
        conf for conf in itertools.product(
            [0, 1, 2], repeat=len(overlapping_variants))
        if calculator.all_variants_compatible(conf)
    ]

    # Next, we find the single compatible variant assignment with the
    # individually highest likelihood and track the total likelihood
    # distributed to all variant genotypes.
    likelihood_aggregators = [
        _LikelihoodAggregator(len(v.alternate_bases))
        for v in overlapping_variants
    ]
    most_likely_allele_indices_config = None
    most_likely_likelihood = None
    for nonref_count_config in valid_nonref_count_configurations:
        for allele_indices_config in _get_all_allele_indices_configurations(
            overlapping_variants, nonref_count_config):
            config_likelihood = _allele_indices_configuration_likelihood(
                overlapping_variants, allele_indices_config)
            # Strict '>' keeps the first configuration seen when likelihoods
            # tie.
            if (most_likely_likelihood is None or
                config_likelihood > most_likely_likelihood):
                most_likely_likelihood = config_likelihood
                most_likely_allele_indices_config = allele_indices_config
            # Every enumerated configuration, not just the best one, adds its
            # likelihood to the aggregator of each variant.
            for aggregator, allele_indices in zip(likelihood_aggregators,
                                                  allele_indices_config):
                aggregator.add(allele_indices, config_likelihood)

    # The genotype each variant would get from its own aggregated likelihoods.
    marginal_allele_indices_config = tuple(
        agg.most_likely_allele_indices() for agg in likelihood_aggregators)
    if marginal_allele_indices_config == most_likely_allele_indices_config:
        logging.info(
            'Overlapping variants are not naturally compatible, but the genotype '
            'configuration with the most likely joint likelihood is the same as '
            'that from the scaled marginal likelihoods: %s',
            overlapping_variants[0])
        # Collapse the probabilities of all configurations to a single GL for
        # each allele, independently for each variant.
        scaled_gls = [
            agg.scaled_likelihoods() for agg in likelihood_aggregators
        ]

        for variant, allele_indices, gls in zip(
            overlapping_variants, most_likely_allele_indices_config,
            scaled_gls):
            # Modify a copy so the caller's protos stay untouched; only the
            # first call of each variant is rewritten.
            newvariant = variants_pb2.Variant()
            newvariant.CopyFrom(variant)
            newvariant.calls[0].genotype[:] = allele_indices
            newvariant.calls[0].genotype_likelihood[:] = gls
            yield newvariant
    else:
        logging.warning(
            'Overlapping variants are not naturally compatible, and the genotype '
            'configuration with the most likely joint likelihood is different from '
            'that using the scaled marginal likelihoods: %s',
            overlapping_variants[0])
        # The two estimates disagree, so fall back to emitting the original
        # (incompatible) variants unchanged.
        for variant in overlapping_variants:
            yield variant