def examples_to_variants(examples_path, max_records=None):
    """Yields one Variant proto per genomic range found in examples_path.

    Reads the tf.Examples stored at examples_path (which may be a sharded
    spec), extracts the variant carried by each example, and sorts those
    variants by their range tuple. Examples that share the same range (for
    instance, multiple versions representing different alt_alleles) are
    collapsed to a single representative: the first one in sorted order.

    Args:
      examples_path: str. Path, or sharded spec, to labeled tf.Examples
        produced by DeepVariant in training mode.
      max_records: int or None. Maximum number of records to read, or None
        to read all of the records.

    Yields:
      nucleus.protos.Variant protos in coordinate-sorted order.

    Raises:
      ValueError: if the representative Variant chosen for a range does not
        have genotypes. Only the representative of each group is checked.
    """
    records = io_utils.read_tfrecords(examples_path, max_records=max_records)
    range_key = variant_utils.variant_range_tuple

    # Everything has to be materialized before sorting; groupby below only
    # merges *adjacent* items, so it relies on this ordering.
    ordered = [tf_utils.example_variant(record) for record in records]
    ordered.sort(key=range_key)

    message_template = (
        'Variant {} does not have any genotypes. This tool only works with '
        'variants that have been labeled.')
    for _, same_range in itertools.groupby(ordered, range_key):
        representative = next(same_range)
        call = variant_utils.only_call(representative)
        if variantcall_utils.has_genotypes(call):
            yield representative
            continue
        raise ValueError(
            message_template.format(variant_utils.variant_key(representative)))
def examples_to_variants(examples_path, max_records=None):
    """Yields one Variant proto per genomic range found in examples_path.

    Reads the tf.Examples stored at examples_path (which may be a sharded
    spec), extracts the variant carried by each example, and sorts those
    variants by their range tuple. Examples that share the same range (for
    instance, multiple versions representing different alt_alleles) are
    collapsed to a single representative: the first one in sorted order.

    Args:
      examples_path: str. Path, or sharded spec, to labeled tf.Examples
        produced by DeepVariant in training mode.
      max_records: int or None. Maximum number of records to read, or None
        to read all of the records.

    Yields:
      nucleus.protos.Variant protos in coordinate-sorted order.

    Raises:
      ValueError: if the representative Variant chosen for a range does not
        have genotypes. Only the representative of each group is checked.
    """
    records = io_utils.read_tfrecords(examples_path, max_records=max_records)
    range_key = variant_utils.variant_range_tuple

    # Everything has to be materialized before sorting; groupby below only
    # merges *adjacent* items, so it relies on this ordering.
    ordered = [tf_utils.example_variant(record) for record in records]
    ordered.sort(key=range_key)

    message_template = (
        'Variant {} does not have any genotypes. This tool only works with '
        'variants that have been labeled.')
    for _, same_range in itertools.groupby(ordered, range_key):
        representative = next(same_range)
        call = variant_utils.only_call(representative)
        if variantcall_utils.has_genotypes(call):
            yield representative
            continue
        raise ValueError(
            message_template.format(variant_utils.variant_key(representative)))
def is_variant_call(variant,
                    require_non_ref_genotype=True,
                    no_calls_are_variant=False,
                    call_indices=None):
    """Returns True if variant asserts that a mutation is actually present.

    A Variant proto does not always imply that a variant is present in the
    genome: the record may have no alternate bases, may be filtered, may
    have a hom-ref genotype, etc. This function checks those configurations
    in turn.

    A variant without any calls is considered variant as long as it passes
    the site-level checks. A variant with calls must additionally have a
    non-reference genotype, unless require_non_ref_genotype is False, in
    which case passing the site-level checks is enough regardless of the
    genotypes (so a site whose sample is hom-ref or no-call still counts).

    Args:
      variant: nucleus.genomics.v1.Variant.
      require_non_ref_genotype: Should we require a site with a genotype
        call to have a non-reference (het, hom-var) genotype for the site to
        be considered a variant call?
      no_calls_are_variant: If a site has genotypes, should we consider
        no_call genotypes as being variant or not?
      call_indices: A list of 0-based indices. If specified, only the calls
        at the given indices are considered, and the result is True if any
        of those calls is variant. If None, every call is considered.

    Returns:
      True if variant is really a mutation call.
    """
    # Site-level requirements: an alt allele must exist and the record must
    # not be filtered.
    if not variant.alternate_bases or is_filtered(variant):
        return False

    # With no calls, or when genotypes are not required to be non-ref, the
    # site-level checks above are sufficient.
    if not (variant.calls and require_non_ref_genotype):
        return True

    # Everything below looks only at genotype-based fields.
    if call_indices is None:
        call_indices = range(len(variant.calls))
    for index in call_indices:
        call = variant.calls[index]
        if any(allele > 0 for allele in call.genotype):
            return True
        if no_calls_are_variant and not variantcall_utils.has_genotypes(call):
            return True
    return False
def _genotype_from_matched_truth(candidate_variant, truth_variant):
    """Computes the genotype of candidate_variant implied by truth_variant.

    Each allele in the genotype of truth_variant is matched against the
    alleles of candidate_variant. For example, if candidate is A/C and truth
    is A/C with a 0/1 genotype, the result is (0, 1): one copy of the A
    allele and one of C. If the true genotype is 1/1, the result is (1, 1).

    The two variants may differ in the number of alternate alleles, and in
    how the same allele is represented because of that difference. For
    example, candidate could be:

        AGT/A/AGTGT => 2 bp deletion and 2 bp insertion

    and truth could have:

        A/AGT => just the simplified 2 bp insertion

    Alleles are compared in simplified form, so AGT/AGTGT in candidate is
    equated with A/AGT in truth, and the number of copies of AGT in truth
    gives the number of copies of AGTGT in the returned genotype.

    Args:
      candidate_variant: Our candidate third_party.nucleus.protos.Variant
        variant.
      truth_variant: Our third_party.nucleus.protos.Variant truth variant
        containing true alleles and genotypes.

    Returns:
      A sorted tuple of allele indices, with the same semantics as the
      genotype field of the VariantCall proto.

    Raises:
      ValueError: If candidate_variant is None, truth_variant is None, or
        truth_variant doesn't have genotypes.
    """
    if candidate_variant is None:
        raise ValueError('candidate_variant cannot be None')
    if truth_variant is None:
        raise ValueError('truth_variant cannot be None')
    truth_call = variant_utils.only_call(truth_variant)
    if not variantcall_utils.has_genotypes(truth_call):
        raise ValueError(
            'truth_variant needs genotypes to be used for labeling',
            truth_variant)

    # A reference-call candidate always gets a (0, 0) genotype.
    if variant_utils.is_ref(candidate_variant):
        return (0, 0)

    def _candidate_index(true_allele):
        """Maps one truth allele to its allele index in candidate_variant."""
        if true_allele == truth_variant.reference_bases:
            return 0
        wanted = variant_utils.simplify_alleles(
            truth_variant.reference_bases, true_allele)
        # Genotype index 0 is the reference allele, so alts count from 1.
        for genotype_index, alt in enumerate(
                candidate_variant.alternate_bases, start=1):
            simplified_alt = variant_utils.simplify_alleles(
                candidate_variant.reference_bases, alt)
            if wanted == simplified_alt:
                return genotype_index
        # No candidate alt matches this truth allele, so it is reported as
        # index 0 (i.e., not any alt).
        return 0

    matched = [
        _candidate_index(allele)
        for allele in variant_utils.genotype_as_alleles(truth_variant)
    ]
    matched.sort()
    return tuple(matched)
def _genotype_from_matched_truth(candidate_variant, truth_variant):
    """Computes the genotype of candidate_variant implied by truth_variant.

    Each allele in the genotype of truth_variant is matched against the
    alleles of candidate_variant. For example, if candidate is A/C and truth
    is A/C with a 0/1 genotype, the result is (0, 1): one copy of the A
    allele and one of C. If the true genotype is 1/1, the result is (1, 1).

    The two variants may differ in the number of alternate alleles, and in
    how the same allele is represented because of that difference. For
    example, candidate could be:

        AGT/A/AGTGT => 2 bp deletion and 2 bp insertion

    and truth could have:

        A/AGT => just the simplified 2 bp insertion

    Alleles are compared in simplified form, so AGT/AGTGT in candidate is
    equated with A/AGT in truth, and the number of copies of AGT in truth
    gives the number of copies of AGTGT in the returned genotype.

    Args:
      candidate_variant: Our candidate third_party.nucleus.protos.Variant
        variant.
      truth_variant: Our third_party.nucleus.protos.Variant truth variant
        containing true alleles and genotypes.

    Returns:
      A sorted tuple of allele indices, with the same semantics as the
      genotype field of the VariantCall proto.

    Raises:
      ValueError: If candidate_variant is None, truth_variant is None, or
        truth_variant doesn't have genotypes.
    """
    if candidate_variant is None:
        raise ValueError('candidate_variant cannot be None')
    if truth_variant is None:
        raise ValueError('truth_variant cannot be None')
    truth_call = variant_utils.only_call(truth_variant)
    if not variantcall_utils.has_genotypes(truth_call):
        raise ValueError(
            'truth_variant needs genotypes to be used for labeling',
            truth_variant)

    # A reference-call candidate always gets a (0, 0) genotype.
    if variant_utils.is_ref(candidate_variant):
        return (0, 0)

    def _candidate_index(true_allele):
        """Maps one truth allele to its allele index in candidate_variant."""
        if true_allele == truth_variant.reference_bases:
            return 0
        wanted = variant_utils.simplify_alleles(
            truth_variant.reference_bases, true_allele)
        # Genotype index 0 is the reference allele, so alts count from 1.
        for genotype_index, alt in enumerate(
                candidate_variant.alternate_bases, start=1):
            simplified_alt = variant_utils.simplify_alleles(
                candidate_variant.reference_bases, alt)
            if wanted == simplified_alt:
                return genotype_index
        # No candidate alt matches this truth allele, so it is reported as
        # index 0 (i.e., not any alt).
        return 0

    matched = [
        _candidate_index(allele)
        for allele in variant_utils.genotype_as_alleles(truth_variant)
    ]
    matched.sort()
    return tuple(matched)
def test_has_genotypes(self, genotype, expected):
    """Checks has_genotypes on a VariantCall built from `genotype`.

    Args:
      genotype: Value passed as the `genotype` field of the VariantCall.
      expected: The result has_genotypes should return for that call.
        (Presumably both come from a parameterized decorator outside this
        view -- confirm against the enclosing test class.)
    """
    self.assertEqual(
        variantcall_utils.has_genotypes(
            variants_pb2.VariantCall(genotype=genotype)),
        expected)
def test_has_genotypes(self, genotype, expected):
    """Checks has_genotypes on a VariantCall built from `genotype`.

    Args:
      genotype: Value passed as the `genotype` field of the VariantCall.
      expected: The result has_genotypes should return for that call.
        (Presumably both come from a parameterized decorator outside this
        view -- confirm against the enclosing test class.)
    """
    self.assertEqual(
        variantcall_utils.has_genotypes(
            variants_pb2.VariantCall(genotype=genotype)),
        expected)