def test_vcf_caller_end2end_outputs(self):
  """Checks that the proposed (input) VCF matches the make_examples output.

  The golden training examples are converted back into variants and compared
  against the records read from the truth VCF in two passes: first by variant
  key, then by genotype alleles (with the truth genotypes unphased).
  """
  converted_variants = list(
      labeled_examples_to_vcf.examples_to_variants(
          testdata.GOLDEN_VCF_CALLER_TRAINING_EXAMPLES))

  # Pass 1: the keys (like chr20:10099832:A->G) must be the same, in order.
  with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as proposed_vcf_reader:
    converted_keys = [
        variant_utils.variant_key(converted)
        for converted in converted_variants
    ]
    proposed_keys = [
        variant_utils.variant_key(proposed)
        for proposed in proposed_vcf_reader.iterate()
    ]
    self.assertEqual(converted_keys, proposed_keys)

  # Pass 2: the file is reopened so the records are iterated afresh; the
  # proposed genotypes are unphased before their alleles are compared.
  with vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF) as proposed_vcf_reader:
    converted_alleles = [
        variant_utils.genotype_as_alleles(converted)
        for converted in converted_variants
    ]
    proposed_alleles = [
        variant_utils.genotype_as_alleles(
            variant_utils.unphase_all_genotypes(proposed))
        for proposed in proposed_vcf_reader.iterate()
    ]
    self.assertEqual(converted_alleles, proposed_alleles)
def test_genotype_as_alleles_raises_with_bad_inputs(self):
  """Checks that each malformed input makes the helper raise.

  Exercised inputs: a None variant, a variant built with gt=None, a call_ix
  of 1 on a variant built with a single genotype list, and
  genotype_type(None).
  """
  # Each entry is a thunk so that building the argument (make_variant) also
  # happens inside the assertRaises context, exactly like the call itself.
  failing_calls = (
      lambda: variant_utils.genotype_as_alleles(None),
      lambda: variant_utils.genotype_as_alleles(
          test_utils.make_variant(gt=None)),
      lambda: variant_utils.genotype_as_alleles(
          test_utils.make_variant(alleles=['A', 'C'], gt=[0, 0]), call_ix=1),
      lambda: variant_utils.genotype_type(None),
  )
  for failing_call in failing_calls:
    with self.assertRaises(Exception):
      failing_call()
def test_genotype_as_alleles_raises_with_bad_inputs(self):
  """Verifies that bad inputs are rejected with an exception.

  Four cases are run in order: genotype_as_alleles(None), a variant created
  with gt=None, call_ix=1 on a variant created with one genotype list, and
  genotype_type(None).
  """

  def expect_exception(bad_call):
    # The thunk is invoked inside the context so that any error raised while
    # constructing its arguments is also captured by assertRaises.
    with self.assertRaises(Exception):
      bad_call()

  expect_exception(lambda: variant_utils.genotype_as_alleles(None))
  expect_exception(
      lambda: variant_utils.genotype_as_alleles(
          test_utils.make_variant(gt=None)))
  expect_exception(
      lambda: variant_utils.genotype_as_alleles(
          test_utils.make_variant(alleles=['A', 'C'], gt=[0, 0]), call_ix=1))
  expect_exception(lambda: variant_utils.genotype_type(None))
def test_genotype_as_alleles(self, variant, expected):
  """Checks genotype_as_alleles(variant) against the expected value.

  Args:
    variant: The variant handed to variant_utils.genotype_as_alleles.
      Presumably supplied by a parameterized decorator outside this view.
    expected: The value the helper is expected to return for `variant`.
  """
  actual_alleles = variant_utils.genotype_as_alleles(variant)
  self.assertEqual(actual_alleles, expected)
def test_genotype_as_alleles(self, variant, expected):
  """Asserts that genotype_as_alleles returns `expected` for `variant`.

  Args:
    variant: Input passed straight to variant_utils.genotype_as_alleles.
      NOTE(review): looks like a parameterized test case argument; confirm
      against the decorator, which is not visible here.
    expected: The result the helper should produce for `variant`.
  """
  observed = variant_utils.genotype_as_alleles(variant)
  self.assertEqual(observed, expected)
def _genotype_from_matched_truth(candidate_variant, truth_variant): """Gets the diploid genotype for candidate_variant from matched truth_variant. This method figures out the genotype for candidate_variant by matching alleles in candidate_variant with those used by the genotype assigned to truth_variant. For example, if candidate is A/C and truth is A/C with a 0/1 genotype, then this function would return (0, 1) indicating that there's one copy of the A allele and one of C in truth. If the true genotype is 1/1, then this routine would return (1, 1). The routine allows candidate_variant and truth_variant to differ in both the number of alternate alleles, and even in the representation of the same alleles due to those differences. For example, candidate could be: AGT/A/AGTGT => 2 bp deletion and 2 bp insertion and truth could have: A/AGT => just the simplified 2 bp insertion And this routine will correctly equate the AGT/AGTGT allele in candidate with the A/AGT in truth and use the number of copies of AGT in truth to compute the number of copies of AGTGT when determining the returned genotype. Args: candidate_variant: Our candidate third_party.nucleus.protos.Variant variant. truth_variant: Our third_party.nucleus.protos.Variant truth variant containing true alleles and genotypes. Returns: A tuple genotypes with the same semantics at the genotype field of the VariantCall proto. Raises: ValueError: If candidate_variant is None, truth_variant is None, or truth_variant doesn't have genotypes. 
""" if candidate_variant is None: raise ValueError('candidate_variant cannot be None') if truth_variant is None: raise ValueError('truth_variant cannot be None') if not variantcall_utils.has_genotypes( variant_utils.only_call(truth_variant)): raise ValueError( 'truth_variant needs genotypes to be used for labeling', truth_variant) def _match_one_allele(true_allele): if true_allele == truth_variant.reference_bases: return 0 else: simplified_true_allele = variant_utils.simplify_alleles( truth_variant.reference_bases, true_allele) for alt_index, alt_allele in enumerate( candidate_variant.alternate_bases): simplified_alt_allele = variant_utils.simplify_alleles( candidate_variant.reference_bases, alt_allele) if simplified_true_allele == simplified_alt_allele: return alt_index + 1 # If nothing matched, we don't have this alt, so the alt allele index for # should be 0 (i.e., not any alt). return 0 # If our candidate_variant is a reference call, return a (0, 0) genotype. if variant_utils.is_ref(candidate_variant): return (0, 0) else: return tuple( sorted( _match_one_allele(true_allele) for true_allele in variant_utils.genotype_as_alleles(truth_variant)))
def _genotype_from_matched_truth(candidate_variant, truth_variant): """Gets the diploid genotype for candidate_variant from matched truth_variant. This method figures out the genotype for candidate_variant by matching alleles in candidate_variant with those used by the genotype assigned to truth_variant. For example, if candidate is A/C and truth is A/C with a 0/1 genotype, then this function would return (0, 1) indicating that there's one copy of the A allele and one of C in truth. If the true genotype is 1/1, then this routine would return (1, 1). The routine allows candidate_variant and truth_variant to differ in both the number of alternate alleles, and even in the representation of the same alleles due to those differences. For example, candidate could be: AGT/A/AGTGT => 2 bp deletion and 2 bp insertion and truth could have: A/AGT => just the simplified 2 bp insertion And this routine will correctly equate the AGT/AGTGT allele in candidate with the A/AGT in truth and use the number of copies of AGT in truth to compute the number of copies of AGTGT when determining the returned genotype. Args: candidate_variant: Our candidate third_party.nucleus.protos.Variant variant. truth_variant: Our third_party.nucleus.protos.Variant truth variant containing true alleles and genotypes. Returns: A tuple genotypes with the same semantics at the genotype field of the VariantCall proto. Raises: ValueError: If candidate_variant is None, truth_variant is None, or truth_variant doesn't have genotypes. 
""" if candidate_variant is None: raise ValueError('candidate_variant cannot be None') if truth_variant is None: raise ValueError('truth_variant cannot be None') if not variantcall_utils.has_genotypes( variant_utils.only_call(truth_variant)): raise ValueError('truth_variant needs genotypes to be used for labeling', truth_variant) def _match_one_allele(true_allele): if true_allele == truth_variant.reference_bases: return 0 else: simplifed_true_allele = variant_utils.simplify_alleles( truth_variant.reference_bases, true_allele) for alt_index, alt_allele in enumerate(candidate_variant.alternate_bases): simplifed_alt_allele = variant_utils.simplify_alleles( candidate_variant.reference_bases, alt_allele) if simplifed_true_allele == simplifed_alt_allele: return alt_index + 1 # If nothing matched, we don't have this alt, so the alt allele index for # should be 0 (i.e., not any alt). return 0 # If our candidate_variant is a reference call, return a (0, 0) genotype. if variant_utils.is_ref(candidate_variant): return (0, 0) else: return tuple( sorted( _match_one_allele(true_allele) for true_allele in variant_utils.genotype_as_alleles(truth_variant)))