def test_variants_overlap(self):
  """Checks that variants_overlap forwards to ranges.ranges_overlap.

  ranges.ranges_overlap is patched to return a sentinel; the test asserts
  that the sentinel is returned unchanged and that the patched function
  was called exactly once with the variant_range of each variant.
  """
  first = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=10)
  second = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=20)
  with mock.patch.object(ranges, 'ranges_overlap') as mock_overlap:
    mock_overlap.return_value = 'SENTINEL'
    result = variant_utils.variants_overlap(first, second)
    self.assertEqual(result, 'SENTINEL')
    # The expected arguments are computed while the patch is still active,
    # after the call under test.
    first_range = variant_utils.variant_range(first)
    second_range = variant_utils.variant_range(second)
    mock_overlap.assert_called_once_with(first_range, second_range)
def test_variants_overlap(self):
  """Verifies variants_overlap delegates to ranges.ranges_overlap.

  With ranges.ranges_overlap patched to return 'SENTINEL', the value
  returned by variants_overlap must be that sentinel, and the patched
  function must have been called once with both variants' ranges.
  """
  variant_a = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=10)
  variant_b = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=20)
  with mock.patch.object(ranges, 'ranges_overlap') as mock_overlap:
    mock_overlap.return_value = 'SENTINEL'
    observed = variant_utils.variants_overlap(variant_a, variant_b)
    self.assertEqual(observed, 'SENTINEL')
    # Build the expected call arguments only after the call under test.
    expected_args = (
        variant_utils.variant_range(variant_a),
        variant_utils.variant_range(variant_b),
    )
    mock_overlap.assert_called_once_with(*expected_args)
def test_variant_position_and_range(self):
  """Checks variant_position, variant_range and variant_range_tuple.

  Two variants starting at 10 on chromosome '1' are used: one whose first
  allele is a single base ('A') and one whose first allele has four bases
  ('AGCT'). Both are expected to share the position ('1', 10, 11), while
  the expected range of the four-base variant is ('1', 10, 14).
  """
  single_base = test_utils.make_variant(
      chrom='1', alleles=['A', 'C'], start=10)
  four_base = test_utils.make_variant(
      chrom='1', alleles=['AGCT', 'C'], start=10)

  expected_position = ranges.make_range('1', 10, 11)
  expected_long_range = ranges.make_range('1', 10, 14)
  expected_single_tuple = ('1', 10, 11)
  expected_long_tuple = ('1', 10, 14)

  # Position is expected to be the same for both variants.
  self.assertEqual(
      expected_position, variant_utils.variant_position(single_base))
  self.assertEqual(
      expected_position, variant_utils.variant_position(four_base))

  # Range as a range object.
  self.assertEqual(expected_position, variant_utils.variant_range(single_base))
  self.assertEqual(expected_long_range, variant_utils.variant_range(four_base))

  # Range as a plain tuple.
  self.assertEqual(
      expected_single_tuple, variant_utils.variant_range_tuple(single_base))
  self.assertEqual(
      expected_long_tuple, variant_utils.variant_range_tuple(four_base))
def test_variant_position_and_range(self):
  """Exercises the position/range helpers of variant_utils.

  Uses a variant with first allele 'A' and one with first allele 'AGCT',
  both starting at 10 on chromosome '1'. The expected position for both is
  ('1', 10, 11); the expected range for the 'AGCT' variant is ('1', 10, 14).
  """
  short_variant = test_utils.make_variant(
      chrom='1', alleles=['A', 'C'], start=10)
  long_variant = test_utils.make_variant(
      chrom='1', alleles=['AGCT', 'C'], start=10)

  one_base_range = ranges.make_range('1', 10, 11)
  four_base_range = ranges.make_range('1', 10, 14)
  short_tuple = ('1', 10, 11)
  long_tuple = ('1', 10, 14)

  # variant_position: identical expectation for both variants.
  self.assertEqual(
      one_base_range, variant_utils.variant_position(short_variant))
  self.assertEqual(
      one_base_range, variant_utils.variant_position(long_variant))

  # variant_range: differs between the two variants.
  self.assertEqual(one_base_range, variant_utils.variant_range(short_variant))
  self.assertEqual(four_base_range, variant_utils.variant_range(long_variant))

  # variant_range_tuple: same coordinates, expressed as tuples.
  self.assertEqual(
      short_tuple, variant_utils.variant_range_tuple(short_variant))
  self.assertEqual(
      long_tuple, variant_utils.variant_range_tuple(long_variant))
def _transform_call_variants_output_to_variants(
    input_sorted_tfrecord_path, qual_filter, multi_allelic_qual_filter,
    sample_name):
  """Yields Variant protos built from sorted CallVariantsOutput protos.

  Consecutive CallVariantsOutput records whose variants share the same
  variant_range are grouped together. Each group is merged with
  merge_predictions and then turned into a single Variant via
  add_call_to_variant. The following filters are applied along the way:
    1) variants are omitted if their quality is lower than the `qual_filter`
       threshold.
    2) multi-allelic variants omit individual alleles whose qualities are
       lower than the `multi_allelic_qual_filter` threshold.

  Args:
    input_sorted_tfrecord_path: str. TFRecord format file containing sorted
      CallVariantsOutput protos.
    qual_filter: double. The qual value below which to filter variants.
    multi_allelic_qual_filter: double. The qual value below which to filter
      multi-allelic variants.
    sample_name: str. Sample name to write to VCF file.

  Yields:
    Variant protos in sorted order representing the CallVariantsOutput calls.
  """

  def _range_of(call_variants_output):
    # Grouping key: the range of the record's variant.
    return variant_utils.variant_range(call_variants_output.variant)

  records = io_utils.read_tfrecords(
      input_sorted_tfrecord_path, proto=deepvariant_pb2.CallVariantsOutput)
  for _, same_range_records in itertools.groupby(records, _range_of):
    # Materialize the group; groupby's sub-iterator is only valid until the
    # outer iterator advances.
    outputs = list(same_range_records)
    canonical_variant, predictions = merge_predictions(
        outputs, multi_allelic_qual_filter)
    yield add_call_to_variant(
        canonical_variant,
        predictions,
        qual_filter=qual_filter,
        sample_name=sample_name)
def _transform_call_variants_output_to_variants(input_sorted_tfrecord_path,
                                                qual_filter,
                                                multi_allelic_qual_filter,
                                                sample_name):
  """Yields Variant protos built from sorted CallVariantsOutput protos.

  Consecutive CallVariantsOutput records whose variants share the same
  variant_range are grouped together. Each group is passed through
  _sort_grouped_variants, merged with merge_predictions, and converted to a
  single Variant via add_call_to_variant. The following filters are applied:
    1) variants are omitted if their quality is lower than the `qual_filter`
       threshold.
    2) multi-allelic variants omit individual alleles whose qualities are
       lower than the `multi_allelic_qual_filter` threshold.

  Args:
    input_sorted_tfrecord_path: str. TFRecord format file containing sorted
      CallVariantsOutput protos.
    qual_filter: double. The qual value below which to filter variants.
    multi_allelic_qual_filter: double. The qual value below which to filter
      multi-allelic variants.
    sample_name: str. Sample name to write to VCF file.

  Yields:
    Variant protos in sorted order representing the CallVariantsOutput calls.
  """

  def _range_of(call_variants_output):
    # Grouping key: the range of the record's variant.
    return variant_utils.variant_range(call_variants_output.variant)

  records = io_utils.read_tfrecords(
      input_sorted_tfrecord_path, proto=deepvariant_pb2.CallVariantsOutput)
  for _, same_range_records in itertools.groupby(records, _range_of):
    outputs = _sort_grouped_variants(same_range_records)
    canonical_variant, predictions = merge_predictions(
        outputs, multi_allelic_qual_filter)
    yield add_call_to_variant(
        canonical_variant,
        predictions,
        qual_filter=qual_filter,
        sample_name=sample_name)
def _transform_call_variants_output_to_variants(input_sorted_tfrecord_path,
                                                qual_filter,
                                                multi_allelic_qual_filter,
                                                sample_name, group_variants,
                                                use_multiallelic_model):
  """Yields Variant protos built from sorted CallVariantsOutput protos.

  Records read from the TFRecord are grouped with itertools.groupby, each
  group is passed through _sort_grouped_variants, merged with
  merge_predictions, and converted to a single Variant via
  add_call_to_variant. The following filters are applied:
    1) variants are omitted if their quality is lower than the `qual_filter`
       threshold.
    2) multi-allelic variants omit individual alleles whose qualities are
       lower than the `multi_allelic_qual_filter` threshold.

  Args:
    input_sorted_tfrecord_path: str. TFRecord format file containing sorted
      CallVariantsOutput protos.
    qual_filter: double. The qual value below which to filter variants.
    multi_allelic_qual_filter: double. The qual value below which to filter
      multi-allelic variants.
    sample_name: str. Sample name to write to VCF file.
    group_variants: bool. If true, group variants that have same start and
      end position.
    use_multiallelic_model: if True, use a specialized model for genotype
      resolution of multiallelic cases with two alts.

  Yields:
    Variant protos in sorted order representing the CallVariantsOutput calls.
  """
  multiallelic_model = get_multiallelic_model(
      use_multiallelic_model=use_multiallelic_model)

  # With a None key, itertools.groupby keys on the records themselves, so
  # only consecutive records that compare equal end up in the same group.
  group_fn = (
      (lambda x: variant_utils.variant_range(x.variant))
      if group_variants else None)

  records = tfrecord.read_tfrecords(
      input_sorted_tfrecord_path, proto=deepvariant_pb2.CallVariantsOutput)
  for _, grouped_records in itertools.groupby(records, group_fn):
    outputs = _sort_grouped_variants(grouped_records)
    canonical_variant, predictions = merge_predictions(
        outputs,
        multi_allelic_qual_filter,
        multiallelic_model=multiallelic_model)
    yield add_call_to_variant(
        canonical_variant,
        predictions,
        qual_filter=qual_filter,
        sample_name=sample_name)
def _label_grouped_variants(self, variants):
  """Yields a variant_labeler.VariantLabel for each variant in a group.

  Truth variants are fetched for the span of all `variants`, expanded by
  _TRUTH_VARIANTS_QUERY_REGION_EXPANSION_IN_BP. If more truth variants are
  found than self.max_group_size, every input variant is yielded as
  not-confident with genotype (-1, -1). Otherwise the variants are labeled
  with label_variants and one VariantLabel is yielded per labeled variant.

  Args:
    variants: the group of candidate variants to label.

  Yields:
    variant_labeler.VariantLabel objects.

  Raises:
    ValueError: if label_variants returns an empty/falsy result.
  """
  # redacted
  # redacted
  # they should be computed in the grouping.
  group_span = ranges.span([variant_utils.variant_range(v) for v in variants])
  query_region = ranges.expand(
      group_span, _TRUTH_VARIANTS_QUERY_REGION_EXPANSION_IN_BP)
  truths = list(self._get_truth_variants(query_region))

  if len(truths) > self.max_group_size:
    logging.warning(
        'Found a large number of variants to label (n_candidates=%d, '
        'n_truth=%d) relative to candidate cap of %d. This may make the '
        'algorithm very slow.',
        len(variants), len(truths), self.max_group_size)
    # redacted
    logging.warning('Returning all variants with not-confident markers.')
    for candidate in variants:
      yield variant_labeler.VariantLabel(
          is_confident=False, genotype=(-1, -1), variant=candidate)
    return

  ref = self.make_labeler_ref(variants, truths)
  labeled_variants = label_variants(variants, truths, ref)
  if not labeled_variants:
    raise ValueError('Failed to assign labels for variants', variants)

  for labeled in labeled_variants:
    # redacted
    # now. Rethink how we establish a variant is confident. Seems like
    # it'd be confident if it has a non-ref genotype (as we only
    # consider confident truth variants) or if it overlaps the confident
    # regions.
    overlaps_confident = self._confident_regions.variant_overlaps(labeled)
    genotype = tuple(labeled.calls[0].genotype)
    yield variant_labeler.VariantLabel(
        is_confident=overlaps_confident, genotype=genotype, variant=labeled)
def _label_grouped_variants(self, variants):
  """Yields a variant_labeler.VariantLabel for each variant in a group.

  Truth variants are queried over the span of all `variants`, expanded by
  _TRUTH_VARIANTS_QUERY_REGION_EXPANSION_IN_BP. When the number of truth
  variants exceeds self.max_group_size, labeling is skipped and each input
  variant is yielded as not-confident with genotype (-1, -1). Otherwise
  label_variants is run and one VariantLabel is yielded per labeled variant,
  marked confident when it overlaps self._confident_regions.

  Args:
    variants: the group of candidate variants to label.

  Yields:
    variant_labeler.VariantLabel objects.

  Raises:
    ValueError: if label_variants returns an empty/falsy result.
  """
  # redacted
  # redacted
  # they should be computed in the grouping.
  candidate_ranges = [variant_utils.variant_range(v) for v in variants]
  span = ranges.span(candidate_ranges)
  expanded = ranges.expand(span, _TRUTH_VARIANTS_QUERY_REGION_EXPANSION_IN_BP)
  truths = list(self._get_truth_variants(expanded))

  if len(truths) > self.max_group_size:
    logging.warning(
        'Found a large number of variants to label (n_candidates=%d, '
        'n_truth=%d) relative to candidate cap of %d. This may make the '
        'algorithm very slow.',
        len(variants), len(truths), self.max_group_size)
    # redacted
    logging.warning('Returning all variants with not-confident markers.')
    for unlabeled in variants:
      yield variant_labeler.VariantLabel(
          is_confident=False, genotype=(-1, -1), variant=unlabeled)
    return

  ref = self.make_labeler_ref(variants, truths)
  labeled_variants = label_variants(variants, truths, ref)
  if not labeled_variants:
    raise ValueError('Failed to assign labels for variants', variants)

  for labeled in labeled_variants:
    # redacted
    # now. Rethink how we establish a variant is confident. Seems like
    # it'd be confident if it has a non-ref genotype (as we only
    # consider confident truth variants) or if it overlaps the confident
    # regions.
    is_confident = self._confident_regions.variant_overlaps(labeled)
    called_genotype = tuple(labeled.calls[0].genotype)
    yield variant_labeler.VariantLabel(
        is_confident=is_confident, genotype=called_genotype, variant=labeled)
def query(self, region):
  """Returns an iterator over the variants in self.variants overlapping region.

  A variant is included when ranges.ranges_overlap reports that its
  variant_range overlaps `region`. self.variants is read when this method is
  called; the overlap checks run lazily as the iterator is consumed.

  Args:
    region: the region to test each variant's range against.

  Returns:
    A generator yielding the overlapping variants in self.variants order.
  """
  # A generator expression is already its own iterator, so no iter() wrapper
  # is needed.
  overlapping = (
      candidate
      for candidate in self.variants
      if ranges.ranges_overlap(variant_utils.variant_range(candidate), region)
  )
  return overlapping
def query(self, region):
  """Returns an iterator over the variants in self.variants overlapping region.

  Each variant's variant_range is compared against `region` with
  ranges.ranges_overlap; only variants for which that check is truthy are
  produced. self.variants is accessed at call time, while the per-variant
  checks are evaluated lazily during iteration.

  Args:
    region: the region to test each variant's range against.

  Returns:
    A generator yielding the overlapping variants in self.variants order.
  """
  # The generator expression is returned directly: it is its own iterator.
  matches = (
      v for v in self.variants
      if ranges.ranges_overlap(variant_utils.variant_range(v), region)
  )
  return matches