Ejemplo n.º 1
0
    def testExampleSetTruthVariant(self):
        """Checks storing a truth variant in an example, with and without simplify.

        With simplify=False the variant read back must equal the full variant
        that was stored. Storing the same full variant again with
        simplify=True must read back as the reduced variant built below.
        """
        example = tf_utils.make_example(
            self.variant, self.alts, self.encoded_image, self.default_shape,
            self.default_format)

        # Fields that are identical in the full and the simplified variant.
        shared_fields = dict(
            reference_name='1',
            start=10,
            end=11,
            reference_bases='C',
            alternate_bases=['A'],
            filter=['PASS'],
            quality=1234.5)

        # The full variant has every field populated.
        detailed_call = variants_pb2.VariantCall(
            call_set_id='call_set_id',
            call_set_name='call_set_name',
            genotype=[0, 1],
            phaseset='phaseset',
            genotype_likelihood=[0.1, 0.2, 0.3])
        expected_full = variants_pb2.Variant(
            variant_set_id='variant_set_id',
            id='id',
            names=['name1'],
            created=1234,
            calls=[detailed_call],
            **shared_fields)
        test_utils.set_list_values(expected_full.info['key'], [1])
        test_utils.set_list_values(expected_full.calls[0].info['key'], [2])

        # The variant we expect back once the full one has been simplified.
        reduced_call = variants_pb2.VariantCall(
            call_set_name='call_set_name', genotype=[0, 1])
        expected_simple = variants_pb2.Variant(
            calls=[reduced_call], **shared_fields)
        test_utils.set_list_values(expected_simple.calls[0].info['key'], [2])

        # Nothing has been stored yet, so the feature must be absent.
        self.assertIsNotAFeature('truth_variant/encoded', example)

        tf_utils.example_set_truth_variant(
            example, expected_full, simplify=False)
        self.assertEqual(expected_full,
                         tf_utils.example_truth_variant(example))

        # Re-encoding the same variant with simplify=True must yield the
        # simplified version.
        tf_utils.example_set_truth_variant(
            example, expected_full, simplify=True)
        self.assertEqual(expected_simple,
                         tf_utils.example_truth_variant(example))
    def test_label_variant(self):
        """Checks that label_variant labels a copy of an example via the labeler.

        The labeler is replaced with a mock whose `match` returns
        `[True, tvariant]` and whose `match_to_alt_count` returns 1. After
        labeling, every feature of the original example must still be present
        and unchanged in the labeled copy, the label must be 1 and the truth
        variant must be `tvariant`.
        """
        variant = test_utils.make_variant(start=10, alleles=['A', 'C'])
        tvariant = test_utils.make_variant(start=10,
                                           alleles=['A', 'C'],
                                           gt=[0, 1])
        example = tf_utils.make_example(variant, ['C'], 'foo',
                                        self.default_shape,
                                        self.default_format)
        labeler = mock.Mock()
        labeler.match = mock.Mock(return_value=[True, tvariant])
        labeler.match_to_alt_count = mock.Mock(return_value=1)
        self.processor.labeler = labeler

        # Label a copy so the untouched original can be compared afterwards.
        labeled = example_pb2.Example()
        labeled.CopyFrom(example)
        self.processor.label_variant(labeled, variant)

        labeler.match.assert_called_once_with(variant)
        labeler.match_to_alt_count.assert_called_once_with(
            variant, tvariant, ['C'])

        # `items()` exists on protobuf map fields under both Python 2 and 3;
        # `iteritems()` is Python 2 only.
        for key, value in example.features.feature.items():
            self.assertEqual(value, labeled.features.feature[key])
        self.assertEqual(1, tf_utils.example_label(labeled))
        self.assertEqual(tvariant, tf_utils.example_truth_variant(labeled))
    def verify_examples(self, examples_filename, region, options,
                        verify_labels):
        """Runs simple structural checks on the tf.Examples stored in a file.

        Args:
          examples_filename: file handed to io_utils.read_tfrecords.
          region: forwarded unchanged to self.verify_variants.
          options: forwarded unchanged to self.verify_variants.
          verify_labels: when true, additionally require the 'label' and
            'truth_variant/encoded' features and check that each example's
            variant and truth variant report the same position.

        Returns:
          The list of examples read from examples_filename.
        """
        required_features = [
            'variant/encoded', 'locus', 'image/format', 'image/encoded',
            'alt_allele_indices/encoded'
        ]
        if verify_labels:
            required_features.extend(['label', 'truth_variant/encoded'])

        records = list(io_utils.read_tfrecords(examples_filename))
        for record in records:
            feature_map = record.features.feature
            for feature_name in required_features:
                self.assertIn(feature_name, feature_map)

            # Every example must reference at least one alt allele index.
            # pylint: disable=g-explicit-length-test
            n_alt_indices = len(tf_utils.example_alt_alleles_indices(record))
            self.assertGreater(n_alt_indices, 0)

            if not verify_labels:
                continue

            # The variant and its truth variant must share the same position.
            candidate_position = variantutils.variant_position(
                tf_utils.example_variant(record))
            truth_position = variantutils.variant_position(
                tf_utils.example_truth_variant(record))
            self.assertEqual(candidate_position, truth_position)

        # Check that the variants carried by the examples are good.
        decoded_variants = [tf_utils.example_variant(rec) for rec in records]
        self.verify_variants(decoded_variants, region, options, is_gvcf=False)

        return records
Ejemplo n.º 4
0
def make_examples_runner(options):
    """Runs the examples creation stage of deepvariant.

    Each region derived from `options` is processed into candidates, examples
    and gvcf records, which are written through an `io_utils.OutputsWriter`.
    In training mode the truth variant of every example is also fed to the
    counters, which are logged once all regions are done.

    Args:
      options: run configuration. Read here for the output filenames and
        passed on to the region processor, the outputs writer and the
        training-mode check.
    """
    # Tallies of the truth variants seen (only updated in training mode).
    variant_counters = make_counters()

    logging.info('Preparing inputs')
    processing_regions = processing_regions_from_options(options)

    # One processor turns each region into candidates and examples.
    processor = RegionProcessor(options)

    logging.info('Writing examples to %s', options.examples_filename)
    if options.candidates_filename:
        logging.info('Writing candidates to %s', options.candidates_filename)
    if options.gvcf_filename:
        logging.info('Writing gvcf records to %s', options.gvcf_filename)

    regions_done = 0
    candidates_found = 0
    with io_utils.OutputsWriter(options) as outputs:
        for region in processing_regions:
            region_candidates, region_examples, region_gvcfs = (
                processor.process(region))
            candidates_found += len(region_candidates)
            regions_done += 1

            outputs.write('candidates', *region_candidates)

            # Only touch the gvcfs output when there is something to write.
            # With gvcf generation turned off the list is always empty, so
            # this guard also keeps us away from an output that is not
            # available in that case.
            if region_gvcfs:
                outputs.write('gvcfs', *region_gvcfs)

            for example in region_examples:
                if in_training_mode(options):
                    variant_counters.update(
                        tf_utils.example_truth_variant(example))
                outputs.write('examples', example)

    logging.info('Found %s candidate variants', candidates_found)
    if in_training_mode(options):
        # The counter summary would be misleading in calling mode.
        variant_counters.log()