Esempio n. 1
0
 def test_variant_position_and_range(self):
   """Checks variant_position, variant_range and variant_range_tuple.

   Two variants are built on chromosome '1' with start=10: one with alleles
   ['A', 'C'] and one with alleles ['AGCT', 'C'].
   """
   narrow = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=10)
   wide = test_utils.make_variant(
       chrom='1', alleles=['AGCT', 'C'], start=10)
   one_base = ranges.make_range('1', 10, 11)
   four_bases = ranges.make_range('1', 10, 14)

   # Both variants are expected to report the same one-base position.
   for variant in (narrow, wide):
     self.assertEqual(one_base, variant_utils.variant_position(variant))

   # The expected range is 10-11 for the first variant and 10-14 for the second.
   self.assertEqual(one_base, variant_utils.variant_range(narrow))
   self.assertEqual(four_bases, variant_utils.variant_range(wide))

   # The tuple form is compared against plain Python tuples.
   self.assertEqual(('1', 10, 11), variant_utils.variant_range_tuple(narrow))
   self.assertEqual(('1', 10, 14), variant_utils.variant_range_tuple(wide))
Esempio n. 2
0
 def test_variant_position_and_range(self):
   """Verifies the position/range helpers of variant_utils on two variants.

   Both variants sit on chromosome '1' with start=10; they differ only in
   their alleles (['A', 'C'] versus ['AGCT', 'C']).
   """
   first = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=10)
   second = test_utils.make_variant(
       chrom='1', alleles=['AGCT', 'C'], start=10)
   expected_pos = ranges.make_range('1', 10, 11)
   expected_span = ranges.make_range('1', 10, 14)

   # Each loop checks one helper against (expected, variant) pairs, in the
   # order first variant, then second variant.
   for want, variant in ((expected_pos, first), (expected_pos, second)):
     self.assertEqual(want, variant_utils.variant_position(variant))
   for want, variant in ((expected_pos, first), (expected_span, second)):
     self.assertEqual(want, variant_utils.variant_range(variant))
   for want, variant in ((('1', 10, 11), first), (('1', 10, 14), second)):
     self.assertEqual(want, variant_utils.variant_range_tuple(variant))
Esempio n. 3
0
 def _range_tuples(variants):
   return [variant_utils.variant_range_tuple(v) for v in variants]
Esempio n. 4
0
 def _range_tuples(variants):
   return [variant_utils.variant_range_tuple(v) for v in variants]
def _example_sort_key(example):
    """Returns the range tuple of the variant stored in `example`."""
    variant = tf_utils.example_variant(example)
    return variant_utils.variant_range_tuple(variant)
Esempio n. 6
0
  def test_make_examples_end2end(self, mode, num_shards,
                                 labeler_algorithm=None):
    """Runs make_examples over a small chr20 region and checks its outputs.

    The test configures FLAGS, runs make_examples_runner once per task, and
    then verifies the candidates, the examples (against golden files) and, in
    calling mode, the gVCF records.

    Args:
      mode: Either 'calling' or 'training'; any other value fails the test.
      num_shards: Shard count passed to _sharded() for every output and golden
        path. One task is run per shard, and at least one task is always run.
      labeler_algorithm: If not None, assigned to FLAGS.labeler_algorithm.
    """
    # Do not truncate diffs reported by failing assertions.
    self.maxDiff = None
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    # Calling mode additionally writes gVCF records; training mode instead
    # needs the truth variants and confident regions.
    if mode == 'calling':
      FLAGS.gvcf = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # max(..., 1) guarantees a single run when num_shards is 0. The `options`
    # built for the last task are the ones reused by the checks below.
    for task_id in range(max(num_shards, 1)):
      FLAGS.task = task_id
      options = make_examples.default_options(add_flags=True)
      make_examples.make_examples_runner(options)

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        io_utils.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants(
        [call.variant for call in candidates], region, options, is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
      golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
      golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(io_utils.read_tfrecords(golden_file)))

    if mode == 'calling':
      # Compare the example variants with the truth VCF records in the region.
      nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      nist_variants = list(nist_reader.query(region))
      self.verify_nist_concordance(example_variants, nist_variants)

      # Check the quality of our generated gvcf file.
      gvcfs = variant_utils.sorted_variants(
          io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcfs, region, options, is_gvcf=True)
      self.verify_contiguity(gvcfs, region)
      gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      expected_gvcfs = list(
          io_utils.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
      # NOTE(review): assertItemsEqual exists in Python 2's unittest only; on
      # Python 3 it must come from the test base class -- confirm.
      self.assertItemsEqual(gvcfs, expected_gvcfs)
def _example_sort_key(example):
  """Builds a sort key for `example` from the range tuple of its variant."""
  example_variant = tf_utils.example_variant(example)
  return variant_utils.variant_range_tuple(example_variant)
Esempio n. 8
0
  def test_make_examples_end2end(self,
                                 mode,
                                 num_shards,
                                 test_condition=TestConditions.USE_BAM,
                                 labeler_algorithm=None,
                                 use_fast_pass_aligner=True):
    """Runs make_examples over a small chr20 region and checks its outputs.

    The test configures FLAGS, runs make_examples_runner once per task while
    checking the written run_info, and then verifies the candidates, the
    examples (against golden files), the gVCF records in calling mode, and the
    labeling metrics for unsharded training runs.

    Args:
      mode: Either 'calling' or 'training'; any other value fails the test.
      num_shards: Shard count passed to _sharded() for every output and golden
        path. One task is run per shard, and at least one task is always run.
      test_condition: TestConditions value selecting which reads input is
        assigned to FLAGS.reads (BAM, CRAM, or two comma-joined BAMs).
      labeler_algorithm: If not None, assigned to FLAGS.labeler_algorithm.
      use_fast_pass_aligner: Value assigned to FLAGS.use_fast_pass_aligner.
    """
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    # Note: there is no else branch, so an unrecognized test_condition leaves
    # FLAGS.reads at whatever value it already had.
    if test_condition == TestConditions.USE_BAM:
      FLAGS.reads = testdata.CHR20_BAM
    elif test_condition == TestConditions.USE_CRAM:
      FLAGS.reads = testdata.CHR20_CRAM
    elif test_condition == TestConditions.USE_MULTI_BAMS:
      FLAGS.reads = ','.join(
          [testdata.CHR20_BAM_FIRST_HALF, testdata.CHR20_BAM_SECOND_HALF])

    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    FLAGS.use_fast_pass_aligner = use_fast_pass_aligner
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    # Calling mode additionally writes gVCF records; training mode instead
    # needs the truth variants and confident regions.
    if mode == 'calling':
      FLAGS.gvcf = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # max(..., 1) guarantees a single run when num_shards is 0. `options` and
    # `run_info` from the last task are reused by the checks after the loop.
    for task_id in range(max(num_shards, 1)):
      FLAGS.task = task_id
      options = make_examples.default_options(add_flags=True)
      # We need to overwrite bam_fname for USE_CRAM test since Golden Set
      # generated from BAM file. BAM filename is stored in candidates. If we
      # don't overwrite default_options variants won't match and test fail.
      options.bam_fname = 'NA12878_S1.chr20.10_10p1mb.bam'
      make_examples_core.make_examples_runner(options)

      # Check that our run_info proto contains the basic fields we'd expect:
      # (a) our options are written to the run_info.options field.
      run_info = make_examples_core.read_make_examples_run_info(
          options.run_info_filename)
      self.assertEqual(run_info.options, options)
      # (b) run_info.resource_metrics is present and contains our hostname.
      self.assertTrue(run_info.HasField('resource_metrics'))
      self.assertEqual(run_info.resource_metrics.host_name, platform.node())

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        tfrecord.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants([call.variant for call in candidates],
                         region,
                         options,
                         is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
      golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
      golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(tfrecord.read_tfrecords(golden_file)))

    if mode == 'calling':
      # Compare the example variants with the truth VCF records in the region.
      nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      nist_variants = list(nist_reader.query(region))
      self.verify_nist_concordance(example_variants, nist_variants)

      # Check the quality of our generated gvcf file.
      gvcfs = variant_utils.sorted_variants(
          tfrecord.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcfs, region, options, is_gvcf=True)
      self.verify_contiguity(gvcfs, region)
      gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      expected_gvcfs = list(
          tfrecord.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
      # Despite the name, assertCountEqual checks that all elements match.
      self.assertCountEqual(gvcfs, expected_gvcfs)

    # With num_shards == 0 the loop above ran exactly once, so `run_info` here
    # is the run_info read back from that single run.
    if (mode == 'training' and num_shards == 0 and
        labeler_algorithm != 'positional_labeler'):
      # The positional labeler doesn't track metrics, so don't try to read them
      # in when that's the mode.
      self.assertEqual(
          make_examples_core.read_make_examples_run_info(
              testdata.GOLDEN_MAKE_EXAMPLES_RUN_INFO).labeling_metrics,
          run_info.labeling_metrics)