Exemple #1
0
 def testGetNoneShapeFromEmptyExamplesPath(self, file_name_to_write,
                                           tfrecord_path_to_match):
     """Checks that a TFRecord file without records yields a shape of None.

     Args:
       file_name_to_write: Name passed to test_tmpfile for the file that the
         empty record list is written to.
       tfrecord_path_to_match: Name passed to test_tmpfile to build the path
         spec handed to get_shape_from_examples_path.
     """
     # Write a TFRecord file that contains no examples at all.
     empty_records_path = test_utils.test_tmpfile(file_name_to_write)
     io_utils.write_tfrecords([], empty_records_path)

     # Query the shape through the (possibly different) spec under test.
     spec_under_test = test_utils.test_tmpfile(tfrecord_path_to_match)
     observed_shape = tf_utils.get_shape_from_examples_path(spec_under_test)
     self.assertIsNone(observed_shape)
Exemple #2
0
  def test_make_examples_runtime_by_region(self):
    """Runs one calling-mode task and validates its runtime-by-region file.

    make_examples is run as task 2 with FLAGS.runtime_by_region set to a
    sharded spec using the same shard count as the examples. The test then
    reads this task's profile file and checks the header columns, the number
    of data rows, and that the count columns of the first row are positive.
    """
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.mode = 'calling'
    num_shards = 4
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    # Use same number of shards for profiling files as examples.
    output_prefix = test_utils.test_tmpfile('runtime_profile')
    FLAGS.runtime_by_region = output_prefix + '@{}'.format(num_shards)
    FLAGS.task = 2
    # Run make_examples with those FLAGS.
    options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(options)

    # Sharded output ending in @4 becomes -00002-of-00004 for task 2. Both
    # numbers are derived from the values used above so the expected path
    # cannot drift if the task or shard count is changed.
    expected_output_path = '{}-{:05d}-of-{:05d}'.format(
        output_prefix, FLAGS.task, num_shards)
    expected_columns = [
        'region', 'get reads', 'find candidates', 'make pileup images',
        'write outputs', 'num reads', 'num candidates', 'num examples'
    ]

    # Read everything first so the file is closed before any assertion runs.
    with gfile.Open(expected_output_path, 'r') as fin:
      header = fin.readline()
      non_header_lines = fin.readlines()

    column_names = header.strip().split('\t')
    self.assertEqual(expected_columns, column_names)
    self.assertLen(non_header_lines, 3)

    one_row = non_header_lines[0].strip().split('\t')
    self.assertEqual(len(one_row), len(column_names))
    # Look the counts up by column name instead of by positional index.
    row_by_column = dict(zip(column_names, one_row))
    for count_column in ('num reads', 'num candidates', 'num examples'):
      self.assertGreater(
          int(row_by_column[count_column]),
          0,
          msg='{} > 0'.format(count_column))
    def test_conversion_to_tfrecord_and_back(self, original_input_file):
        """Converts a native-format file to tfrecord.gz and then back again.

        Args:
          original_input_file: Name of the genomics core testdata file used as
            the input of the first conversion.

        Raises:
          unittest.SkipTest: If the native output path ends in ".sam".
        """
        source = test_utils.genomics_core_testdata(original_input_file)
        tfrecord_copy = test_utils.test_tmpfile(original_input_file +
                                                ".tfrecord.gz")
        native_copy = test_utils.test_tmpfile(original_input_file)

        # Forward leg: native format -> tfrecord.
        self._convert(source, tfrecord_copy)

        # The reverse leg cannot be exercised for SAM output.
        if native_copy.endswith(".sam"):
            raise unittest.SkipTest("SAM writing not yet supported")

        # Reverse leg: tfrecord -> native format. Formats that need a header
        # cannot be written because the tfrecord input carries none, so the
        # conversion is expected to fail with a descriptive error.
        header_required = any(
            native_copy.endswith(suffix)
            for suffix in FORMATS_REQUIRING_HEADER)
        if not header_required:
            self._convert(tfrecord_copy, native_copy)
            return

        with self.assertRaisesRegexp(
                converter.ConversionError,
                "Input file does not have a header, which is needed to construct "
                "output file"):
            self._convert(tfrecord_copy, native_copy)
Exemple #4
0
    def test_prepare_inputs(self, filename_to_write, file_string_input):
        """Checks that prepare_inputs yields every variant that was written.

        Args:
          filename_to_write: Name passed to test_tmpfile for the file that
            self.examples is written to.
          file_string_input: One or more comma-separated input specs; each is
            passed through test_tmpfile before being given to prepare_inputs.
        """
        io_utils.write_tfrecords(self.examples,
                                 test_utils.test_tmpfile(filename_to_write))

        # Prefix every comma-separated spec, then glue them back together.
        prefixed_specs = []
        for spec in file_string_input.split(','):
            prefixed_specs.append(test_utils.test_tmpfile(spec))
        input_spec = ','.join(prefixed_specs)

        with self.test_session() as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            dataset = call_variants.prepare_inputs(input_spec)
            _, variants, _ = data_providers.get_infer_batches(
                dataset, model=self.model, batch_size=1)

            # Pull batches until the input signals that it is exhausted.
            encoded_variants = []
            while True:
                try:
                    batch = sess.run(variants)
                except tf.errors.OutOfRangeError:
                    break
                encoded_variants.extend(batch)

            self.assertItemsEqual(
                self.variants, variant_utils.decode_variants(encoded_variants))
Exemple #5
0
  def test_make_examples_training_end2end_with_alt_aligned_pileup(
      self, alt_align, expected_shape):
    """Runs training-mode make_examples with FLAGS.alt_aligned_pileup set.

    Args:
      alt_align: Value assigned to FLAGS.alt_aligned_pileup. Golden data is
        only selected here for 'rows' and 'diff_channels'.
      expected_shape: Expected 'image/shape' of the first generated example.

    Raises:
      ValueError: If alt_align has no golden file (raised after make_examples
        has already run).
    """
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
    FLAGS.partition_size = 1000
    FLAGS.mode = 'training'
    FLAGS.gvcf_gq_binsize = 5
    # The alt-aligned pileup setting is the only parameterized input.
    FLAGS.alt_aligned_pileup = alt_align
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # Run make_examples with the flags above.
    options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(options)

    # Pick the golden file matching the pileup option; bail out on anything
    # for which no golden data is known.
    if alt_align not in ('rows', 'diff_channels'):
      raise ValueError("Golden data doesn't exist for this alt_align option: "
                       '{}'.format(alt_align))
    golden_file = _sharded(
        testdata.ALT_ALIGNED_ROWS_EXAMPLES if alt_align == 'rows' else
        testdata.ALT_ALIGNED_DIFF_CHANNELS_EXAMPLES)

    # Verify the variants in the examples, then compare against the goldens.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=True)
    golden_examples = list(tfrecord.read_tfrecords(golden_file))
    self.assertDeepVariantExamplesEqual(examples, golden_examples)

    # The image shape depends on the pileup option, so it is parameterized.
    first_example_shape = decode_example(examples[0])['image/shape']
    self.assertEqual(first_example_shape, expected_shape)
Exemple #6
0
    def _get_examples(use_confident_regions=False):
      """Runs make_examples restricted by a one-interval BED file.

      Args:
        use_confident_regions: If True the BED file is assigned to
          FLAGS.confident_regions and FLAGS.regions is cleared; otherwise the
          BED file is assigned to FLAGS.regions and FLAGS.confident_regions
          is cleared.

      Returns:
        The examples returned by self.verify_examples for FLAGS.examples.
      """
      # Either flag should constrain the set of candidates generated, and as
      # a result both settings should generate the same examples.
      bed_path = test_utils.test_tmpfile('vcf_candidate_importer.bed')
      bed_line = '\t'.join(['chr20', '10000000', '10001000']) + '\n'
      with gfile.Open(bed_path, 'w') as bed_file:
        bed_file.write(bed_line)

      # Exactly one of the two flags receives the BED path.
      confident_value, regions_value = (
          (bed_path, '') if use_confident_regions else ('', bed_path))
      FLAGS.confident_regions = confident_value
      FLAGS.regions = regions_value

      FLAGS.examples = test_utils.test_tmpfile(
          _sharded('vcf_candidate_importer.tfrecord'))
      FLAGS.mode = 'training'
      FLAGS.reads = testdata.CHR20_BAM
      FLAGS.ref = testdata.CHR20_FASTA
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.variant_caller = 'vcf_candidate_importer'

      options = make_examples.default_options(add_flags=True)
      make_examples_core.make_examples_runner(options)
      # Verify that the variants in the examples are all good.
      return self.verify_examples(
          FLAGS.examples, None, options, verify_labels=False)
Exemple #7
0
  def test_make_examples_end2end_vcf_candidate_importer(self, mode):
    """Runs make_examples with the vcf_candidate_importer variant caller.

    Args:
      mode: Value assigned to FLAGS.mode. 'calling' selects the calling golden
        file and proposed variants; any other value selects the training
        golden file and truth variants.
    """
    is_calling = mode == 'calling'

    FLAGS.variant_caller = 'vcf_candidate_importer'
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    candidates_name = 'vcf_candidate_importer.{}.tfrecord'.format(mode)
    FLAGS.candidates = test_utils.test_tmpfile(_sharded(candidates_name))
    examples_name = 'vcf_candidate_importer.examples.{}.tfrecord'.format(mode)
    FLAGS.examples = test_utils.test_tmpfile(_sharded(examples_name))
    FLAGS.mode = mode

    if not is_calling:
      golden_file = _sharded(
          testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_TRAINING_EXAMPLES)
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    else:
      golden_file = _sharded(
          testdata.GOLDEN_VCF_CANDIDATE_IMPORTER_CALLING_EXAMPLES)
      FLAGS.proposed_variants = testdata.VCF_CANDIDATE_IMPORTER_VARIANTS
      # These two flags match how the golden testdata was created.
      FLAGS.regions = 'chr20:59,777,000-60,000,000'
      FLAGS.realign_reads = False

    options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(options)

    # Labels are only verified when running in training mode.
    labels_expected = (mode == 'training')
    examples = self.verify_examples(
        FLAGS.examples, None, options, verify_labels=labels_expected)
    golden_examples = list(tfrecord.read_tfrecords(golden_file))
    self.assertDeepVariantExamplesEqual(examples, golden_examples)

    first_example_shape = decode_example(examples[0])['image/shape']
    self.assertEqual(first_example_shape,
                     [100, 221, dv_constants.PILEUP_NUM_CHANNELS])
Exemple #8
0
 def testGetShapeFromExamplesPath(self, file_name_to_write,
                                  tfrecord_path_to_match):
   """Checks that the image shape stored in an example is read back.

   Args:
     file_name_to_write: Name passed to test_tmpfile for the file that the
       single example is written to.
     tfrecord_path_to_match: Name passed to test_tmpfile to build the path
       spec handed to get_shape_from_examples_path.
   """
   example = example_pb2.Example()
   valid_shape = [1, 2, 3]
   example.features.feature['image/shape'].int64_list.value.extend(valid_shape)
   output_file = test_utils.test_tmpfile(file_name_to_write)
   io_utils.write_tfrecords([example], output_file)

   actual_shape = tf_utils.get_shape_from_examples_path(
       test_utils.test_tmpfile(tfrecord_path_to_match))
   # The result used to be discarded, so the test could only fail on an
   # exception. Compare it against the shape that was written instead.
   self.assertIsNotNone(actual_shape)
   # list() tolerates the shape coming back as a non-list sequence.
   self.assertEqual(valid_shape, list(actual_shape))
Exemple #9
0
 def testGlobListShardedFilePatterns(self, specs, expected_files):
     """Checks that comma-separated sharded specs glob to the expected files.

     Args:
       specs: Comma-separated file specs; each is passed through test_tmpfile.
       expected_files: Names of the files the specs are expected to match.
     """
     # Create every expected file (with empty contents) so globbing can find
     # it; duplicates collapse in the set.
     created_paths = set()
     for name in expected_files:
         created_paths.add(test_utils.test_tmpfile(name, ''))

     # Build the full spec names. This call does not create any files.
     spec_paths = [test_utils.test_tmpfile(spec) for spec in specs.split(',')]
     matched = io.glob_list_sharded_file_patterns(','.join(spec_paths))
     self.assertEqual(sorted(created_paths), matched)
Exemple #10
0
 def test_call_end2end_empty_first_shard(self):
   """Checks call_variants output count when the first input shard is empty."""
   # Read at most 10 golden examples.
   golden_examples = list(
       io_utils.read_tfrecords(
           testdata.GOLDEN_CALLING_EXAMPLES, max_records=10))

   # Shard 0 receives no records; shard 1 receives all of them.
   io_utils.write_tfrecords(
       [], test_utils.test_tmpfile('empty_1st_shard-00000-of-00002'))
   io_utils.write_tfrecords(
       golden_examples,
       test_utils.test_tmpfile('empty_1st_shard-00001-of-00002'))

   sharded_spec = test_utils.test_tmpfile('empty_1st_shard@2')
   self.assertCallVariantsEmitsNRecordsForRandomGuess(
       sharded_spec, len(golden_examples))
 def test_call_end2end_empty_first_shard(self):
   """Runs the random-guess check on a two-shard input whose shard 0 is empty."""
   # Get only up to 10 examples.
   records = list(
       io_utils.read_tfrecords(
           testdata.GOLDEN_CALLING_EXAMPLES, max_records=10))
   expected_count = len(records)

   # First shard: written with an empty record list.
   first_shard = test_utils.test_tmpfile('empty_1st_shard-00000-of-00002')
   io_utils.write_tfrecords([], first_shard)
   # Second shard: holds every record that was read above.
   second_shard = test_utils.test_tmpfile('empty_1st_shard-00001-of-00002')
   io_utils.write_tfrecords(records, second_shard)

   self.assertCallVariantsEmitsNRecordsForRandomGuess(
       test_utils.test_tmpfile('empty_1st_shard@2'), expected_count)
 def test_get_shape_from_examples_path(self, file_name_to_write,
                                       tfrecord_path_to_match):
   """Checks that DeepVariantDataSet.tensor_shape matches the written shape.

   Args:
     file_name_to_write: Name passed to test_tmpfile for the file that the
       single example is written to.
     tfrecord_path_to_match: Name passed to test_tmpfile to build the dataset
       source.
   """
   expected_shape = [1, 2, 3]
   # Build one example whose 'image/shape' feature holds the expected shape.
   proto = example_pb2.Example()
   shape_feature = proto.features.feature['image/shape']
   shape_feature.int64_list.value.extend(expected_shape)
   io_utils.write_tfrecords([proto],
                            test_utils.test_tmpfile(file_name_to_write))

   dataset = data_providers.DeepVariantDataSet(
       name='test_shape',
       source=test_utils.test_tmpfile(tfrecord_path_to_match),
       num_examples=1)
   self.assertEqual(expected_shape, dataset.tensor_shape)
Exemple #13
0
  def test_reading_empty_input_should_raise_error(self):
    """Checks that postprocess_variants raises when no shard has records."""
    # Write both shards of the two-way sharded input with zero records.
    for shard_name in ('no_records.tfrecord-00000-of-00002',
                       'no_records.tfrecord-00001-of-00002'):
      io_utils.write_tfrecords([], test_utils.test_tmpfile(shard_name))

    FLAGS.infile = test_utils.test_tmpfile('no_records.tfrecord@2')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.outfile = test_utils.test_tmpfile('no_records.vcf')

    argv = ['postprocess_variants.py']
    with self.assertRaisesRegexp(ValueError, 'Cannot find any records in'):
      postprocess_variants.main(argv)
 def test_get_shape_from_examples_path(self, file_name_to_write,
                                       tfrecord_path_to_match):
     """Checks that the dataset reports the image shape stored in its input.

     Args:
       file_name_to_write: Name passed to test_tmpfile for the file that the
         single example is written to.
       tfrecord_path_to_match: Name passed to test_tmpfile to build the
         dataset source.
     """
     shape = [1, 2, 3]
     record = example_pb2.Example()
     record.features.feature['image/shape'].int64_list.value.extend(shape)

     written_path = test_utils.test_tmpfile(file_name_to_write)
     io_utils.write_tfrecords([record], written_path)

     # The source spec may differ from the written file name.
     source_spec = test_utils.test_tmpfile(tfrecord_path_to_match)
     dataset = data_providers.DeepVariantDataSet(name='test_shape',
                                                 source=source_spec,
                                                 num_examples=1)
     self.assertEqual(shape, dataset.tensor_shape)
 def test_get_shape_from_examples_path(self, file_name_to_write,
                                       tfrecord_path_to_match):
   """Checks that DeepVariantInput.tensor_shape matches the written shape.

   Args:
     file_name_to_write: Name passed to test_tmpfile for the file that the
       single example is written to.
     tfrecord_path_to_match: Name passed to test_tmpfile to build the
       input_file_spec of the DeepVariantInput.
   """
   expected_shape = [1, 2, 3]
   # One example carrying the expected shape in its 'image/shape' feature.
   proto = example_pb2.Example()
   shape_feature = proto.features.feature['image/shape']
   shape_feature.int64_list.value.extend(expected_shape)
   tfrecord.write_tfrecords([proto],
                            test_utils.test_tmpfile(file_name_to_write))

   input_spec = test_utils.test_tmpfile(tfrecord_path_to_match)
   dataset = data_providers.DeepVariantInput(
       mode=tf.estimator.ModeKeys.PREDICT,
       name='test_shape',
       input_file_spec=input_spec,
       num_examples=1)
   self.assertEqual(expected_shape, dataset.tensor_shape)
Exemple #16
0
  def test_reading_sharded_input_with_empty_shards_does_not_crash(self):
    """Runs postprocess_variants on sharded input whose first shard is empty.

    The test has no explicit assertion; it passes when main() returns without
    raising.
    """
    golden_calls = io_utils.read_tfrecords(
        testdata.GOLDEN_POSTPROCESS_INPUT,
        proto=deepvariant_pb2.CallVariantsOutput)

    # Shard 0 is written without records, shard 1 holds all golden calls.
    io_utils.write_tfrecords(
        [],
        test_utils.test_tmpfile('reading_empty_shard.tfrecord-00000-of-00002'))
    io_utils.write_tfrecords(
        golden_calls,
        test_utils.test_tmpfile('reading_empty_shard.tfrecord-00001-of-00002'))

    FLAGS.infile = test_utils.test_tmpfile('reading_empty_shard.tfrecord@2')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.outfile = test_utils.test_tmpfile('calls_reading_empty_shard.vcf')

    argv = ['postprocess_variants.py']
    postprocess_variants.main(argv)
def _test_dataset_config(filename, **kwargs):
    """Writes a DeepVariantDatasetConfig built from kwargs to a test tmpfile.

    Args:
      filename: Name passed to test_tmpfile to obtain the output path.
      **kwargs: Forwarded unchanged to DeepVariantDatasetConfig.

    Returns:
      The path the pbtxt config was written to.
    """
    pbtxt_path = test_utils.test_tmpfile(filename)
    data_providers.write_dataset_config_to_pbtxt(
        deepvariant_pb2.DeepVariantDatasetConfig(**kwargs), pbtxt_path)
    return pbtxt_path
  def test_call_end2end(self, compressed_inputs):
    """Runs postprocess_variants and compares its outputs to the golden files.

    Args:
      compressed_inputs: Forwarded to make_golden_dataset to build the input
        file assigned to FLAGS.infile.
    """

    def _read_lines(path):
      # Read through a context manager so the handle is closed right away
      # instead of being left open for the rest of the test run.
      with tf.gfile.FastGFile(path) as handle:
        return handle.readlines()

    FLAGS.infile = make_golden_dataset(compressed_inputs)
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.outfile = test_utils.test_tmpfile('calls.vcf')
    FLAGS.nonvariant_site_tfrecord_path = (
        testdata.GOLDEN_POSTPROCESS_GVCF_INPUT)
    FLAGS.gvcf_outfile = test_utils.test_tmpfile('gvcf_calls.vcf')

    postprocess_variants.main(['postprocess_variants.py'])

    # Both the VCF and the gVCF must match their golden files line by line.
    self.assertEqual(
        _read_lines(FLAGS.outfile),
        _read_lines(testdata.GOLDEN_POSTPROCESS_OUTPUT))
    self.assertEqual(
        _read_lines(FLAGS.gvcf_outfile),
        _read_lines(testdata.GOLDEN_POSTPROCESS_GVCF_OUTPUT))
 def setUp(self):
   """Copies the test VCF into a tmpfile and records derived index paths."""
   super(TabixTest, self).setUp()
   vcf_name = 'test_samples.vcf.gz'
   self.input_file = test_utils.genomics_core_testdata(vcf_name)
   self.output_file = test_utils.test_tmpfile(vcf_name)
   # Work on a copy so the testdata file itself is never touched.
   shutil.copyfile(self.input_file, self.output_file)
   # Index paths are the copied file's path plus a '.tbi' / '.csi' suffix.
   self.tbx_index_file, self.csi_index_file = (
       self.output_file + '.tbi', self.output_file + '.csi')
Exemple #20
0
  def test_roundtrip(self,
                     expected_infos,
                     expected_fmt,
                     expected_fmt1,
                     expected_fmt2,
                     reader_excluded_info=None,
                     reader_excluded_format=None,
                     writer_excluded_info=None,
                     writer_excluded_format=None):
    """Reads test_py_roundtrip.vcf, writes it back and compares the text.

    Args:
      expected_infos: Per-record values substituted for {info}.
      expected_fmt: Value substituted for {fmt} in every record.
      expected_fmt1: Per-record values substituted for {efmts1}.
      expected_fmt2: Per-record values substituted for {efmts2}.
      reader_excluded_info: Passed to VcfReader as excluded_info_fields.
      reader_excluded_format: Passed to VcfReader as excluded_format_fields.
      writer_excluded_info: Passed to VcfWriter as excluded_info_fields.
      writer_excluded_format: Passed to VcfWriter as excluded_format_fields.
    """
    # Render the expected text: header followed by one formatted line per
    # record template.
    rendered = []
    for template, info, fmt1, fmt2 in zip(self.record_format_strings,
                                          expected_infos, expected_fmt1,
                                          expected_fmt2):
      rendered.append(
          template.format(
              info=info, fmt=expected_fmt, efmts1=fmt1, efmts2=fmt2))
    expected = self.header + ''.join(rendered)

    input_path = test_utils.genomics_core_testdata('test_py_roundtrip.vcf')
    with vcf.VcfReader(
        input_path,
        use_index=False,
        excluded_info_fields=reader_excluded_info,
        excluded_format_fields=reader_excluded_format) as reader:

      variants = list(reader.iterate())
      output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
      # The writer reuses the reader's header, so it is opened while the
      # reader is still active.
      with vcf.VcfWriter(
          output_path,
          header=reader.header,
          excluded_info_fields=writer_excluded_info,
          excluded_format_fields=writer_excluded_format) as writer:
        for variant in variants:
          writer.write(variant)

    with open(output_path) as written:
      actual = written.read()
    self.assertEqual(actual, expected)
    def test_prepare_inputs(self, filename, expand_to_file_pattern):
        """Checks that prepare_inputs yields every variant that was written.

        Args:
          filename: Name passed to test_tmpfile for the file that
            self.examples is written to.
          expand_to_file_pattern: If true, the path is normalized to a sharded
            file pattern before being handed to prepare_inputs.
        """
        written_path = test_utils.test_tmpfile(filename)
        io_utils.write_tfrecords(self.examples, written_path)

        input_spec = written_path
        if expand_to_file_pattern:
            # Transform foo@3 to foo-?????-of-00003.
            input_spec = io_utils.NormalizeToShardedFilePattern(written_path)

        with self.test_session() as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            dataset = call_variants.prepare_inputs(input_spec)
            _, variants, _ = data_providers.get_infer_batches(
                dataset, model=self.model, batch_size=1)

            # Pull batches until the input signals that it is exhausted.
            encoded_variants = []
            while True:
                try:
                    batch = sess.run(variants)
                except tf.errors.OutOfRangeError:
                    break
                encoded_variants.extend(batch)

            self.assertItemsEqual(
                self.variants, variant_utils.decode_variants(encoded_variants))
def _test_dataset_config(filename, **kwargs):
  """Builds a DeepVariantDatasetConfig from kwargs and saves it as pbtxt.

  Args:
    filename: Name passed to test_tmpfile to obtain the output path.
    **kwargs: Forwarded unchanged to DeepVariantDatasetConfig.

  Returns:
    The path the pbtxt config was written to.
  """
  output_path = test_utils.test_tmpfile(filename)
  config = deepvariant_pb2.DeepVariantDatasetConfig(**kwargs)
  data_providers.write_dataset_config_to_pbtxt(config, output_path)
  return output_path
  def test_round_trip_vcf(self, test_datum_name):
    """Round-trips variants through VcfWriter and VcfReader.

    Steps:
      1. Read variants v1 from VcfReader;
      2. Write v1 to vcf using our VcfWriter;
      3. Read back in using VcfReader -- v2;
      4. Compare v1 and v2.

    Args:
      test_datum_name: Name of the genomics core testdata VCF to round-trip.
    """
    in_file = test_utils.genomics_core_testdata(test_datum_name)
    out_file = test_utils.test_tmpfile('output_' + test_datum_name)

    # Readers are used as context managers so they are closed even when an
    # assertion below fails; previously they were never closed.
    with vcf.VcfReader(in_file, use_index=False) as v1_reader:
      v1_records = list(v1_reader.iterate())
      # Copy the header while the reader is still open.
      header = copy.deepcopy(v1_reader.header)
    self.assertTrue(v1_records, 'Reader failed to find records')

    writer_options = variants_pb2.VcfWriterOptions()
    with vcf_writer.VcfWriter.to_file(out_file, header,
                                      writer_options) as writer:
      for record in v1_records:
        writer.write(record)

    with vcf.VcfReader(out_file, use_index=False) as v2_reader:
      v2_records = list(v2_reader.iterate())

    self.assertEqual(v1_records, v2_records,
                     'Round-tripped variants not as expected')
Exemple #24
0
    def testModelShapes(self):
        """Checks model_shapes() against a checkpoint holding two variables."""
        # Build a graph with two float variables of different rank.
        v0 = tf.Variable([[1, 2, 3], [4, 5, 6]], dtype=tf.float32, name='v0')
        v1 = tf.Variable([[[1], [2]], [[3], [4]], [[5], [6]]],
                         dtype=tf.float32,
                         name='v1')
        init_all_op = tf.initialize_all_variables()
        saver = tf.train.Saver({'v0': v0, 'v1': v1})
        checkpoint_path = test_utils.test_tmpfile('ckpt_for_debug_string')
        expected_shapes = {'v0': (2, 3), 'v1': (3, 2, 1)}

        with tf.Session() as sess:
            sess.run(init_all_op)
            # Save a checkpoint that the assertions below inspect.
            saver.save(sess, checkpoint_path)

            # Without a variable request, all variables are reported.
            self.assertEqual(expected_shapes,
                             tf_utils.model_shapes(checkpoint_path))

            # Requesting a single variable reports only that variable.
            for variable_name in ('v0', 'v1'):
                self.assertEqual(
                    {variable_name: expected_shapes[variable_name]},
                    tf_utils.model_shapes(checkpoint_path, [variable_name]))

            # Requesting a tensor that is not in the checkpoint fails.
            with self.assertRaisesRegexp(KeyError, 'v3'):
                tf_utils.model_shapes(checkpoint_path, ['v3'])
 def test_call_variants_runs_on_gpus(self, model):
     """Runs call_variants with execution_hardware='accelerator'.

     There is no explicit assertion; the test passes when the call returns
     without raising.

     Args:
       model: Model object forwarded to call_variants.
     """
     predictions_path = test_utils.test_tmpfile('zzz.tfrecord')
     call_variants.call_variants(
         examples_filename=testdata.GOLDEN_CALLING_EXAMPLES,
         # Checkpoint loading is skipped via the dedicated test sentinel.
         checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
         model=model,
         execution_hardware='accelerator',
         output_file=predictions_path)
Exemple #26
0
    def testModelNumClasses(self):
        """Checks model_num_classes() for present and missing inputs.

        Covers three cases: the class variable exists in the checkpoint, the
        class variable name is not in the checkpoint, and no checkpoint path
        is given.
        """
        # Builds a graph.
        class_variable_name = 'class_variable_name'
        # Reuse the variable instead of repeating the literal so the saved
        # name and the looked-up name cannot diverge.
        v0 = tf.Variable([[1, 2, 3]],
                         dtype=tf.int32,
                         name=class_variable_name)
        v1 = tf.Variable([[[1], [2]], [[3], [4]], [[5], [6]]],
                         dtype=tf.float32,
                         name='v1')
        init_all_op = tf.initialize_all_variables()
        save = tf.train.Saver({class_variable_name: v0, 'v1': v1})
        save_path = test_utils.test_tmpfile('ckpt_for_debug_classes')
        with tf.Session() as sess:
            sess.run(init_all_op)
            # Saves a checkpoint.
            save.save(sess, save_path)

            # If you pass in the correct class_variable_name, you'll find the
            # number of classes.
            self.assertEqual(
                3, tf_utils.model_num_classes(save_path, class_variable_name))
            # If the class variable name doesn't exist in the checkpoint,
            # return None.
            self.assertIsNone(
                tf_utils.model_num_classes(save_path, 'non-existent-var'))
            # If the checkpoint doesn't exist, return None.
            self.assertIsNone(
                tf_utils.model_num_classes(None, class_variable_name))
 def setUp(self):
   """Creates a VCF writer, its header/options and one two-sample variant."""
   self.out_fname = test_utils.test_tmpfile('output.vcf')

   # Header pieces are built separately and then assembled.
   contigs = [
       reference_pb2.ContigInfo(name='Chr1', n_bases=50, pos_in_fasta=0),
       reference_pb2.ContigInfo(name='Chr2', n_bases=25, pos_in_fasta=1),
   ]
   genotype_format = variants_pb2.VcfFormatInfo(
       id='GT', number='1', type='String', description='Genotype')
   quality_format = variants_pb2.VcfFormatInfo(
       id='GQ', number='1', type='Float', description='Genotype Quality')
   self.header = variants_pb2.VcfHeader(
       contigs=contigs,
       sample_names=['Fido', 'Spot'],
       formats=[genotype_format, quality_format],
   )

   self.options = variants_pb2.VcfWriterOptions()
   self.writer = vcf_writer.VcfWriter.to_file(self.out_fname, self.header,
                                              self.options)

   # One variant with a call for each sample named in the header.
   self.variant = test_utils.make_variant(
       chrom='Chr1', start=10, alleles=['A', 'C'])
   calls = [
       variants_pb2.VariantCall(genotype=genotype, call_set_name=sample)
       for sample, genotype in (('Fido', [0, 0]), ('Spot', [0, 1]))
   ]
   self.variant.calls.extend(calls)
 def test_call_variants_with_empty_input(self):
     """Checks that prepare_inputs does not crash on a record-less file.

     There is no explicit assertion; the test passes when prepare_inputs
     returns without raising.
     """
     empty_path = test_utils.test_tmpfile('empty.tfrecord')
     io_utils.write_tfrecords([], empty_path)

     random_guess_model = modeling.get_model('random_guess')
     call_variants.prepare_inputs(empty_path,
                                  random_guess_model,
                                  batch_size=1)
 def setUp(self):
     """Builds the VCF writer fixture: header, options, writer and variant."""
     self.out_fname = test_utils.test_tmpfile('output.vcf')

     # Two contigs and two FORMAT fields make up the header.
     chr1 = reference_pb2.ContigInfo(name='Chr1', n_bases=50, pos_in_fasta=0)
     chr2 = reference_pb2.ContigInfo(name='Chr2', n_bases=25, pos_in_fasta=1)
     gt_info = variants_pb2.VcfFormatInfo(id='GT',
                                          number='1',
                                          type='String',
                                          description='Genotype')
     gq_info = variants_pb2.VcfFormatInfo(id='GQ',
                                          number='1',
                                          type='Float',
                                          description='Genotype Quality')
     self.header = variants_pb2.VcfHeader(
         contigs=[chr1, chr2],
         sample_names=['Fido', 'Spot'],
         formats=[gt_info, gq_info],
     )

     self.options = variants_pb2.VcfWriterOptions()
     self.writer = vcf_writer.VcfWriter.to_file(self.out_fname, self.header,
                                                self.options)

     # A single variant carrying one call per sample.
     self.variant = test_utils.make_variant(chrom='Chr1',
                                            start=10,
                                            alleles=['A', 'C'])
     fido_call = variants_pb2.VariantCall(genotype=[0, 0],
                                          call_set_name='Fido')
     spot_call = variants_pb2.VariantCall(genotype=[0, 1],
                                          call_set_name='Spot')
     self.variant.calls.extend([fido_call, spot_call])
    def test_round_trip_vcf(self, test_datum_name):
        """Round-trips variants through VcfWriter and VcfReader.

        Steps:
          1. Read variants v1 from VcfReader;
          2. Write v1 to vcf using our VcfWriter;
          3. Read back in using VcfReader -- v2;
          4. Compare v1 and v2.

        Args:
          test_datum_name: Name of the genomics core testdata VCF to
            round-trip.
        """
        in_file = test_utils.genomics_core_testdata(test_datum_name)
        out_file = test_utils.test_tmpfile('output_' + test_datum_name)

        # Readers are used as context managers so they are closed even when
        # an assertion below fails; previously they were never closed.
        with vcf.VcfReader(in_file) as v1_reader:
            v1_records = list(v1_reader.iterate())
            # Copy the header while the reader is still open.
            header = copy.deepcopy(v1_reader.header)
        self.assertTrue(v1_records, 'Reader failed to find records')

        writer_options = variants_pb2.VcfWriterOptions()
        with vcf_writer.VcfWriter.to_file(out_file, header,
                                          writer_options) as writer:
            for record in v1_records:
                writer.write(record)

        with vcf.VcfReader(out_file) as v2_reader:
            v2_records = list(v2_reader.iterate())

        self.assertEqual(v1_records, v2_records,
                         'Round-tripped variants not as expected')
Exemple #31
0
    def test_roundtrip(self,
                       expected_infos,
                       expected_fmt,
                       expected_fmt1,
                       expected_fmt2,
                       reader_excluded_info=None,
                       reader_excluded_format=None,
                       writer_excluded_info=None,
                       writer_excluded_format=None):
        """Reads test_py_roundtrip.vcf, writes it back and compares the text.

        Args:
          expected_infos: Per-record values substituted for {info}.
          expected_fmt: Value substituted for {fmt} in every record.
          expected_fmt1: Per-record values substituted for {efmts1}.
          expected_fmt2: Per-record values substituted for {efmts2}.
          reader_excluded_info: Passed to VcfReader as excluded_info_fields.
          reader_excluded_format: Passed to VcfReader as
            excluded_format_fields.
          writer_excluded_info: Passed to VcfWriter as excluded_info_fields.
          writer_excluded_format: Passed to VcfWriter as
            excluded_format_fields.
        """
        # Render the expected text: header followed by one formatted line per
        # record template.
        rendered = []
        for template, info, fmt1, fmt2 in zip(self.record_format_strings,
                                              expected_infos, expected_fmt1,
                                              expected_fmt2):
            rendered.append(
                template.format(info=info,
                                fmt=expected_fmt,
                                efmts1=fmt1,
                                efmts2=fmt2))
        expected = self.header + ''.join(rendered)

        input_path = test_utils.genomics_core_testdata('test_py_roundtrip.vcf')
        with vcf.VcfReader(
                input_path,
                excluded_info_fields=reader_excluded_info,
                excluded_format_fields=reader_excluded_format) as reader:

            variants = list(reader.iterate())
            output_path = test_utils.test_tmpfile('test_roundtrip_tmpfile.vcf')
            # The writer reuses the reader's header, so it is opened while
            # the reader is still active.
            with vcf.VcfWriter(
                    output_path,
                    header=reader.header,
                    excluded_info_fields=writer_excluded_info,
                    excluded_format_fields=writer_excluded_format) as writer:
                for variant in variants:
                    writer.write(variant)

        with open(output_path) as written:
            actual = written.read()
        self.assertEqual(actual, expected)
 def test_call_variants_runs_on_gpus(self, model):
   """Invokes call_variants with execution_hardware set to 'accelerator'.

   No value is asserted; the test passes when the call completes without
   raising.

   Args:
     model: Model object forwarded to call_variants.
   """
   output_path = test_utils.test_tmpfile('zzz.tfrecord')
   call_variants.call_variants(
       examples_filename=testdata.GOLDEN_CALLING_EXAMPLES,
       # Checkpoint loading is skipped via the dedicated test sentinel.
       checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
       model=model,
       execution_hardware='accelerator',
       output_file=output_path)
 def setUp(self):
     """Creates a FASTQ writer, the expected file lines and a sample record."""
     fastq_path = test_utils.test_tmpfile('output.fastq')
     self.writer = fastq_writer.FastqWriter.to_file(
         fastq_path, fastq_pb2.FastqWriterOptions())

     # Expected file content, four lines per FASTQ entry.
     self.expected_fastq_content = [
         # Entry 1.
         '@NODESC:header\n',
         'GATTACA\n',
         '+\n',
         'BB>B@FA\n',
         # Entry 2.
         '@M01321:49:000000000-A6HWP:1:1101:17009:2216 1:N:0:1\n',
         'CGTTAGCGCAGGGGGCATCTTCACACTGGTGACAGGTAACCGCCGTAGTAAAGGTTCCGCCTTTCACT\n',
         '+\n',
         'AAAAABF@BBBDGGGG?FFGFGHBFBFBFABBBHGGGFHHCEFGGGGG?FGFFHEDG3EFGGGHEGHG\n',
         # Entry 3.
         '@FASTQ contains multiple spaces in description\n',
         'CGGCTGGTCAGGCTGACATCGCCGCCGGCCTGCAGCGAGCCGCTGC\n',
         '+\n',
         'FAFAF;F/9;.:/;999B/9A.DFFF;-->.AAB/FC;9-@-=;=.\n',
         # Entry 4.
         '@FASTQ_with_trailing_space\n',
         'CGG\n',
         '+\n',
         'FAD\n',
     ]

     record_fields = dict(id='ID',
                          description='desc',
                          sequence='ACGTAC',
                          quality='ABCDEF')
     self.record = fastq_pb2.FastqRecord(**record_fields)
Exemple #34
0
  def test_call_end2end(self, compressed_inputs):
    """Runs postprocess_variants and compares its outputs to the golden files.

    Args:
      compressed_inputs: Forwarded to make_golden_dataset to build the input
        file assigned to FLAGS.infile.
    """

    def _read_lines(path):
      # Read through a context manager so the handle is closed right away
      # instead of being left open for the rest of the test run.
      with tf.gfile.FastGFile(path) as handle:
        return handle.readlines()

    FLAGS.infile = make_golden_dataset(compressed_inputs)
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.outfile = test_utils.test_tmpfile('calls.vcf')
    FLAGS.nonvariant_site_tfrecord_path = (
        testdata.GOLDEN_POSTPROCESS_GVCF_INPUT)
    FLAGS.gvcf_outfile = test_utils.test_tmpfile('gvcf_calls.vcf')

    postprocess_variants.main(['postprocess_variants.py'])

    # Both the VCF and the gVCF must match their golden files line by line.
    self.assertEqual(
        _read_lines(FLAGS.outfile),
        _read_lines(testdata.GOLDEN_POSTPROCESS_OUTPUT))
    self.assertEqual(
        _read_lines(FLAGS.gvcf_outfile),
        _read_lines(testdata.GOLDEN_POSTPROCESS_GVCF_OUTPUT))
Exemple #35
0
    def test_realigner_diagnostics(self, enabled, emit_reads):
        """Checks which diagnostic outputs the realigner leaves on disk.

        Args:
          enabled: value assigned to config.diagnostics.enabled.
          emit_reads: value assigned to
            config.diagnostics.emit_realigned_reads.
        """
        dx_dir = test_utils.test_tmpfile('dx_enabled{}_emitreads_{}'.format(
            enabled, emit_reads))
        region_str = 'chr20:10046178-10046188'
        region = ranges.parse_literal(region_str)
        assembled_region_str = 'chr20:10046096-10046267'
        reads, header = _get_reads_and_header(region)
        self.config = realigner.realigner_config(FLAGS)
        self.config.diagnostics.enabled = enabled
        self.config.diagnostics.output_root = dx_dir
        self.config.diagnostics.emit_realigned_reads = emit_reads
        self.reads_realigner = realigner.Realigner(self.config,
                                                   self.ref_reader, header)
        _, _ = self.reads_realigner.realign_reads(reads, region)
        dx_logger = self.reads_realigner.diagnostic_logger
        dx_logger.close()  # Force close all resources.

        if not enabled:
            # Make sure our diagnostic output isn't emitted.
            self.assertFalse(tf.io.gfile.exists(dx_dir))
            return

        # Our root directory exists.
        self.assertTrue(tf.io.gfile.isdir(dx_dir))

        # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
        metrics_file = os.path.join(dx_dir, dx_logger.metrics_filename)
        self.assertTrue(tf.io.gfile.exists(metrics_file))
        with tf.io.gfile.GFile(metrics_file) as fin:
            rows = list(csv.DictReader(fin))
            self.assertLen(rows, 1)
            self.assertEqual(set(rows[0].keys()),
                             {'window', 'k', 'n_haplotypes', 'time'})
            self.assertEqual(rows[0]['window'], assembled_region_str)
            self.assertEqual(int(rows[0]['k']), 25)
            # assertTrue(value, 2) would treat 2 as the failure message and
            # only check truthiness; compare against the expected count.
            self.assertEqual(int(rows[0]['n_haplotypes']), 2)
            # Check that our runtime is reasonable (greater than 0, less than
            # 10 s).
            self.assertTrue(0.0 < float(rows[0]['time']) < 10.0)

        # As does the subdirectory for this region.
        region_subdir = os.path.join(dx_dir, assembled_region_str)
        self.assertTrue(tf.io.gfile.isdir(region_subdir))

        # We always have a graph.dot
        self.assertTrue(
            tf.io.gfile.exists(
                os.path.join(region_subdir, dx_logger.graph_filename)))

        reads_file = os.path.join(dx_dir, region_str,
                                  dx_logger.realigned_reads_filename)

        # if emit_reads=False then file should not exist and vice versa.
        self.assertEqual(emit_reads, tf.io.gfile.exists(reads_file))
 def _run():
   # One batch of size one over the golden calling examples; `self` and
   # `hardware_env` are free variables resolved from the surrounding scope.
   output_path = test_utils.test_tmpfile('zzz.tfrecord')
   call_variants.call_variants(
       examples_filename=testdata.GOLDEN_CALLING_EXAMPLES,
       checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
       model=self.model,
       execution_hardware=hardware_env,
       batch_size=1,
       max_batches=1,
       output_file=output_path)
  def test_call_variants_with_invalid_format(self, model, bad_format):
    """call_variants raises ValueError when image/format is `bad_format`.

    Args:
      model: model object handed to call_variants.
      bad_format: value written into the example's image/format feature.
    """
    # Start from a single well-formed golden example and corrupt only its
    # image/format feature (the original note says anything but 'raw' is
    # invalid).
    example = next(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))
    format_feature = example.features.feature['image/format']
    format_feature.bytes_list.value[0] = bad_format

    source_path = test_utils.test_tmpfile('make_examples_output.tfrecord')
    io_utils.write_tfrecords([example], source_path)
    outfile = test_utils.test_tmpfile('call_variants_invalid_format.tfrecord')

    call_kwargs = dict(
        examples_filename=source_path,
        checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
        model=model,
        output_file=outfile,
        batch_size=1,
        max_batches=1)
    with self.assertRaises(ValueError):
      call_variants.call_variants(**call_kwargs)
Exemple #38
0
 def test_roundtrip_writer(self, filename):
   """Copies a test data file through SamWriter and re-reads the copy.

   Args:
     filename: name of the input under genomics_core_testdata; the same name
       is reused for the temporary output file.
   """
   output_path = test_utils.test_tmpfile(filename)
   input_path = test_utils.genomics_core_testdata(filename)
   # Open the source reader in a `with` block so it is closed once the copy
   # is written, even when an intermediate step raises.
   with sam.SamReader(input_path) as original_reader:
     original_records = list(original_reader.iterate())
     with sam.SamWriter(
         output_path, header=original_reader.header) as writer:
       for record in original_records:
         writer.write(record)
   with sam.SamReader(output_path) as new_reader:
     self.assertEqual(original_records, list(new_reader.iterate()))
 def _run():
     # One batch of size one over the golden calling examples; `self` and
     # `hardware_env` are free variables resolved from the surrounding scope.
     output_path = test_utils.test_tmpfile('zzz.tfrecord')
     call_variants.call_variants(
         examples_filename=testdata.GOLDEN_CALLING_EXAMPLES,
         checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
         model=self.model,
         execution_hardware=hardware_env,
         batch_size=1,
         max_batches=1,
         output_file=output_path)
Exemple #40
0
  def test_make_examples_with_allele_frequency(self, mode):
    """Runs make_examples with use_allele_frequency and checks the output.

    Args:
      mode: 'one vcf' or 'two vcfs'; selects how FLAGS.population_vcfs is
        populated. Any other value raises ValueError.
    """
    FLAGS.mode = 'calling'
    FLAGS.ref = testdata.GRCH38_FASTA
    FLAGS.reads = testdata.GRCH38_CHR20_AND_21_BAM
    num_shards = 1
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    region = ranges.parse_literal('chr20:61001-62000')
    FLAGS.use_allele_frequency = True
    FLAGS.regions = [ranges.to_literal(region)]

    if mode not in ('one vcf', 'two vcfs'):
      raise ValueError('Invalid mode for parameterized test.')
    if mode == 'one vcf':
      FLAGS.population_vcfs = testdata.AF_VCF_CHR20_AND_21
    else:
      FLAGS.population_vcfs = ' '.join(
          [testdata.AF_VCF_CHR20, testdata.AF_VCF_CHR21])

    # Run make_examples with the flags above.
    options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(options)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=False)

    # Pileup images should have one extra channel.
    expected_shape = [100, 221, dv_constants.PILEUP_NUM_CHANNELS + 1]
    self.assertEqual(expected_shape,
                     decode_example(examples[0])['image/shape'])

    # Track which of the sample loci show up among the generated examples;
    # each one must carry signal in the added channel.
    population_matched_loci = {
        'chr20:61539_A': False,
        'chr20:61634_G': False,
        'chr20:61644_G': False
    }
    channel_msg = ('There should be something in the %s-th channel for '
                   'variant %s')

    for example in examples:
      locus_id = vis.locus_id_from_variant(vis.variant_from_example(example))
      if locus_id not in population_matched_loci:
        continue
      channels = vis.channels_from_example(example)
      added_channel_sum = np.sum(channels[dv_constants.PILEUP_NUM_CHANNELS])
      self.assertGreater(
          added_channel_sum,
          0,
          msg=channel_msg % (dv_constants.PILEUP_NUM_CHANNELS + 1, locus_id))
      population_matched_loci[locus_id] = True

    every_locus_seen = all(population_matched_loci.values())
    self.assertTrue(
        every_locus_seen,
        msg='Check that all 3 sample loci appeared in the examples.')

    # Check against the golden file (same for both modes).
    golden_file = _sharded(testdata.GOLDEN_ALLELE_FREQUENCY_EXAMPLES)
    examples_from_golden = list(tfrecord.read_tfrecords(golden_file))
    self.assertDeepVariantExamplesEqual(examples_from_golden, examples)
 def test_call_end2end_with_empty_shards(self):
   """call_variants emits one record per example when shards outnumber them."""
   max_records = 10
   num_shards = 15
   # At most `max_records` examples are spread over `num_shards` shards, so
   # several shards are expected to be empty.
   examples = list(
       io_utils.read_tfrecords(
           testdata.GOLDEN_CALLING_EXAMPLES, max_records=max_records))
   source_path = test_utils.test_tmpfile('sharded@{}'.format(num_shards))
   io_utils.write_tfrecords(examples, source_path)
   expected_count = len(examples)
   self.assertCallVariantsEmitsNRecordsForRandomGuess(source_path,
                                                      expected_count)
Exemple #42
0
  def test_roundtrip_writer(self, filename):
    """Writes self.records with FastqWriter and reads them back unchanged.

    Args:
      filename: name of the temporary file used for the round trip.
    """
    roundtrip_path = test_utils.test_tmpfile(filename)

    with fastq.FastqWriter(roundtrip_path) as fastq_writer:
      for fastq_record in self.records:
        fastq_writer.write(fastq_record)

    with fastq.FastqReader(roundtrip_path) as fastq_reader:
      records_read_back = list(fastq_reader.iterate())

    self.assertEqual(self.records, records_read_back)
  def test_catches_bad_flags(self):
    """main() logs an error and exits when confident_regions is empty."""
    # Configure an otherwise complete training-mode invocation.
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile('vsc.tfrecord')
    FLAGS.examples = test_utils.test_tmpfile('examples.tfrecord')
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'training'
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    # The one deliberately invalid flag.
    FLAGS.confident_regions = ''

    expected_message = 'confident_regions is required when in training mode.'
    # Patch both the error logger and sys.exit so main() returns to the test.
    with mock.patch.object(logging, 'error') as mock_logging:
      with mock.patch.object(sys, 'exit') as mock_exit:
        make_examples.main(['make_examples.py'])
    mock_logging.assert_called_once_with(expected_message)
    mock_exit.assert_called_once_with(errno.ENOENT)
def make_golden_dataset(compressed_inputs=False):
  """Builds a DeepVariantDataSet over the golden training examples.

  Args:
    compressed_inputs: if truthy, the golden examples are first copied to a
      temp file named 'make_golden_dataset.tfrecord.gz' and the dataset
      points at that copy; otherwise it points at the golden file itself.

  Returns:
    A data_providers.DeepVariantDataSet named 'labeled_golden'.
  """
  source_path = testdata.GOLDEN_TRAINING_EXAMPLES
  if compressed_inputs:
    source_path = test_utils.test_tmpfile('make_golden_dataset.tfrecord.gz')
    golden_examples = io_utils.read_tfrecords(
        testdata.GOLDEN_TRAINING_EXAMPLES)
    io_utils.write_tfrecords(golden_examples, source_path)
  return data_providers.DeepVariantDataSet(
      name='labeled_golden',
      source=source_path,
      num_examples=testdata.N_GOLDEN_TRAINING_EXAMPLES)
 def test_call_variants_with_no_shape(self, model):
   """prepare_inputs raises ValueError when image/shape is missing.

   Args:
     model: model object handed to call_variants.prepare_inputs.
   """
   # Take one valid golden example and strip its image/shape feature.
   example = next(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))
   del example.features.feature['image/shape']

   source_path = test_utils.test_tmpfile('make_examples_out_noshape.tfrecord')
   io_utils.write_tfrecords([example], source_path)

   expected_error = ('Invalid image/shape: we expect to find an image/shape '
                     'field with length 3.')
   with self.assertRaisesRegexp(ValueError, expected_error):
     call_variants.prepare_inputs(source_path, model, batch_size=1)
def make_golden_dataset(compressed_inputs=False):
  """Returns a path to the golden postprocess input records.

  Args:
    compressed_inputs: if truthy, the golden CallVariantsOutput records are
      copied to a temp file named
      'golden.postprocess_single_site_input.tfrecord.gz' and that path is
      returned; otherwise the golden path itself is returned.

  Returns:
    The path to read the records from.
  """
  if not compressed_inputs:
    return testdata.GOLDEN_POSTPROCESS_INPUT

  source_path = test_utils.test_tmpfile(
      'golden.postprocess_single_site_input.tfrecord.gz')
  golden_records = io_utils.read_tfrecords(
      testdata.GOLDEN_POSTPROCESS_INPUT,
      proto=deepvariant_pb2.CallVariantsOutput)
  io_utils.write_tfrecords(golden_records, source_path)
  return source_path
Exemple #47
0
 def _parse_read_with_aux_tags(self, tag_string):
   """Writes a one-read text file ending in `tag_string` and parses it.

   Args:
     tag_string: text appended after the stock read's fixed columns.

   Returns:
     The list of reads yielded by a SamReader opened with
     parse_aux_fields=True.
   """
   # Minimal header needed for the file to be accepted.
   header_lines = ('@HD\tVN:1.3\tSO:coordinate\n'
                   '@SQ\tSN:chr1\tLN:248956422\n')
   # Stock read whose trailing columns are supplied by the caller.
   read_line = ('read_name\t0\tchr1\t1\t0\t3M\t*\t0\t0\tCCC\tAAA\t' +
                tag_string + '\n')
   path = test_utils.test_tmpfile('aux_tags.bam')
   with gfile.FastGFile(path, 'w') as fout:
     for chunk in (header_lines, read_line):
       fout.write(chunk)
   with sam.SamReader(
       path, use_index=False, parse_aux_fields=True) as reader:
     parsed_reads = list(reader.iterate())
   return parsed_reads
 def test_catches_bad_argv(self):
   """main() logs an error and exits when given a positional argument."""
   # Keep every flag valid so that only the extra argv entry can trigger the
   # failure.
   FLAGS.infile = make_golden_dataset(False)
   FLAGS.ref = testdata.CHR20_FASTA
   FLAGS.outfile = test_utils.test_tmpfile('nonempty_outfile.vcf')

   argv = ['postprocess_variants.py', 'extra_arg']
   expected_message = (
       'Command line parsing failure: postprocess_variants does not accept '
       'positional arguments but some are present on the command line: '
       '"[\'postprocess_variants.py\', \'extra_arg\']".')
   with mock.patch.object(logging, 'error') as mock_logging:
     with mock.patch.object(sys, 'exit') as mock_exit:
       postprocess_variants.main(argv)
   mock_logging.assert_called_once_with(expected_message)
   mock_exit.assert_called_once_with(errno.ENOENT)
 def test_call_variants_non_accelerated_execution_runs(self,
                                                       execution_hardware):
   """call_variants completes for the given execution_hardware setting.

   Kept alongside the similar parameterized test because, per the original
   note, this one does not mock out the list_devices call.

   Args:
     execution_hardware: value passed straight through to call_variants.
   """
   run_kwargs = dict(
       examples_filename=testdata.GOLDEN_CALLING_EXAMPLES,
       checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
       model=self.model,
       execution_hardware=execution_hardware,
       max_batches=1,
       batch_size=1,
       output_file=test_utils.test_tmpfile('call_variants_cpu_only.tfrecord'))
   call_variants.call_variants(**run_kwargs)
 def assertCallVariantsEmitsNRecordsForRandomGuess(self, filename,
                                                   num_examples):
   """Asserts the 'random_guess' model emits `num_examples` outputs.

   Args:
     filename: examples path handed to call_variants.
     num_examples: expected number of CallVariantsOutput records written.
   """
   outfile = test_utils.test_tmpfile('call_variants.tfrecord')
   call_variants.call_variants(
       examples_filename=filename,
       checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
       model=modeling.get_model('random_guess'),
       batch_size=4,
       max_batches=None,
       output_file=outfile)
   emitted_records = list(
       io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))
   # The number of output protos must match the number of input examples.
   self.assertEqual(len(emitted_records), num_examples)
Exemple #51
0
  def test_make_read_writer_tfrecords(self):
    """SamWriter writes reads to a .tfrecord path that can be read back."""
    outfile = test_utils.test_tmpfile('test.tfrecord')
    expected_reads = [self.read1, self.read2]
    writer = sam.SamWriter(outfile, header=self.header)

    # The writer must work as a context manager and accept reads inside it.
    with writer:
      writer.write(self.read1)
      writer.write(self.read2)

    # The output should contain exactly the two reads written, in order.
    written_reads = list(
        io_utils.read_tfrecords(outfile, proto=reads_pb2.Read))
    self.assertEqual(expected_reads, written_reads)
 def _run_tiny_training(self, model_name, dataset):
   """Runs a one-step training job against a mocked dataset lookup.

   Args:
     model_name: assigned to FLAGS.model_name and used as the train_dir name.
     dataset: object returned by the patched data_providers.get_dataset.
   """
   patch_target = 'deepvariant.data_providers.get_dataset'
   with mock.patch(patch_target) as patched_get_dataset:
     patched_get_dataset.return_value = dataset

     FLAGS.train_dir = test_utils.test_tmpfile(model_name)
     FLAGS.batch_size = 2
     FLAGS.model_name = model_name
     FLAGS.save_interval_secs = 0
     FLAGS.number_of_steps = 1
     FLAGS.dataset_config_pbtxt = '/path/to/mock.pbtxt'
     FLAGS.start_from_checkpoint = ''

     model_train.parse_and_run()

     # The dataset lookup happened exactly once, with the configured path.
     patched_get_dataset.assert_called_once_with(FLAGS.dataset_config_pbtxt)
     # A checkpoint exists in train_dir after training.
     newest_checkpoint = tf.train.latest_checkpoint(FLAGS.train_dir)
     self.assertIsNotNone(newest_checkpoint)
  def test_reading_sharded_dataset(self, compressed_inputs):
    """A dataset configured with a sharded path matches the golden dataset.

    Args:
      compressed_inputs: forwarded to make_golden_dataset.
    """
    golden_dataset = make_golden_dataset(compressed_inputs)

    # Re-write the golden examples under a 'sharded@3' temp path.
    n_shards = 3
    sharded_path = test_utils.test_tmpfile('sharded@{}'.format(n_shards))
    golden_examples = io_utils.read_tfrecords(golden_dataset.source)
    io_utils.write_tfrecords(golden_examples, sharded_path)

    config_file = _test_dataset_config(
        'test_sharded.pbtxt',
        name='sharded_test',
        tfrecord_path=sharded_path,
        num_examples=golden_dataset.num_examples)

    loaded_dataset = data_providers.get_dataset(config_file)
    self.assertDataSetExamplesMatchExpected(
        loaded_dataset.get_slim_dataset(), golden_dataset)
Exemple #54
0
 def write_variant_to_tempfile(self, variant):
   """Writes `variant` to a temporary 'test.vcf' file and returns its path.

   Args:
     variant: variant to write; its calls supply the header's sample names.

   Returns:
     Path of the VCF file that was written.
   """
   output_path = test_utils.test_tmpfile('test.vcf')

   # Header pieces: a single contig, one sample per call, and the two FORMAT
   # fields declared below.
   contigs = [reference_pb2.ContigInfo(name='20')]
   sample_names = [call.call_set_name for call in variant.calls]
   depth_format = variants_pb2.VcfFormatInfo(
       id='DP', number='1', type='Integer', description='Read depth')
   allele_depth_format = variants_pb2.VcfFormatInfo(
       id='AD',
       number='R',
       type='Integer',
       description='Read depth for each allele')
   header = variants_pb2.VcfHeader(
       contigs=contigs,
       sample_names=sample_names,
       formats=[depth_format, allele_depth_format])

   writer = vcf.VcfWriter(output_path, header=header)
   with writer:
     writer.write(variant)
   return output_path
  def test_prepare_inputs(self, filename, expand_to_file_pattern):
    """prepare_inputs yields every variant written to the source path.

    Args:
      filename: temp file name (possibly a sharded spec) to write examples to.
      expand_to_file_pattern: if truthy, the path is converted with
        NormalizeToShardedFilePattern before being read.
    """
    source_path = test_utils.test_tmpfile(filename)
    io_utils.write_tfrecords(self.examples, source_path)
    if expand_to_file_pattern:
      # Transform foo@3 to foo-?????-of-00003.
      source_path = io_utils.NormalizeToShardedFilePattern(source_path)

    with self.test_session() as sess:
      _, variants, _ = call_variants.prepare_inputs(
          source_path, self.model, batch_size=1)
      sess.run(tf.local_variables_initializer())
      sess.run(tf.global_variables_initializer())

      # Drain the input pipeline until it signals exhaustion.
      seen_variants = []
      while True:
        try:
          batch = sess.run(variants)
        except tf.errors.OutOfRangeError:
          break
        seen_variants.extend(batch)

      decoded_variants = variant_utils.decode_variants(seen_variants)
      self.assertItemsEqual(self.variants, decoded_variants)
 def test_call_end2end_zero_record_file(self):
   """call_variants emits no records for a file holding zero examples."""
   empty_name = 'zero_record_file'
   io_utils.write_tfrecords([], test_utils.test_tmpfile(empty_name))
   expected_count = 0
   self.assertCallVariantsEmitsNRecordsForRandomGuess(
       test_utils.test_tmpfile(empty_name), expected_count)
  def test_call_end2end(self, model, shard_inputs, include_debug_info):
    """Runs call_variants on the golden examples and validates its outputs.

    Args:
      model: model object passed to call_variants; `model.name` decides
        whether every batch or only a single batch is run.
      shard_inputs: if truthy, the golden examples are first re-written to a
        'sharded@3' temp path and read from there.
      include_debug_info: value assigned to FLAGS.include_debug_info; also
        selects which debug_info checks are applied to the outputs.
    """
    FLAGS.include_debug_info = include_debug_info
    examples = list(io_utils.read_tfrecords(testdata.GOLDEN_CALLING_EXAMPLES))

    if shard_inputs:
      # Create a sharded version of our golden examples.
      source_path = test_utils.test_tmpfile('sharded@{}'.format(3))
      io_utils.write_tfrecords(examples, source_path)
    else:
      source_path = testdata.GOLDEN_CALLING_EXAMPLES

    batch_size = 4
    if model.name == 'random_guess':
      # For the random guess model we can run everything.
      max_batches = None
    else:
      # For all other models we only run a single batch for inference.
      max_batches = 1

    outfile = test_utils.test_tmpfile('call_variants.tfrecord')
    call_variants.call_variants(
        examples_filename=source_path,
        checkpoint_path=modeling.SKIP_MODEL_INITIALIZATION_IN_TEST,
        model=model,
        output_file=outfile,
        batch_size=batch_size,
        max_batches=max_batches)

    call_variants_outputs = list(
        io_utils.read_tfrecords(outfile, deepvariant_pb2.CallVariantsOutput))

    # Check that we have the right number of output protos.
    if max_batches:
      expected_n_outputs = batch_size * max_batches
    else:
      expected_n_outputs = len(examples)
    self.assertEqual(len(call_variants_outputs), expected_n_outputs)

    # Check that our CallVariantsOutput (CVO) have the following critical
    # properties:
    # - we have one CVO for each example we processed.
    # - the variant in the CVO is exactly what was in the example.
    # - the alt_allele_indices of the CVO match those of its corresponding
    #   example.
    # - there are 3 genotype probabilities and these are between 0.0 and 1.0.
    # We can only do this test when processing all of the variants (max_batches
    # is None), since we processed all of the examples with that model.
    if max_batches is None:
      self.assertItemsEqual([cvo.variant for cvo in call_variants_outputs],
                            [tf_utils.example_variant(ex) for ex in examples])

    # Check the CVO debug_info: not filled if include_debug_info is False;
    # else, filled by logic based on CVO.
    if not include_debug_info:
      for cvo in call_variants_outputs:
        self.assertEqual(cvo.debug_info,
                         deepvariant_pb2.CallVariantsOutput.DebugInfo())
    else:
      for cvo in call_variants_outputs:
        self.assertEqual(cvo.debug_info.has_insertion,
                         variant_utils.has_insertion(cvo.variant))
        self.assertEqual(cvo.debug_info.has_deletion,
                         variant_utils.has_deletion(cvo.variant))
        self.assertEqual(cvo.debug_info.is_snp, variant_utils.is_snp(
            cvo.variant))
        self.assertEqual(cvo.debug_info.predicted_label,
                         np.argmax(cvo.genotype_probabilities))

    def example_matches_call_variants_output(example, call_variants_output):
      return (tf_utils.example_variant(example) == call_variants_output.variant
              and tf_utils.example_alt_alleles_indices(
                  example) == call_variants_output.alt_allele_indices.indices)

    for call_variants_output in call_variants_outputs:
      # Find all matching examples.
      matches = [
          ex for ex in examples
          if example_matches_call_variants_output(ex, call_variants_output)
      ]
      # We should have exactly one match.
      self.assertEqual(len(matches), 1)
      example = matches[0]
      # Check that we've faithfully copied in the alt alleles (though currently
      # as implemented we find our example using this information so it cannot
      # fail). Included here in case that changes in the future.
      self.assertEqual(
          list(tf_utils.example_alt_alleles_indices(example)),
          list(call_variants_output.alt_allele_indices.indices))
      # We should have exactly three genotype probabilities (assuming our
      # ploidy == 2).
      self.assertEqual(len(call_variants_output.genotype_probabilities), 3)
      # These are probabilities so they should be between 0 and 1. The
      # generator must be reduced with all(): a bare generator object is
      # always truthy, which would make this assertion vacuous.
      self.assertTrue(
          all(0 <= gp <= 1
              for gp in call_variants_output.genotype_probabilities))
  def test_make_examples_end2end(self, mode, num_shards,
                                 labeler_algorithm=None):
    """Runs make_examples for every task and checks outputs against goldens.

    Args:
      mode: 'calling' or 'training'; anything else fails the assertIn below.
      num_shards: shard count used for the output paths; one task is run per
        shard, and at least one task is always run.
      labeler_algorithm: if not None, assigned to FLAGS.labeler_algorithm.
    """
    self.maxDiff = None
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    # Calling mode additionally writes gVCF records; training mode needs the
    # truth variants and confident regions instead.
    if mode == 'calling':
      FLAGS.gvcf = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    # Run one task per shard. `options` from the last iteration is reused by
    # the verification helpers below; max(..., 1) guarantees it is bound.
    for task_id in range(max(num_shards, 1)):
      FLAGS.task = task_id
      options = make_examples.default_options(add_flags=True)
      make_examples.make_examples_runner(options)

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        io_utils.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants(
        [call.variant for call in candidates], region, options, is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
      golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
      golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(io_utils.read_tfrecords(golden_file)))

    if mode == 'calling':
      nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      nist_variants = list(nist_reader.query(region))
      self.verify_nist_concordance(example_variants, nist_variants)

      # Check the quality of our generated gvcf file.
      gvcfs = variant_utils.sorted_variants(
          io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcfs, region, options, is_gvcf=True)
      self.verify_contiguity(gvcfs, region)
      gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      expected_gvcfs = list(
          io_utils.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
      self.assertItemsEqual(gvcfs, expected_gvcfs)
 def test_call_variants_with_empty_input(self):
   """prepare_inputs does not raise on a file containing no examples."""
   empty_path = test_utils.test_tmpfile('empty.tfrecord')
   io_utils.write_tfrecords([], empty_path)
   random_guess_model = modeling.get_model('random_guess')
   # The test passes as long as this call returns without raising.
   call_variants.prepare_inputs(empty_path, random_guess_model, batch_size=1)
Exemple #60
0
 def write_test_protos(self, filename):
   """Writes ten ContigInfo protos named '0'..'9' to a temp file.

   Args:
     filename: name of the temporary file to write.

   Returns:
     A (protos, path) tuple: the protos written and the file's path.
   """
   protos = []
   for index in range(10):
     protos.append(reference_pb2.ContigInfo(name=str(index)))
   path = test_utils.test_tmpfile(filename)
   io.write_tfrecords(protos, path)
   return protos, path