def test_query_raises_with_bad_range(self):
   with sam_reader.SamReader.from_file(self.bam,
                                       self.indexed_options) as reader:
     with self.assertRaisesRegexp(ValueError, 'Unknown reference_name'):
       reader.query(ranges.parse_literal('XXX:1-10'))
     with self.assertRaisesRegexp(ValueError, 'unknown reference interval'):
       reader.query(ranges.parse_literal('chr20:10-5'))
 def test_bam_query(self):
   reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
   expected = [(ranges.parse_literal('chr20:10,000,000-10,000,100'), 106),
               (ranges.parse_literal('chr20:10,000,000-10,000,000'), 45)]
   with reader:
     for interval, n_expected in expected:
       with reader.query(interval) as iterable:
         self.assertIsInstance(iterable, clif_postproc.WrappedCppIterable)
         self.assertEqual(test_utils.iterable_len(iterable), n_expected)
Example #3
 def test_sam_query(self):
   reader = sam.SamReader(
       test_utils.genomics_core_testdata('test.bam'))
   expected = [(ranges.parse_literal('chr20:10,000,000-10,000,100'), 106),
               (ranges.parse_literal('chr20:10,000,000-10,000,000'), 45)]
   with reader:
     for interval, n_expected in expected:
       with reader.query(interval) as iterable:
         self.assertEqual(test_utils.iterable_len(iterable), n_expected)
 def test_query_raises_with_bad_range(self):
   with self.assertRaisesRegexp(ValueError, 'Unknown reference_name'):
     self.samples_reader.query(ranges.parse_literal('XXX:1-10'))
   with self.assertRaisesRegexp(ValueError, 'Malformed region'):
     self.samples_reader.query(ranges.parse_literal('chr1:0-5'))
   with self.assertRaisesRegexp(ValueError, 'Malformed region'):
     self.samples_reader.query(ranges.parse_literal('chr1:6-5'))
   with self.assertRaisesRegexp(ValueError, 'Malformed region'):
     self.samples_reader.query(ranges.parse_literal('chr1:10-5'))
Example #5
 def test_query_on_unindexed_reader_raises(self):
     window = ranges.parse_literal('chr1:10,000,000-10,000,100')
     unindexed_file = test_utils.genomics_core_testdata('test_samples.vcf')
     with vcf_reader.VcfReader.from_file(unindexed_file,
                                         self.options) as reader:
         with self.assertRaisesRegexp(ValueError,
                                      'Cannot query without an index'):
             reader.query(window)
Example #6
    def test_realigner_diagnostics(self, enabled, emit_reads):
        # Make sure that by default we aren't emitting any diagnostic outputs.
        dx_dir = test_utils.test_tmpfile('dx_enabled{}_emitreads_{}'.format(
            enabled, emit_reads))
        region_str = 'chr20:10046178-10046188'
        region = ranges.parse_literal(region_str)
        assembled_region_str = 'chr20:10046096-10046267'
        reads, header = _get_reads_and_header(region)
        self.config = realigner.realigner_config(FLAGS)
        self.config.diagnostics.enabled = enabled
        self.config.diagnostics.output_root = dx_dir
        self.config.diagnostics.emit_realigned_reads = emit_reads
        self.reads_realigner = realigner.Realigner(self.config,
                                                   self.ref_reader, header)
        _, _ = self.reads_realigner.realign_reads(reads, region)
        self.reads_realigner.diagnostic_logger.close(
        )  # Force close all resources.

        if not enabled:
            # Make sure our diagnostic output isn't emitted.
            self.assertFalse(tf.io.gfile.exists(dx_dir))
        else:
            # Our root directory exists.
            self.assertTrue(tf.io.gfile.isdir(dx_dir))

            # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
            metrics_file = os.path.join(
                dx_dir,
                self.reads_realigner.diagnostic_logger.metrics_filename)
            self.assertTrue(tf.io.gfile.exists(metrics_file))
            with tf.io.gfile.GFile(metrics_file) as fin:
                rows = list(csv.DictReader(fin))
                self.assertLen(rows, 1)
                self.assertEqual(set(rows[0].keys()),
                                 {'window', 'k', 'n_haplotypes', 'time'})
                self.assertEqual(rows[0]['window'], assembled_region_str)
                self.assertEqual(int(rows[0]['k']), 25)
                self.assertEqual(int(rows[0]['n_haplotypes']), 2)
                # Check that our runtime is reasonable (greater than 0, less than 10 s).
                self.assertTrue(0.0 < float(rows[0]['time']) < 10.0)

            # As does the subdirectory for this region.
            region_subdir = os.path.join(dx_dir, assembled_region_str)
            self.assertTrue(tf.io.gfile.isdir(region_subdir))

            # We always have a graph.dot
            self.assertTrue(
                tf.io.gfile.exists(
                    os.path.join(
                        region_subdir, self.reads_realigner.diagnostic_logger.
                        graph_filename)))

            reads_file = os.path.join(
                dx_dir, region_str, self.reads_realigner.diagnostic_logger.
                realigned_reads_filename)

            # If emit_reads is False, the file should not exist, and vice versa.
            self.assertEqual(emit_reads, tf.io.gfile.exists(reads_file))
 def test_ops_on_closed_reader_raise(self):
   reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
   with reader:
     pass
   # At this point the reader is closed.
   with self.assertRaisesRegexp(ValueError, 'Cannot Iterate a closed'):
     reader.iterate()
   with self.assertRaisesRegexp(ValueError, 'Cannot Query a closed'):
     reader.query(ranges.parse_literal('chr20:10,000,000-10,000,100'))
Example #9
  def test_fail_multiple_concurrent_iterations(self):
    range1 = ranges.parse_literal('chr3:100,000-500,000')
    reads = self.samples_reader.query(range1)
    for read in reads:
      pass

    r2 = self.samples_reader.query(range1)
    with self.assertRaisesRegexp(ValueError, 'No underlying iterable. This '):
      next(r2)
Example #10
  def test_vcf_query(self):
    tabix.build_index(self.output_file)
    self.input_reader = vcf.VcfReader(self.input_file)
    self.output_reader = vcf.VcfReader(self.output_file)

    range1 = ranges.parse_literal('chr3:100,000-500,000')
    self.assertEqual(
        list(self.input_reader.query(range1)),
        list(self.output_reader.query(range1)))
Example #11
  def test_make_examples_with_allele_frequency(self, mode):
    FLAGS.mode = 'calling'
    FLAGS.ref = testdata.GRCH38_FASTA
    FLAGS.reads = testdata.GRCH38_CHR20_AND_21_BAM
    num_shards = 1
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    region = ranges.parse_literal('chr20:61001-62000')
    FLAGS.use_allele_frequency = True
    FLAGS.regions = [ranges.to_literal(region)]
    if mode == 'one vcf':
      FLAGS.population_vcfs = testdata.AF_VCF_CHR20_AND_21
    elif mode == 'two vcfs':
      FLAGS.population_vcfs = ' '.join(
          [testdata.AF_VCF_CHR20, testdata.AF_VCF_CHR21])
    else:
      raise ValueError('Invalid mode for parameterized test.')
    options = make_examples.default_options(add_flags=True)
    # Run make_examples with the flags above.
    make_examples_core.make_examples_runner(options)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=False)

    # Pileup images should have one extra channel.
    self.assertEqual([100, 221, dv_constants.PILEUP_NUM_CHANNELS + 1],
                     decode_example(examples[0])['image/shape'])

    # Test there is something in the added channel.
    # Values capture whether each locus has been seen in the observed examples.
    population_matched_loci = {
        'chr20:61539_A': False,
        'chr20:61634_G': False,
        'chr20:61644_G': False
    }

    for example in examples:
      locus_id = vis.locus_id_from_variant(vis.variant_from_example(example))
      if locus_id in population_matched_loci.keys():
        channels = vis.channels_from_example(example)
        self.assertGreater(
            np.sum(channels[dv_constants.PILEUP_NUM_CHANNELS]),
            0,
            msg='There should be '
            'something in the %s-th channel for variant '
            '%s' % (dv_constants.PILEUP_NUM_CHANNELS + 1, locus_id))
        population_matched_loci[locus_id] = True
    self.assertTrue(
        all(population_matched_loci.values()),
        msg='Check that all '
        '3 sample loci appeared in the examples.')

    # Check against the golden file (same for both modes).
    golden_file = _sharded(testdata.GOLDEN_ALLELE_FREQUENCY_EXAMPLES)
    examples_from_golden = list(tfrecord.read_tfrecords(golden_file))
    self.assertDeepVariantExamplesEqual(examples_from_golden, examples)
 def test_query_without_index_raises(self, unindexed_file_name):
     path = test_utils.genomics_core_testdata(unindexed_file_name)
     window = ranges.parse_literal('chr20:10,000,000-10,000,100')
     with sam_reader.SamReader.from_file(reads_path=path,
                                         ref_path='',
                                         options=self.options) as reader:
         with self.assertRaisesRegexp(ValueError,
                                      'Cannot query without an index'):
             reader.query(window)
Example #13
 def test_ops_on_closed_reader_raise(self):
   with self.samples_reader:
     pass
   # At this point the reader is closed.
   with self.assertRaisesRegexp(ValueError, 'Cannot Iterate a closed'):
     self.samples_reader.iterate()
   with self.assertRaisesRegexp(ValueError, 'Cannot Query a closed'):
     self.samples_reader.query(
         ranges.parse_literal('chr1:10,000,000-10,000,100'))
 def test_context_manager(self):
   """Test that we can use context manager to do two queries in sequence."""
   reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
   region = ranges.parse_literal('chr20:10,000,000-10,000,100')
   with reader:
     with reader.query(region) as query_iterable1:
       self.assertIsNotNone(query_iterable1)
       self.assertIsInstance(query_iterable1, clif_postproc.WrappedCppIterable)
     with reader.query(region) as query_iterable2:
       self.assertIsNotNone(query_iterable2)
       self.assertIsInstance(query_iterable2, clif_postproc.WrappedCppIterable)
  def test_straightforward_region(self):
    ref_reader = fasta.RefFastaReader(testdata.CHR20_FASTA)
    bam_reader = sam.SamReader(testdata.CHR20_BAM)
    region = ranges.parse_literal('chr20:10,000,000-10,000,100')
    ref_seq = ref_reader.query(region)

    all_reads = list(bam_reader.query(region))
    dbg30 = debruijn_graph.build(ref_seq, all_reads,
                                 self.single_k_dbg_options(30))
    self.assertIsNotNone(dbg30)
    self.assertEqual([ref_seq], dbg30.candidate_haplotypes())
    def test_straightforward_region(self):
        ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
        bam_reader = sam.SamReader(testdata.CHR20_BAM)
        region = ranges.parse_literal('chr20:10,000,000-10,000,100')
        ref_seq = ref_reader.query(region)

        all_reads = list(bam_reader.query(region))
        dbg30 = debruijn_graph.build(ref_seq, all_reads,
                                     self.single_k_dbg_options(30))
        self.assertIsNotNone(dbg30)
        self.assertEqual([ref_seq], dbg30.candidate_haplotypes())
Example #17
def model_evaluation_runner(truth_variants, reads, ref, input_model_pckl,
                            eval_region, output_report_csv):
    """Outputs precision-recall for a sklearn model using AlleleCount features.

  Args:
    truth_variants: path to the VCF.
    reads: path to the reads BAM.
    ref: path to the reference FASTA.
    input_model_pckl: path to read the LogisticRegression pickle from.
    eval_region: str, region to evaluate on in the 'chr:start-end',
      'chr:position' or 'chr' format.
    output_report_csv: path to the output report csv.

  Raises:
    ValueError: if eval_region cannot be parsed.
  """
    sam_reader = sam.SamReader(reads)
    ref_reader = fasta.IndexedFastaReader(ref)

    read_reqs = reads_pb2.ReadRequirements(
        min_base_quality=10,
        min_mapping_quality=10,
        min_base_quality_mode=reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT)
    allele_counter_options = deepvariant_pb2.AlleleCounterOptions(
        partition_size=1, read_requirements=read_reqs)

    model = joblib.load(input_model_pckl)

    with vcf.VcfReader(truth_variants) as vcf_reader:
        region = ranges.parse_literal(eval_region,
                                      contig_map=ranges.contigs_dict(
                                          ref_reader.header.contigs))
        true_indels = [
            var for var in vcf_reader.query(region)
            if (variant_utils.is_indel(var))
        ]

    precisions = compute_precision(model, true_indels, sam_reader, ref_reader,
                                   allele_counter_options, _THRESHOLDS, region)
    recalls = compute_effective_recall(model, true_indels, sam_reader,
                                       ref_reader, allele_counter_options,
                                       _THRESHOLDS)

    with tf.gfile.GFile(output_report_csv, 'w') as csvfile:
        fieldnames = ['threshold', 'precision', 'recall']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for threshold in _THRESHOLDS:
            writer.writerow({
                'threshold': threshold,
                'precision': precisions[threshold],
                'recall': recalls[threshold]
            })
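
Example usage, offered as a hedged sketch rather than code from the original source: the call below shows how model_evaluation_runner might be invoked. Every path is a hypothetical placeholder, and eval_region may be any literal that ranges.parse_literal accepts, per the docstring above.

# Hypothetical invocation of the function defined above; replace the
# placeholder paths with real files before running.
model_evaluation_runner(
    truth_variants='/path/to/truth_variants.vcf.gz',
    reads='/path/to/reads.bam',
    ref='/path/to/reference.fasta',
    input_model_pckl='/path/to/logistic_regression.pckl',
    eval_region='chr20:10,000,000-10,010,000',
    output_report_csv='/path/to/precision_recall_report.csv')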
Example #18
 def test_context_manager(self):
   """Test that we can use context manager to do two queries in sequence."""
   reader = sam_reader.SamReader.from_file(
       reads_path=self.bam, ref_path='', options=self.options)
   region = ranges.parse_literal('chr20:10,000,000-10,000,100')
   with reader:
     with reader.query(region) as query_iterable1:
       self.assertIsNotNone(query_iterable1)
       self.assertIsInstance(query_iterable1, clif_postproc.WrappedCppIterable)
     with reader.query(region) as query_iterable2:
       self.assertIsNotNone(query_iterable2)
       self.assertIsInstance(query_iterable2, clif_postproc.WrappedCppIterable)
Example #19
 def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
   reader = sam.SamReader(
       test_utils.genomics_core_testdata('test.bam'),
       downsample_fraction=fraction,
       random_seed=12345)
   with reader:
     if method == 'iterate':
       reads_iter = reader.iterate()
     elif method == 'query':
       reads_iter = reader.query(ranges.parse_literal(maybe_range))
     else:
        self.fail('Unexpected method ' + str(method))
     self.assertEqual(test_utils.iterable_len(reads_iter), expected_n_reads)
 def test_complex_region(self):
   # There is a heterozygous 9 bp deletion of tandem TGA repeat.
   # "chr20:10,095,379-10,095,500"
   ref_reader = fasta.RefFastaReader(testdata.CHR20_FASTA)
   bam_reader = sam.SamReader(testdata.CHR20_BAM)
   region = ranges.parse_literal('chr20:10,095,379-10,095,500')
   ref_seq = ref_reader.query(region)
   reads = list(bam_reader.query(region))
   dbg = debruijn_graph.build(ref_seq, reads, self.dbg_options())
   self.assertIsNotNone(dbg)
   self.assertEqual(44, dbg.kmer_size)
   self.assertEqual(2, len(dbg.candidate_haplotypes()))
   self.assertIn(ref_seq, dbg.candidate_haplotypes())
 def test_complex_region(self):
     # There is a heterozygous 9 bp deletion of tandem TGA repeat.
     # "chr20:10,095,379-10,095,500"
     ref_reader = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
     bam_reader = sam.SamReader(testdata.CHR20_BAM)
     region = ranges.parse_literal('chr20:10,095,379-10,095,500')
     ref_seq = ref_reader.query(region)
     reads = list(bam_reader.query(region))
     dbg = debruijn_graph.build(ref_seq, reads, self.dbg_options())
     self.assertIsNotNone(dbg)
     self.assertEqual(44, dbg.kmer_size)
     self.assertEqual(2, len(dbg.candidate_haplotypes()))
     self.assertIn(ref_seq, dbg.candidate_haplotypes())
Example #22
 def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
   reader = sam.SamReader(
       test_utils.genomics_core_testdata('test.bam'),
       downsample_fraction=fraction,
       random_seed=12345)
   with reader:
     if method == 'iterate':
       reads_iter = reader.iterate()
     elif method == 'query':
       reads_iter = reader.query(ranges.parse_literal(maybe_range))
     else:
       self.fail('Unexpected method ' + str(method))
     self.assertEqual(test_utils.iterable_len(reads_iter), expected_n_reads)
  def setUp(self):
    self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')

    FLAGS.reads = ''
    self.options = make_examples.default_options(add_flags=False)
    self.options.reference_filename = testdata.CHR20_FASTA
    self.options.reads_filename = testdata.CHR20_BAM
    self.options.truth_variants_filename = testdata.TRUTH_VARIANTS_VCF
    self.options.mode = deepvariant_pb2.DeepVariantOptions.TRAINING

    self.processor = make_examples.RegionProcessor(self.options)
    self.mock_init = self.add_mock('_initialize')
    self.default_shape = [5, 5, 7]
    self.default_format = 'raw'
Example #24
    def test_align_to_all_haplotypes(self, window_width):
        # align_to_all_haplotypes() will pull from the reference, so choose a
        # real variant.
        region = ranges.parse_literal('chr20:10,046,000-10,046,400')
        nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
        nist_variants = list(nist_reader.query(region))
        # We picked this region to have exactly one known variant:
        # reference_bases: "AAGAAAGAAAG"
        # alternate_bases: "A", a deletion of 10 bp
        # start: 10046177
        # end: 10046188
        # reference_name: "chr20"

        variant = nist_variants[0]

        self.processor.pic = mock.Mock()
        self.processor.pic.width = window_width
        self.processor.pic.half_width = int((self.processor.pic.width - 1) / 2)

        self.processor.realigner = mock.Mock()
        # Using a real ref_reader to test that the reference allele matches
        # between the variant and the reference at the variant's coordinates.
        self.processor.realigner.ref_reader = self.ref_reader

        read = test_utils.make_read('A' * 101,
                                    start=10046100,
                                    cigar='101M',
                                    quals=[30] * 101)

        self.processor.realigner.align_to_haplotype = mock.Mock()
        alt_info = self.processor.align_to_all_haplotypes(variant, [read])
        hap_alignments = alt_info['alt_alignments']
        hap_sequences = alt_info['alt_sequences']
        # Both outputs are keyed by alt allele.
        self.assertCountEqual(hap_alignments.keys(), ['A'])
        self.assertCountEqual(hap_sequences.keys(), ['A'])

        # Sequence must be the length of the window.
        self.assertLen(hap_sequences['A'], self.processor.pic.width)

        # align_to_haplotype should be called once for each alt (1 alt here).
        self.processor.realigner.align_to_haplotype.assert_called_once()

        # If variant reference_bases are wrong, it should raise a ValueError.
        variant.reference_bases = 'G'
        with six.assertRaisesRegex(
                self, ValueError, 'does not match the bases in the reference'):
            self.processor.align_to_all_haplotypes(variant, [read])
Example #25
  def test_make_examples_end2end_failed_on_cram(self):
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')

    FLAGS.use_ref_for_cram = False
    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_CRAM
    FLAGS.candidates = test_utils.test_tmpfile(_sharded('failed.vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('failed.examples.tfrecord'))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.gvcf_gq_binsize = 5
    options = make_examples.default_options(add_flags=True)
    with six.assertRaisesRegex(self, ValueError,
                               'Failed to parse BAM/CRAM file.'):
      make_examples_core.make_examples_runner(options)
    def test_realigner_doesnt_create_invalid_intervals(self):
        """Tests that read sets don't result in a crash in reference_fai.cc."""
        read = test_utils.make_read('ACCGT' * 50,
                                    start=63025520 - 250,
                                    cigar='250M',
                                    quals=list(range(30, 35)) * 50,
                                    name='read1')
        reads = [read] * 20
        region = ranges.parse_literal('chr20:63,025,320-63,025,520')
        self.reads_realigner.realign_reads(reads, region)

        # These reads are aligned off the edge of the contig.
        read = test_utils.make_read('TTATA' * 50,
                                    start=63025520 - 200,
                                    cigar='200M50S',
                                    quals=list(range(30, 35)) * 50,
                                    name='read1')
        reads = [read] * 20
        self.reads_realigner.realign_reads(reads, region)
Example #27
  def test_make_examples_end2end_confirm_downsample_fraction_used(self):

    def _get_examples(downsample_fraction=None):
      if downsample_fraction is not None:
        FLAGS.downsample_fraction = downsample_fraction
      options = make_examples.default_options(add_flags=True)
      make_examples_core.make_examples_runner(options)
      examples = self.verify_examples(
          FLAGS.examples, region, options, verify_labels=False)
      return examples

    region = ranges.parse_literal('chr20:10,000,000-10,004,000')
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
    FLAGS.mode = 'calling'
    examples1 = _get_examples()
    examples2 = _get_examples(0.01)
    self.assertLess(len(examples2), len(examples1))
Example #28
  def test_catches_bad_flags(self):
    # Set all of the requested flag values.
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile('vsc.tfrecord')
    FLAGS.examples = test_utils.test_tmpfile('examples.tfrecord')
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'training'
    FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
    # This is the bad flag.
    FLAGS.confident_regions = ''

    with mock.patch.object(logging, 'error') as mock_logging,\
        mock.patch.object(sys, 'exit') as mock_exit:
      make_examples.main(['make_examples.py'])
    mock_logging.assert_called_once_with(
        'confident_regions is required when in training mode.')
    mock_exit.assert_called_once_with(errno.ENOENT)
Example #29
  def test_make_examples_with_variant_selection(self,
                                                select_types,
                                                expected_count,
                                                keep_legacy_behavior=False):
    if select_types is not None:
      FLAGS.select_variant_types = select_types
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.keep_legacy_allele_counter_behavior = keep_legacy_behavior
    options = make_examples.default_options(add_flags=True)
    make_examples_core.make_examples_runner(options)

    candidates = list(tfrecord.read_tfrecords(FLAGS.candidates))
    self.assertLen(candidates, expected_count)
Example #31
  def test_make_examples_end2end_failed_on_mismatched_multi_bam(self):
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')

    FLAGS.write_run_info = True
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = ','.join([testdata.CHR20_BAM, testdata.NOCHR20_BAM])
    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('mismatched_multi_bam.vsc.tfrecord'))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('mismatched_multi_bam.examples.tfrecord'))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = 'calling'
    FLAGS.gvcf_gq_binsize = 5
    options = make_examples.default_options(add_flags=True)
    # This shows an example of what the error message looks like:
    # redacted
    with six.assertRaisesRegex(
        self, ValueError, 'NOT_FOUND: Unknown reference_name '
        'reference_name: "chr20" start: 9999999 end: 10000999'):
      make_examples_core.make_examples_runner(options)
Example #32
    def test_realigner_doesnt_create_invalid_intervals(self):
        """Tests that read sets don't result in a crash in reference_fai.cc."""
        region = ranges.parse_literal('chr20:63,025,320-63,025,520')

        reads = [
            test_utils.make_read('ACCGT' * 50,
                                 start=63025520 - 250,
                                 cigar='250M',
                                 quals=list(range(30, 35)) * 50) for _ in range(20)
        ]
        self.reads_realigner.realign_reads(reads, region)

        # These reads are aligned off the edge of the contig. Note that the
        # reference bases in this interval are all Ns as well.
        reads = [
            test_utils.make_read('TTATA' * 50,
                                 start=63025520 - 200,
                                 cigar='200M50S',
                                 quals=list(range(30, 35)) * 50) for _ in range(20)
        ]
        self.reads_realigner.realign_reads(reads, region)
    def test_get_truth_variants(self):
        v1 = test_utils.make_variant(chrom='1', start=10)
        v2 = test_utils.make_variant(chrom='1', start=20)
        v3_filtered = test_utils.make_variant(chrom='1',
                                              start=30,
                                              filters=['FAIL'])
        v4_del = test_utils.make_variant(chrom='1',
                                         start=40,
                                         alleles=['AAAA', 'A'])
        v5_non_confident = test_utils.make_variant(chrom='1', start=150)

        variants = [v1, v2, v3_filtered, v4_del, v5_non_confident]
        reader = vcf.InMemoryVcfReader(variants=variants)
        confident_regions = ranges.RangeSet([ranges.make_range('1', 1, 100)])
        labeler = PlaceholderVariantLabeler(
            truth_vcf_reader=reader, confident_regions=confident_regions)

        # Check that we get v1 and v2 specifically when only they are covered by the
        # query.
        self.assertEqual(
            list(labeler._get_truth_variants(ranges.parse_literal('1:1-15'))),
            [v1])
        self.assertEqual(
            list(labeler._get_truth_variants(ranges.parse_literal('1:15-25'))),
            [v2])

        # We don't include filtered variants.
        self.assertEqual(
            list(labeler._get_truth_variants(ranges.parse_literal('1:25-35'))),
            [])

        # Check that we get all overlapping variants of our query.
        for del_query in ['1:35-45', '1:42-43', '1:38-42', '1:42-50']:
            self.assertEqual(
                list(
                    labeler._get_truth_variants(
                        ranges.parse_literal(del_query))), [v4_del])

        # Checks that a simple query gets all our non-filtered variants.
        self.assertEqual(
            list(labeler._get_truth_variants(ranges.parse_literal('1:1-100'))),
            [v1, v2, v4_del])
        # Even though our query covers v5, it's not confident, so we don't get it.
        self.assertEqual(
            list(labeler._get_truth_variants(
                ranges.parse_literal('1:1-1000'))), [v1, v2, v4_del])
Example #34
    def setUp(self):
        super(RegionProcessorTest, self).setUp()
        self._saved_flags = flagsaver.save_flag_values()
        self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')

        FLAGS.reads = ''
        self.options = make_examples.default_options(add_flags=False)
        self.options.reference_filename = testdata.CHR20_FASTA
        main_sample = self.options.sample_options[0]
        if not main_sample.reads_filenames:
            main_sample.reads_filenames.append(testdata.CHR20_BAM)
        main_sample.variant_caller_options.sample_name = 'sample_id'
        main_sample.name = 'sample_id'
        self.options.truth_variants_filename = testdata.TRUTH_VARIANTS_VCF
        self.options.mode = deepvariant_pb2.MakeExamplesOptions.TRAINING
        self.processor = make_examples_core.RegionProcessor(self.options)
        self.ref_reader = fasta.IndexedFastaReader(
            self.options.reference_filename)
        self.mock_init = self.add_mock('initialize')
        for sample in self.processor.samples:
            sample.in_memory_sam_reader = mock.Mock()
        self.default_shape = [5, 5, 7]
        self.default_format = 'raw'
Example #35
 def test_make_examples_training_end2end_with_customized_classes_labeler(self):
   FLAGS.labeler_algorithm = 'customized_classes_labeler'
   FLAGS.customized_classes_labeler_classes_list = 'ref,class1,class2'
   FLAGS.customized_classes_labeler_info_field_name = 'type'
   region = ranges.parse_literal('chr20:10,000,000-10,004,000')
   FLAGS.regions = [ranges.to_literal(region)]
   FLAGS.ref = testdata.CHR20_FASTA
   FLAGS.reads = testdata.CHR20_BAM
   FLAGS.candidates = test_utils.test_tmpfile(_sharded('vsc.tfrecord'))
   FLAGS.examples = test_utils.test_tmpfile(_sharded('examples.tfrecord'))
   FLAGS.partition_size = 1000
   FLAGS.mode = 'training'
   FLAGS.gvcf_gq_binsize = 5
   FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF_WITH_TYPES
   FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED
   options = make_examples.default_options(add_flags=True)
   make_examples_core.make_examples_runner(options)
   golden_file = _sharded(testdata.CUSTOMIZED_CLASSES_GOLDEN_TRAINING_EXAMPLES)
   # Verify that the variants in the examples are all good.
   examples = self.verify_examples(
       FLAGS.examples, region, options, verify_labels=True)
   self.assertDeepVariantExamplesEqual(
       examples, list(tfrecord.read_tfrecords(golden_file)))
Example #36
 def test_vcf_query(self):
   range1 = ranges.parse_literal('chr3:100,000-500,000')
   iterable = self.samples_reader.query(range1)
   self.assertEqual(test_utils.iterable_len(iterable), 4)
Example #37
 def test_parse_literal_bad(self, bad_literal):
   with self.assertRaises(ValueError):
     ranges.parse_literal(bad_literal)
Example #38
 def test_query(self, query, expected_variant_indices):
     range1 = ranges.parse_literal(query,
                                   ranges.contigs_dict(self.header.contigs))
     self.assertEqual(list(self.reader.query(range1)),
                      [self.variants[i] for i in expected_variant_indices])
Example #39
 def test_query(self, query, expected_variant_indices):
   range1 = ranges.parse_literal(query, ranges.contigs_dict(
       self.header.contigs))
   self.assertEqual(
       list(self.reader.query(range1)),
       [self.variants[i] for i in expected_variant_indices])
Example #40
 def test_query_on_unindexed_reader_raises(self):
   with vcf_reader.VcfReader.from_file(self.samples_vcf,
                                       self.unindexed_options) as reader:
     with self.assertRaisesRegexp(ValueError, 'Cannot query without an index'):
       reader.query(ranges.parse_literal('chr1:10,000,000-10,000,100'))
Example #41
 def test_construction(self):
   aregion = _test_assembled_region('chr1:1-5', haplotypes=['A', 'C'])
   self.assertEqual(aregion.region, ranges.parse_literal('chr1:1-5'))
   self.assertEqual(aregion.haplotypes, ['A', 'C'])
   self.assertEqual(aregion.reads, [])
Example #42
def _test_assembled_region(region_str, haplotypes=None):
  return realigner.AssemblyRegion(
      realigner_pb2.CandidateHaplotypes(
          span=ranges.parse_literal(region_str), haplotypes=haplotypes or []))
  def test_make_examples_end2end(self, mode, num_shards,
                                 labeler_algorithm=None):
    self.maxDiff = None
    self.assertIn(mode, {'calling', 'training'})
    region = ranges.parse_literal('chr20:10,000,000-10,010,000')
    FLAGS.ref = testdata.CHR20_FASTA
    FLAGS.reads = testdata.CHR20_BAM
    FLAGS.candidates = test_utils.test_tmpfile(
        _sharded('vsc.tfrecord', num_shards))
    FLAGS.examples = test_utils.test_tmpfile(
        _sharded('examples.tfrecord', num_shards))
    FLAGS.regions = [ranges.to_literal(region)]
    FLAGS.partition_size = 1000
    FLAGS.mode = mode
    FLAGS.gvcf_gq_binsize = 5
    if labeler_algorithm is not None:
      FLAGS.labeler_algorithm = labeler_algorithm

    if mode == 'calling':
      FLAGS.gvcf = test_utils.test_tmpfile(
          _sharded('gvcf.tfrecord', num_shards))
    else:
      FLAGS.truth_variants = testdata.TRUTH_VARIANTS_VCF
      FLAGS.confident_regions = testdata.CONFIDENT_REGIONS_BED

    for task_id in range(max(num_shards, 1)):
      FLAGS.task = task_id
      options = make_examples.default_options(add_flags=True)
      make_examples.make_examples_runner(options)

    # Test that our candidates are reasonable, calling specific helper functions
    # to check lots of properties of the output.
    candidates = sorted(
        io_utils.read_tfrecords(
            FLAGS.candidates, proto=deepvariant_pb2.DeepVariantCall),
        key=lambda c: variant_utils.variant_range_tuple(c.variant))
    self.verify_deepvariant_calls(candidates, options)
    self.verify_variants(
        [call.variant for call in candidates], region, options, is_gvcf=False)

    # Verify that the variants in the examples are all good.
    examples = self.verify_examples(
        FLAGS.examples, region, options, verify_labels=mode == 'training')
    example_variants = [tf_utils.example_variant(ex) for ex in examples]
    self.verify_variants(example_variants, region, options, is_gvcf=False)

    # Verify the integrity of the examples and then check that they match our
    # golden labeled examples. Note we expect the order for both training and
    # calling modes to produce deterministic order because we fix the random
    # seed.
    if mode == 'calling':
      golden_file = _sharded(testdata.GOLDEN_CALLING_EXAMPLES, num_shards)
    else:
      golden_file = _sharded(testdata.GOLDEN_TRAINING_EXAMPLES, num_shards)
    self.assertDeepVariantExamplesEqual(
        examples, list(io_utils.read_tfrecords(golden_file)))

    if mode == 'calling':
      nist_reader = vcf.VcfReader(testdata.TRUTH_VARIANTS_VCF)
      nist_variants = list(nist_reader.query(region))
      self.verify_nist_concordance(example_variants, nist_variants)

      # Check the quality of our generated gvcf file.
      gvcfs = variant_utils.sorted_variants(
          io_utils.read_tfrecords(FLAGS.gvcf, proto=variants_pb2.Variant))
      self.verify_variants(gvcfs, region, options, is_gvcf=True)
      self.verify_contiguity(gvcfs, region)
      gvcf_golden_file = _sharded(testdata.GOLDEN_POSTPROCESS_GVCF_INPUT,
                                  num_shards)
      expected_gvcfs = list(
          io_utils.read_tfrecords(gvcf_golden_file, proto=variants_pb2.Variant))
      self.assertItemsEqual(gvcfs, expected_gvcfs)
 def test_query_on_unindexed_reader_raises(self):
   with sam_reader.SamReader.from_file(self.bam, self.options) as reader:
     with self.assertRaisesRegexp(ValueError, 'Cannot query without an index'):
       reader.query(ranges.parse_literal('chr20:10,000,000-10,000,100'))