Ejemplo n.º 1
0
    def test_adding_reads(self):
        """Checks that add_read() grows both `reads` and `read_span`."""
        assembled = _test_assembled_region('chr1:3-15')

        # Nothing has been added yet: no reads and no read span.
        self.assertEqual(assembled.reads, [])
        self.assertIsNone(assembled.read_span)

        # read2 is the first read added; it defines the initial span.
        read2 = self.get_reads_by_name(['read2'])[0]
        seen_reads = [read2]
        assembled.add_read(read2)
        self.assertEqual(assembled.reads, seen_reads)
        self.assertEqual(assembled.read_span,
                         ranges.parse_literal('chr1:7-9'))

        # read1 pushes the left edge of the span outwards.
        read1 = self.get_reads_by_name(['read1'])[0]
        seen_reads.append(read1)
        assembled.add_read(read1)
        self.assertEqual(assembled.reads, seen_reads)
        self.assertEqual(assembled.read_span,
                         ranges.parse_literal('chr1:2-9'))

        # The remaining reads are added in one batch.
        remaining = self.get_reads_by_name(['read3', 'read4', 'read5'])
        seen_reads.extend(remaining)
        for extra_read in remaining:
            assembled.add_read(extra_read)
        self.assertEqual(assembled.reads, seen_reads)
        self.assertEqual(assembled.read_span,
                         ranges.parse_literal('chr1:2-31'))
Ejemplo n.º 2
0
    def test_realigner_example_variant(self, region_literal, variant_literal):
        """Realigned reads that span the variant must carry its deletion.

        Args:
          region_literal: literal of the region whose reads are realigned.
          variant_literal: literal of the deletion expected in the reads.
        """
        region = ranges.parse_literal(region_literal)
        variant = ranges.parse_literal(variant_literal)

        original_reads = _get_reads(region)
        _, realigned_reads = self.reads_realigner.realign_reads(
            original_reads, region)

        for realigned in realigned_reads:
            found_deletion = False
            self.assertTrue(realigned.HasField('alignment'))
            self.assertEqual(variant.reference_name,
                             realigned.alignment.position.reference_name)
            aln_start = realigned.alignment.position.position

            # Walk the cigar, tracking the reference position reached so far.
            cursor = aln_start
            for unit in realigned.alignment.cigar:
                op = unit.operation
                self.assertIn(op, utils.CIGAR_OPS)
                is_align = op in utils.CIGAR_ALIGN_OPS
                is_delete = not is_align and op in utils.CIGAR_DELETE_OPS
                if is_delete:
                    # The deletion must start at the variant start and cover
                    # exactly the variant's length.
                    if (cursor == variant.start
                            and unit.operation_length == variant.end - cursor):
                        found_deletion = True
                if is_align or is_delete:
                    cursor += unit.operation_length

            # Only reads covering the whole variant are required to show it.
            covers_variant = (aln_start <= variant.start
                              and cursor >= variant.end)
            if covers_variant:
                self.assertTrue(found_deletion)
Ejemplo n.º 3
0
 def test_parse_literal_one_bp(self):
   """A 'contig:pos' literal parses to the range (pos - 1, pos)."""
   cases = (
       ('1:10', 9, 10),
       ('1:100', 99, 100),
       ('1:1,000', 999, 1000),
   )
   for literal, start, end in cases:
     self.assertEqual(
         ranges.parse_literal(literal), ranges.make_range('1', start, end))
 def test_query_raises_with_bad_range(self):
     """Querying an unknown contig or an inverted interval raises ValueError."""
     bad_queries = (
         ('Unknown reference_name', 'XXX:1-10'),
         ('unknown reference interval', 'chr20:10-5'),
     )
     with sam_reader.SamReader.from_file(self.bam,
                                         self.indexed_options) as reader:
         for message_regex, literal in bad_queries:
             with self.assertRaisesRegexp(ValueError, message_regex):
                 reader.query(ranges.parse_literal(literal))
Ejemplo n.º 5
0
 def test_parse_literal_with_contig_map_and_bad_input_raises_exception(
     self, bad_literal):
   """parse_literal raises ValueError for `bad_literal` given a contig map."""
   # The contig map only knows about a 10-base chr1.
   chr1_only = {'chr1': core_pb2.ContigInfo(name='chr1', n_bases=10)}
   with self.assertRaises(ValueError):
     ranges.parse_literal(bad_literal, contig_map=chr1_only)
Ejemplo n.º 6
0
 def test_sam_query(self):
   """Queries on test.bam return the expected number of reads."""
   bam_path = test_utils.genomics_core_testdata('test.bam')
   reader = genomics_io.make_sam_reader(bam_path)
   literal_counts = (
       ('chr20:10,000,000-10,000,100', 106),
       ('chr20:10,000,000-10,000,000', 45),
   )
   queries = [(ranges.parse_literal(literal), count)
              for literal, count in literal_counts]
   with reader:
     for region, count in queries:
       with reader.query(region) as hits:
         self.assertEqual(test_utils.iterable_len(hits), count)
Ejemplo n.º 7
0
 def test_query_raises_with_bad_range(self):
   """Each bad literal raises a ValueError matching the expected message."""
   failing_queries = (
       ('XXX:1-10', 'Unknown reference_name'),
       ('chr1:0-5', 'Malformed region'),
       ('chr1:6-5', 'Malformed region'),
       ('chr1:10-5', 'Malformed region'),
   )
   for literal, message_regex in failing_queries:
     with self.assertRaisesRegexp(ValueError, message_regex):
       self.samples_reader.query(ranges.parse_literal(literal))
 def test_bam_query(self):
     """Indexed BAM queries yield wrapped iterables of the expected length."""
     reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
     literal_counts = (
         ('chr20:10,000,000-10,000,100', 106),
         ('chr20:10,000,000-10,000,000', 45),
     )
     queries = [(ranges.parse_literal(literal), count)
                for literal, count in literal_counts]
     with reader:
         for region, count in queries:
             with reader.query(region) as hits:
                 self.assertIsInstance(hits,
                                       clif_postproc.WrappedCppIterable)
                 self.assertEqual(test_utils.iterable_len(hits), count)
Ejemplo n.º 9
0
    def test_realigner_example_region(self, region_literal,
                                      expected_window_literal,
                                      expected_haplotypes, comment):
        """Realigning a region gives the expected first window and haplotypes.

        Args:
          region_literal: literal of the region to realign.
          expected_window_literal: literal of the expected first window span.
          expected_haplotypes: expected set of haplotypes in the first window.
          comment: message attached to the window/haplotype assertions.
        """
        region = ranges.parse_literal(region_literal)
        input_reads = _get_reads(region)
        realigned = self.reads_realigner.realign_reads(input_reads, region)
        windows, output_reads = realigned

        # Realignment must neither drop nor add reads.
        self.assertEqual(len(input_reads), len(output_reads))

        expected_span = ranges.parse_literal(expected_window_literal)
        self.assertEqual(expected_span, windows[0].span, comment)

        observed_haplotypes = set(windows[0].haplotypes)
        self.assertEqual(expected_haplotypes, observed_haplotypes, comment)
Ejemplo n.º 10
0
 def test_parse_literal_with_contig_map(self, contig_name, expected):
   """parse_literal with a two-contig map yields `expected`."""
   contig_sizes = (('chr1', 10), ('chr2', 5))
   contig_map = {
       name: core_pb2.ContigInfo(name=name, n_bases=n_bases)
       for name, n_bases in contig_sizes
   }
   parsed = ranges.parse_literal(contig_name, contig_map=contig_map)
   self.assertEqual(parsed, expected)
Ejemplo n.º 11
0
 def test_ops_on_closed_reader_raise(self):
   """iterate() and query() raise ValueError once the reader is closed."""
   reader = self.samples_reader
   # Entering and leaving the context manager closes the reader.
   with reader:
     pass

   closed_ops = (
       ('Cannot Iterate a closed', lambda: reader.iterate()),
       ('Cannot Query a closed', lambda: reader.query(
           ranges.parse_literal('chr1:10,000,000-10,000,100'))),
   )
   for message_regex, operation in closed_ops:
     with self.assertRaisesRegexp(ValueError, message_regex):
       operation()
 def test_ops_on_closed_reader_raise(self):
     """iterate() and query() raise ValueError once the reader is closed."""
     reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
     # Entering and leaving the context manager closes the reader.
     with reader:
         pass

     closed_ops = (
         ('Cannot Iterate a closed', lambda: reader.iterate()),
         ('Cannot Query a closed', lambda: reader.query(
             ranges.parse_literal('chr20:10,000,000-10,000,100'))),
     )
     for message_regex, operation in closed_ops:
         with self.assertRaisesRegexp(ValueError, message_regex):
             operation()
    def test_straightforward_region(self):
        """With k=30 the graph's only candidate haplotype is the reference."""
        fasta = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
        bam = genomics_io.make_sam_reader(test_utils.CHR20_BAM)
        window = ranges.parse_literal('chr20:10,000,000-10,000,100')
        reference_bases = fasta.bases(window)

        window_reads = list(bam.query(window))
        k30_options = self.single_k_dbg_options(30)
        graph = debruijn_graph.build(reference_bases, window_reads,
                                     k30_options)
        self.assertIsNotNone(graph)
        self.assertEqual([reference_bases], graph.candidate_haplotypes())
Ejemplo n.º 14
0
 def test_downsampling(self, method, maybe_range, fraction, expected_n_reads):
   """Checks the number of reads produced by a downsampling SAM reader.

   Args:
     method: 'iterate' or 'query'; selects how the reads are obtained.
     maybe_range: range literal passed to query(); only read when method is
       'query'.
     fraction: value passed as downsample_fraction to the reader.
     expected_n_reads: number of reads the chosen method should produce.
   """
   reader = genomics_io.make_sam_reader(
       test_utils.genomics_core_testdata('test.bam'),
       downsample_fraction=fraction,
       random_seed=12345)
   with reader:
     if method == 'iterate':
       reads_iter = reader.iterate()
     elif method == 'query':
       reads_iter = reader.query(ranges.parse_literal(maybe_range))
     else:
       # TestCase.fail() accepts a single message argument, so the method
       # name has to be formatted into the message rather than passed
       # separately.
       self.fail('Unexpected method %s' % (method,))
     self.assertEqual(test_utils.iterable_len(reads_iter), expected_n_reads)
 def test_context_manager(self):
     """Two back-to-back queries inside one reader context both succeed."""
     reader = sam_reader.SamReader.from_file(self.bam, self.indexed_options)
     region = ranges.parse_literal('chr20:10,000,000-10,000,100')
     with reader:
         # Issue the same query twice, one after the other.
         for _ in range(2):
             with reader.query(region) as query_iterable:
                 self.assertIsNotNone(query_iterable)
                 self.assertIsInstance(query_iterable,
                                       clif_postproc.WrappedCppIterable)
 def test_complex_region(self):
     """The graph for a region with a known deletion has two haplotypes."""
     # The region "chr20:10,095,379-10,095,500" contains a heterozygous
     # 9 bp deletion of a tandem TGA repeat.
     fasta = genomics_io.make_ref_reader(test_utils.CHR20_FASTA)
     bam = genomics_io.make_sam_reader(test_utils.CHR20_BAM)
     window = ranges.parse_literal('chr20:10,095,379-10,095,500')
     reference_bases = fasta.bases(window)
     window_reads = list(bam.query(window))
     graph_options = self.dbg_options()
     graph = debruijn_graph.build(reference_bases, window_reads,
                                  graph_options)
     self.assertIsNotNone(graph)
     self.assertEqual(44, graph.kmer_size)
     self.assertEqual(2, len(graph.candidate_haplotypes()))
     self.assertIn(reference_bases, graph.candidate_haplotypes())
    def setUp(self):
        """Builds the region, options and RegionProcessor used by the tests."""
        self.region = ranges.parse_literal('chr20:10,000,000-10,000,100')

        FLAGS.reads = ''
        # Start from the defaults (flags not applied) and point them at the
        # chr20 test data in training mode.
        opts = make_examples.default_options(add_flags=False)
        self.options = opts
        opts.reference_filename = test_utils.CHR20_FASTA
        opts.reads_filename = test_utils.CHR20_BAM
        opts.truth_variants_filename = test_utils.TRUTH_VARIANTS_VCF
        opts.mode = deepvariant_pb2.DeepVariantOptions.TRAINING

        self.processor = make_examples.RegionProcessor(opts)
        self.mock_init = self.add_mock('_initialize')
        self.default_shape = [5, 5, 7]
        self.default_format = 'raw'
Ejemplo n.º 18
0
    def test_realigner_doesnt_create_invalid_intervals(self):
        """Tests that read sets don't result in a crash in reference_fai.cc.

        The test passes as long as both realign_reads() calls return without
        raising; it makes no assertions about their results.
        """
        # range() is only a list on Python 2; on Python 3 `range(...) * 50`
        # raises TypeError. Converting to a list first produces the same
        # 250-element quality list on both versions.
        quals = list(range(30, 35)) * 50

        read = test_utils.make_read('ACCGT' * 50,
                                    start=63025520 - 250,
                                    cigar='250M',
                                    quals=quals,
                                    name='read1')
        reads = [read] * 20
        region = ranges.parse_literal('chr20:63,025,320-63,025,520')
        self.reads_realigner.realign_reads(reads, region)

        # These reads are aligned off the edge of the contig.
        read = test_utils.make_read('TTATA' * 50,
                                    start=63025520 - 200,
                                    cigar='200M50S',
                                    quals=quals,
                                    name='read1')
        reads = [read] * 20
        self.reads_realigner.realign_reads(reads, region)
    def test_catches_bad_flags(self):
        """Training mode with empty confident_regions logs an error and exits."""
        # Populate every flag with a usable value first.
        region = ranges.parse_literal('chr20:10,000,000-10,010,000')
        FLAGS.ref = test_utils.CHR20_FASTA
        FLAGS.reads = test_utils.CHR20_BAM
        FLAGS.candidates = test_utils.test_tmpfile('vsc.tfrecord')
        FLAGS.examples = test_utils.test_tmpfile('examples.tfrecord')
        FLAGS.regions = [ranges.to_literal(region)]
        FLAGS.partition_size = 1000
        FLAGS.mode = 'training'
        FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
        # The one deliberately invalid flag.
        FLAGS.confident_regions = ''

        # Intercept logging.error and sys.exit so main() reports the problem
        # to the mocks instead of terminating the test process.
        with mock.patch.object(logging, 'error') as mock_logging:
            with mock.patch.object(sys, 'exit') as mock_exit:
                make_examples.main(['make_examples.py'])

        expected_message = (
            'confident_regions is required when in training mode.')
        mock_logging.assert_called_once_with(expected_message)
        mock_exit.assert_called_once_with(errno.ENOENT)
    def test_make_examples_end2end(self, mode, num_shards):
        """Runs make_examples end to end and verifies everything it writes.

        Sets the global FLAGS, runs make_examples_runner once per task, then
        checks the candidates, the examples (against golden files) and, in
        calling mode, NIST concordance and the gvcf output.

        Args:
          mode: 'calling' or 'training'; any other value fails the test.
          num_shards: number of output shards. One task is run per shard, and
            at least one task is always run (max(num_shards, 1)).
        """
        self.assertIn(mode, {'calling', 'training'})
        region = ranges.parse_literal('chr20:10,000,000-10,010,000')
        FLAGS.ref = test_utils.CHR20_FASTA
        FLAGS.reads = test_utils.CHR20_BAM
        FLAGS.candidates = test_utils.test_tmpfile(
            _sharded('vsc.tfrecord', num_shards))
        FLAGS.examples = test_utils.test_tmpfile(
            _sharded('examples.tfrecord', num_shards))
        FLAGS.regions = [ranges.to_literal(region)]
        FLAGS.partition_size = 1000
        FLAGS.mode = mode

        # Calling mode additionally writes a gvcf; training mode instead
        # needs the truth variants and confident regions.
        if mode == 'calling':
            FLAGS.gvcf = test_utils.test_tmpfile(
                _sharded('gvcf.tfrecord', num_shards))
        else:
            FLAGS.truth_variants = test_utils.TRUTH_VARIANTS_VCF
            FLAGS.confident_regions = test_utils.CONFIDENT_REGIONS_BED

        # Run one task per shard. `options` from the last iteration is reused
        # by the verification steps below.
        for task_id in range(max(num_shards, 1)):
            FLAGS.task = task_id
            options = make_examples.default_options(add_flags=True)
            make_examples.make_examples_runner(options)

        # Test that our candidates are reasonable, calling specific helper functions
        # to check lots of properties of the output.
        candidates = _sort_candidates(
            io_utils.read_tfrecords(FLAGS.candidates,
                                    proto=deepvariant_pb2.DeepVariantCall))
        self.verify_deepvariant_calls(candidates, options)
        self.verify_variants([call.variant for call in candidates],
                             region,
                             options,
                             is_gvcf=False)

        # Verify that the variants in the examples are all good.
        examples = self.verify_examples(FLAGS.examples,
                                        region,
                                        options,
                                        verify_labels=mode == 'training')
        example_variants = [tf_utils.example_variant(ex) for ex in examples]
        self.verify_variants(example_variants, region, options, is_gvcf=False)

        # Verify the integrity of the examples and then check that they match our
        # golden labeled examples. Note we expect the order for both training and
        # calling modes to produce deterministic order because we fix the random
        # seed.
        if mode == 'calling':
            golden_file = _sharded(test_utils.GOLDEN_CALLING_EXAMPLES,
                                   num_shards)
        else:
            golden_file = _sharded(test_utils.GOLDEN_TRAINING_EXAMPLES,
                                   num_shards)
        self.assertDeepVariantExamplesEqual(
            examples, list(io_utils.read_tfrecords(golden_file)))

        if mode == 'calling':
            # Compare the called example variants against the truth VCF
            # records that fall in the same region.
            nist_reader = genomics_io.make_vcf_reader(
                test_utils.TRUTH_VARIANTS_VCF)
            nist_variants = list(nist_reader.query(region))
            self.verify_nist_concordance(example_variants, nist_variants)

            # Check the quality of our generated gvcf file.
            gvcfs = _sort_variants(
                io_utils.read_tfrecords(FLAGS.gvcf,
                                        proto=variants_pb2.Variant))
            self.verify_variants(gvcfs, region, options, is_gvcf=True)
            self.verify_contiguity(gvcfs, region)
 def test_query_on_unindexed_reader_raises(self):
     """query() on a reader opened with self.options raises ValueError."""
     unindexed = sam_reader.SamReader.from_file(self.bam, self.options)
     expected_error = self.assertRaisesRegexp(
         ValueError, 'Cannot query without an index')
     with unindexed as reader, expected_error:
         reader.query(ranges.parse_literal('chr20:10,000,000-10,000,100'))
Ejemplo n.º 22
0
 def test_parse_literal_bad(self, bad_literal):
   """parse_literal rejects `bad_literal` with a ValueError."""
   self.assertRaises(ValueError, ranges.parse_literal, bad_literal)
Ejemplo n.º 23
0
 def test_parse_literal_numerics(self, literal, start_val, end_val):
   """`literal` parses to the chr1 range [start_val, end_val)."""
   parsed = ranges.parse_literal(literal)
   wanted = ranges.make_range('chr1', start_val, end_val)
   self.assertEqual(parsed, wanted)
Ejemplo n.º 24
0
 def test_parse_literal_chromosomes(self, chrom):
   """'<chrom>:1-20' parses to make_range(chrom, 0, 20)."""
   literal = chrom + ':1-20'
   parsed = ranges.parse_literal(literal)
   wanted = ranges.make_range(chrom, 0, 20)
   self.assertEqual(parsed, wanted)
 def setUp(self):
   """Parses QUERY_WINDOW once and stores it as self.query_window."""
   parsed_window = ranges.parse_literal(QUERY_WINDOW)
   self.query_window = parsed_window
Ejemplo n.º 26
0
 def test_construction(self):
     """A new assembled region exposes its span and haplotypes, and no reads."""
     literal = 'chr1:1-5'
     alleles = ['A', 'C']
     assembled = _test_assembled_region(literal, haplotypes=['A', 'C'])
     self.assertEqual(assembled.region, ranges.parse_literal(literal))
     self.assertEqual(assembled.haplotypes, alleles)
     self.assertEqual(assembled.reads, [])
Ejemplo n.º 27
0
def _test_assembled_region(region_str, haplotypes=None):
    """Builds an AssemblyRegion for tests.

    Args:
      region_str: range literal used as the span of the candidate haplotypes.
      haplotypes: haplotypes to attach; any falsy value becomes an empty list.

    Returns:
      A realigner.AssemblyRegion wrapping the CandidateHaplotypes proto.
    """
    if not haplotypes:
        haplotypes = []
    candidates = realigner_pb2.CandidateHaplotypes(
        span=ranges.parse_literal(region_str), haplotypes=haplotypes)
    return realigner.AssemblyRegion(candidates)
Ejemplo n.º 28
0
    def test_realigner_diagnostics(self, enabled, emit_reads):
        """Checks which diagnostic files the realigner writes for a region.

        Args:
          enabled: value assigned to config.diagnostics.enabled. When falsy,
            the diagnostics directory must not exist after realignment.
          emit_reads: value assigned to
            config.diagnostics.emit_realigned_reads. Controls whether the
            realigned reads file is expected to exist.
        """
        # Build a realigner whose diagnostics are written under dx_dir.
        dx_dir = test_utils.test_tmpfile('dx')
        region_str = 'chr20:10046179-10046188'
        region = ranges.parse_literal(region_str)
        assembled_region_str = 'chr20:10046109-10046257'
        reads = _get_reads(region)
        self.config = realigner.realigner_config(FLAGS)
        self.config.diagnostics.enabled = enabled
        self.config.diagnostics.output_root = dx_dir
        self.config.diagnostics.emit_realigned_reads = emit_reads
        self.reads_realigner = realigner.Realigner(self.config,
                                                   self.ref_reader)
        _, realigned_reads = self.reads_realigner.realign_reads(reads, region)
        dx_logger = self.reads_realigner.diagnostic_logger
        dx_logger.close()  # Force close all resources.

        if not enabled:
            # Make sure our diagnostic output isn't emitted.
            self.assertFalse(tf.gfile.Exists(dx_dir))
        else:
            # Our root directory exists.
            self.assertTrue(tf.gfile.IsDirectory(dx_dir))

            # We expect a realigner_metrics.csv in our rootdir with 1 entry in it.
            metrics_file = os.path.join(dx_dir, dx_logger.metrics_filename)
            self.assertTrue(tf.gfile.Exists(metrics_file))
            with tf.gfile.FastGFile(metrics_file) as fin:
                rows = list(csv.DictReader(fin))
                self.assertEqual(len(rows), 1)
                self.assertEqual(set(rows[0].keys()),
                                 {'window', 'k', 'n_haplotypes', 'time'})
                self.assertEqual(rows[0]['window'], assembled_region_str)
                self.assertEqual(int(rows[0]['k']), 25)
                # assertTrue(value, 2) would treat 2 as the failure message
                # and only check truthiness; compare the count explicitly.
                self.assertEqual(int(rows[0]['n_haplotypes']), 2)
                # Check that our runtime is reasonable (greater than 0, less than 10 s).
                self.assertTrue(0.0 < float(rows[0]['time']) < 10.0)

            # As does the subdirectory for this region.
            region_subdir = os.path.join(dx_dir, assembled_region_str)
            self.assertTrue(tf.gfile.IsDirectory(region_subdir))

            # We always have a graph.dot
            self.assertTrue(
                tf.gfile.Exists(
                    os.path.join(region_subdir, dx_logger.graph_filename)))

            # The realigned reads file is looked up under the queried region,
            # not under the assembled window.
            reads_file = os.path.join(dx_dir, region_str,
                                      dx_logger.realigned_reads_filename)
            if emit_reads:
                self.assertTrue(tf.gfile.Exists(reads_file))
                reads_from_dx = io_utils.read_tfrecords(
                    reads_file, reads_pb2.Read)
                self.assertCountEqual(reads_from_dx, realigned_reads)
            else:
                self.assertFalse(tf.gfile.Exists(reads_file))
Ejemplo n.º 29
0
 def test_vcf_query(self):
   """Querying chr3:100,000-500,000 yields four records."""
   region = ranges.parse_literal('chr3:100,000-500,000')
   n_records = test_utils.iterable_len(self.samples_reader.query(region))
   self.assertEqual(n_records, 4)
Ejemplo n.º 30
0
 def test_query_on_unindexed_reader_raises(self):
   """query() on a reader opened with unindexed_options raises ValueError."""
   unindexed = vcf_reader.VcfReader.from_file(self.samples_vcf,
                                              self.unindexed_options)
   expected_error = self.assertRaisesRegexp(
       ValueError, 'Cannot query without an index')
   with unindexed as reader, expected_error:
     reader.query(ranges.parse_literal('chr1:10,000,000-10,000,100'))