コード例 #1
0
 def test_read_range(self, update_cached_read_end_first):
     """Tests reads have their ranges calculated correctly.

     Args:
       update_cached_read_end_first: if truthy, read.cached_end is set from
         utils.read_end() before utils.read_range() is called.
     """
     start = 10000001
     # (cigar, expected reference span). Per the expectations, the 1bp
     # insertion adds nothing to the span while the 16bp deletion adds 16.
     cases = (
         ('2M1I3M', 5),
         ('2M16D3M', 5 + 16),
     )
     for cigar_str, span in cases:
         read = test_utils.make_read('AAACAG',
                                     chrom='chrX',
                                     start=start,
                                     cigar=cigar_str,
                                     quals=range(10, 16),
                                     name='read1')
         if update_cached_read_end_first:
             # Explicitly update cached_end.
             read.cached_end = utils.read_end(read,
                                              use_cached_read_end=False)
         expected = ranges.make_range('chrX', start, start + span)
         self.assertEqual(expected, utils.read_range(read))
コード例 #2
0
 def test_candidates_to_windows_min_window_distance(self, distance):
     """Checks which candidates are merged into a shared window.

     Args:
       distance: value assigned to config.min_windows_distance; the expected
         windows below extend this far on each side of a candidate.
     """
     anchor = 100
     candidates = [
         # The anchor candidate at position 100.
         anchor,
         # A candidate beyond the joining distance from the anchor: it should
         # get a window of its own rather than being merged with the anchor.
         anchor - 2 * distance - 1,
         # A candidate exactly `distance` away from the anchor: it should be
         # merged with the anchor into a single larger window.
         anchor + distance,
     ]
     # Window around the isolated candidate at anchor - 2 * distance - 1.
     isolated_window = ranges.make_range('ref', anchor - 3 * distance - 1,
                                         anchor - distance - 1)
     # Merged window: starts at anchor - distance and ends at
     # (anchor + distance) + distance.
     merged_window = ranges.make_range('ref', anchor - distance,
                                       anchor + 2 * distance)
     self.config.min_windows_distance = distance
     windows = window_selector._candidates_to_windows(self.config, candidates,
                                                      'ref')
     self.assertEqual(windows, [isolated_window, merged_window])
コード例 #3
0
ファイル: realigner.py プロジェクト: kong75/deepvariant
  def call_aligner(self, assembled_region):
    """Helper function to call aligner module.

    Builds a reference window covering both the region and the span of its
    reads, widened by _REF_ALIGN_MARGIN on each side and clamped to
    [0, contig length], then aligns the reads against each haplotype flanked
    by the reference prefix and suffix of that window.

    Args:
      assembled_region: object exposing `reads`, `region`, `read_span` and
        `haplotypes`.

    Returns:
      An empty list when the region has no reads; the region's original reads
      when no reference suffix exists past the end of the region; otherwise
      the result of Aligner.align_reads().
    """
    if not assembled_region.reads:
      return []

    region = assembled_region.region
    contig = region.reference_name

    leftmost = min(assembled_region.read_span.start, region.start)
    ref_start = max(0, leftmost - _REF_ALIGN_MARGIN)
    contig_length = self.ref_reader.contig(contig).n_bases
    rightmost = max(assembled_region.read_span.end, region.end)
    ref_end = min(contig_length, rightmost + _REF_ALIGN_MARGIN)

    prefix_range = ranges.make_range(contig, ref_start, region.start)
    ref_prefix = self.ref_reader.query(prefix_range)
    ref = self.ref_reader.query(region)

    # If we can't create the ref suffix then return the original alignments.
    if ref_end <= region.end:
      return assembled_region.reads

    suffix_range = ranges.make_range(contig, region.end, ref_end)
    ref_suffix = self.ref_reader.query(suffix_range)

    ref_region = ranges.make_range(contig, ref_start, ref_end)
    reads_aligner = aligner.Aligner(self.config.aln_config, ref_region,
                                    ref_prefix + ref + ref_suffix)
    targets = [
        ref_prefix + haplotype + ref_suffix
        for haplotype in assembled_region.haplotypes
    ]
    return reads_aligner.align_reads(targets, assembled_region.reads)
コード例 #4
0
ファイル: ranges_test.py プロジェクト: palc/deepvariant
 def test_parse_literal_one_bp(self):
     """Checks 'contig:position' literals parse to single-base ranges."""
     # Each literal 'contig:N' is expected to become the range [N - 1, N);
     # the last case has a thousands separator in the position.
     cases = [
         ('1:10', ('1', 9, 10)),
         ('1:100', ('1', 99, 100)),
         ('1:1,000', ('1', 999, 1000)),
     ]
     for literal, range_args in cases:
         self.assertEqual(ranges.parse_literal(literal),
                          ranges.make_range(*range_args))
コード例 #5
0
ファイル: ranges_test.py プロジェクト: palc/deepvariant
    def test_envelops(self):
        """Checks RangeSet.envelops() with two nearby ranges on one contig."""
        first_start, first_end = 5, 10
        second_start = first_end + 1
        second_end = first_end + 5
        range_set = ranges.RangeSet([
            ranges.make_range('chr1', first_start, first_end),
            ranges.make_range('chr1', second_start, second_end),
        ])

        # An interval starting before the first range is never enveloped.
        for pos in range(first_start):
            self.assertFalse(range_set.envelops('chr1', pos, first_start + 1))

        # Every sub-interval of the first range is enveloped.
        for lo in range(first_start, first_end):
            for hi in range(lo, first_end + 1):
                message = 'chr1 {} {} not enveloped'.format(lo, hi)
                self.assertTrue(range_set.envelops('chr1', lo, hi), message)

        # An interval bridging across the two ranges is not enveloped.
        for lo in range(first_start, first_end):
            for hi in range(second_start, second_end + 1):
                self.assertFalse(range_set.envelops('chr1', lo, hi))

        # Nothing on another contig is enveloped.
        self.assertFalse(
            range_set.envelops('chr2', first_start, first_start + 1))
コード例 #6
0
 def test_from_regions_not_empty(self):
   """Checks RangeSet.from_regions() on a bare contig and a sub-range."""
   literals = ['chr1', 'chr2:10-20']
   expected = [
       ranges.make_range('chr1', 0, 10),
       ranges.make_range('chr2', 9, 20),
   ]
   # assertCountEqual is the Python 3 name of the order-insensitive
   # comparison; assertItemsEqual only exists in Python 2's unittest.
   self.assertCountEqual(
       expected,
       ranges.RangeSet.from_regions(
           literals, ranges.contigs_dict(_TEST_CONTIGS)))
コード例 #7
0
  def call_aligner(self, assembled_region):
    """Helper function to call aligner module.

    The reference window used for alignment covers the region and the span of
    its reads, padded by _REF_ALIGN_MARGIN on both sides and clamped to
    [0, contig length].

    Args:
      assembled_region: object exposing `reads`, `region`, `read_span` and
        `haplotypes`.

    Returns:
      An empty list when there are no reads; the original reads when the
      window does not extend past the end of the region (no reference suffix
      can be made); otherwise whatever Aligner.align_reads() returns.
    """
    if not assembled_region.reads:
      return []

    target_region = assembled_region.region
    contig = target_region.reference_name

    window_start = max(
        0,
        min(assembled_region.read_span.start, target_region.start) -
        _REF_ALIGN_MARGIN)
    n_bases = self.ref_reader.contig(contig).n_bases
    window_end = min(
        n_bases,
        max(assembled_region.read_span.end, target_region.end) +
        _REF_ALIGN_MARGIN)

    ref_prefix = self.ref_reader.query(
        ranges.make_range(contig, window_start, target_region.start))
    ref = self.ref_reader.query(target_region)

    # Without bases past the region end there is no suffix to build, so the
    # original alignments are returned unchanged.
    if window_end <= target_region.end:
      return assembled_region.reads

    ref_suffix = self.ref_reader.query(
        ranges.make_range(contig, target_region.end, window_end))

    window = ranges.make_range(contig, window_start, window_end)
    window_seq = ref_prefix + ref + ref_suffix
    reads_aligner = aligner.Aligner(self.config.aln_config, window, window_seq)
    padded_haplotypes = [
        ref_prefix + haplotype + ref_suffix
        for haplotype in assembled_region.haplotypes
    ]
    return reads_aligner.align_reads(padded_haplotypes,
                                     assembled_region.reads)
コード例 #8
0
ファイル: ranges_test.py プロジェクト: palc/deepvariant
    def test_detector_ranges(self):
        """Checks truthiness, length and overlaps() of a small RangeSet."""
        range_set = ranges.RangeSet([
            ranges.make_range('chr1', 0, 5),
            ranges.make_range('chr1', 8, 10),
            ranges.make_range('chr1', 12, 13),
            ranges.make_range('chr2', 2, 5),
        ])
        self.assertEqual(bool(range_set), True)
        self.assertEqual(len(range_set), 4)

        # (contig, position, expected overlaps() result). Range ends are
        # expected to be exclusive: e.g. position 5 is outside chr1:[0, 5).
        expectations = [
            ('chr1', 0, True),
            ('chr1', 1, True),
            ('chr1', 2, True),
            ('chr1', 3, True),
            ('chr1', 4, True),
            ('chr1', 5, False),
            ('chr1', 6, False),
            ('chr1', 7, False),
            ('chr1', 8, True),
            ('chr1', 9, True),
            ('chr1', 10, False),
            ('chr1', 11, False),
            ('chr1', 12, True),
            ('chr1', 13, False),
            ('chr1', 100, False),
            ('chr1', 1000, False),
            ('chr2', 0, False),
            ('chr2', 1, False),
            ('chr2', 2, True),
            ('chr2', 3, True),
            ('chr2', 4, True),
            ('chr2', 5, False),
            ('chr2', 6, False),
            ('chr3', 3, False),
        ]
        for contig, position, expected in expectations:
            self.assertEqual(range_set.overlaps(contig, position), expected)
コード例 #9
0
 def test_partitions(self, interval_size, expected):
   """Checks RangeSet.partition() over three whole-contig ranges.

   Args:
     interval_size: argument passed to RangeSet.partition().
     expected: iterable of argument tuples for ranges.make_range(), one per
       expected partition; order is not significant.
   """
   contig_lengths = [('chrM', 100), ('chr1', 76), ('chr2', 121)]
   rangeset = ranges.RangeSet(
       [ranges.make_range(name, 0, length) for name, length in contig_lengths])
   expected_ranges = [ranges.make_range(*args) for args in expected]
   self.assertCountEqual(expected_ranges, rangeset.partition(interval_size))
コード例 #10
0
 def test_partition_of_multiple_intervals(self, interval_size, expected):
   """Checks RangeSet.partition() over several intervals on one contig.

   Args:
     interval_size: argument passed to RangeSet.partition().
     expected: iterable of argument tuples for ranges.make_range(), one per
       expected partition; order is not significant.
   """
   intervals = [(0, 10), (20, 40), (45, 50)]
   rangeset = ranges.RangeSet(
       [ranges.make_range('1', begin, end) for begin, end in intervals])
   expected_ranges = [ranges.make_range(*args) for args in expected]
   self.assertCountEqual(expected_ranges, rangeset.partition(interval_size))
コード例 #11
0
ファイル: fasta_test.py プロジェクト: zyxue/deepvariant
 def test_query_edge_cases(self):
     """Checks first-base, last-base and whole-sequence queries."""
     reader = fasta.InMemoryRefReader([('1', 0, 'ACGT')])
     cases = [
         # The first base.
         ((0, 1), 'A'),
         # The last base.
         ((3, 4), 'T'),
         # The entire sequence.
         ((0, 4), 'ACGT'),
     ]
     for (begin, end), expected_bases in cases:
         region = ranges.make_range('1', begin, end)
         self.assertEqual(reader.query(region), expected_bases)
コード例 #12
0
 def test_from_bed(self, bed_filename):
   """Checks RangeSet.from_bed() yields the expected four intervals.

   Args:
     bed_filename: file name resolved through
       test_utils.genomics_core_testdata().
   """
   source = test_utils.genomics_core_testdata(bed_filename)
   expected_intervals = [
       ('chr1', 1, 10),
       ('chr2', 20, 30),
       ('chr2', 40, 60),
       ('chr3', 80, 90),
   ]
   expected = [ranges.make_range(*args) for args in expected_intervals]
   self.assertCountEqual(expected, ranges.RangeSet.from_bed(source))
コード例 #13
0
 def test_from_contigs(self):
   """Checks RangeSet.from_contigs() yields one full range per contig."""
   contig_lengths = [('chr1', 10), ('chr2', 5)]
   contigs = [
       reference_pb2.ContigInfo(name=name, n_bases=n_bases)
       for name, n_bases in contig_lengths
   ]
   # Each contig is expected to map to the range [0, n_bases).
   expected = [
       ranges.make_range(name, 0, n_bases) for name, n_bases in contig_lengths
   ]
   self.assertCountEqual(expected, ranges.RangeSet.from_contigs(contigs))
コード例 #14
0
  def test_find_max_overlapping_returns_least_index(self):
    """Checks index 0 is returned when both search ranges tie on overlap."""
    query_range = ranges.make_range('1', 0, 10)
    # Two halves of the query: each overlaps it by the same amount.
    lower_half = ranges.make_range('1', 0, 5)
    upper_half = ranges.make_range('1', 5, 10)

    orderings = [[lower_half, upper_half], [upper_half, lower_half]]
    for to_search in orderings:
      found = ranges.find_max_overlapping(query_range, to_search)
      self.assertEqual(0, found)
コード例 #15
0
ファイル: ranges_test.py プロジェクト: palc/deepvariant
 def test_partitions_bad_interval_size_raises(self):
     """Checks partition() raises ValueError for sizes -10 and 0."""
     for bad_size in (-10, 0):
         # assertRaisesRegex replaces the deprecated assertRaisesRegexp
         # alias, which was removed from unittest in Python 3.12.
         with self.assertRaisesRegex(ValueError, 'max_size'):
             # list() is necessary to force the generator to execute.
             list(
                 ranges.RangeSet([ranges.make_range('chrM', 0, 100)
                                 ]).partition(bad_size))
コード例 #16
0
 def test_from_regions_not_empty(self):
   """Checks RangeSet.from_regions() on a bare contig and a sub-range."""
   literals = ['chr1', 'chr2:10-20']
   contig_map = {
       'chr1': reference_pb2.ContigInfo(name='chr1', n_bases=10),
       'chr2': reference_pb2.ContigInfo(name='chr2', n_bases=100),
   }
   # The bare 'chr1' literal is expected to cover the whole contig as given
   # by contig_map; 'chr2:10-20' is expected to become [9, 20).
   expected = [
       ranges.make_range('chr1', 0, 10),
       ranges.make_range('chr2', 9, 20),
   ]
   # assertCountEqual is the Python 3 name of the order-insensitive
   # comparison; assertItemsEqual only exists in Python 2's unittest.
   self.assertCountEqual(
       expected, ranges.RangeSet.from_regions(literals, contig_map))
コード例 #17
0
ファイル: fasta_test.py プロジェクト: zorrodong/deepvariant
 def test_dispatching_reader(self):
   """Checks query() support of FastaReader on indexed/unindexed files."""
   indexed_path = test_utils.genomics_core_testdata('test.fasta')
   with fasta.FastaReader(indexed_path) as reader:
     # The reader is an instance of IndexedFastaReader which supports query().
     bases = reader.query(ranges.make_range('chrM', 1, 6))
     self.assertEqual(bases, 'ATCAC')

   unindexed_path = test_utils.genomics_core_testdata('unindexed.fasta')
   with fasta.FastaReader(unindexed_path) as reader:
     # The reader is an instance of UnindexedFastaReader which doesn't support
     # query().
     region = ranges.make_range('chrM', 1, 5)
     with self.assertRaises(NotImplementedError):
       reader.query(region)
コード例 #18
0
ファイル: ranges_test.py プロジェクト: palc/deepvariant
 def test_bed_parser(self):
     """Checks bed_parser() yields one range per BED line, in file order."""
     bed_lines = [
         'chr20\t61724611\t61725646',
         'chr20\t61304163\t61305182',
         'chr20\t61286467\t61286789',
     ]
     test_bed_path = test_utils.test_tmpfile('test_bed_parser.bed',
                                             '\n'.join(bed_lines))
     parsed = list(ranges.bed_parser(test_bed_path))
     expected = [
         ranges.make_range('chr20', 61724611, 61725646),
         ranges.make_range('chr20', 61304163, 61305182),
         ranges.make_range('chr20', 61286467, 61286789),
     ]
     self.assertEqual(parsed, expected)
コード例 #19
0
 def test_bedpe_parser_skips_cross_chr_events(self):
   """Checks 'bedpe' parsing drops the record whose two ends differ in contig."""
   # pylint: disable=line-too-long
   data = [
       'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION',
       'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
       'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
   ]
   parsed = list(ranges.parse_lines(data, 'bedpe'))
   # The first record (chr20 -> chr21) is expected to be skipped. Each
   # remaining record spans from the first start to the second end.
   expected_spans = [
       ('chr20', 25972820, 26045538),
       ('chr20', 23719873, 23796523),
   ]
   self.assertEqual(parsed,
                    [ranges.make_range(*span) for span in expected_spans])
コード例 #20
0
 def setUp(self):
     """Creates the GFF writer under test plus the fixtures tests compare to.

     Sets self.writer (writing to a temp 'output.gff'),
     self.expected_gff_content (lines of the 'test_features.gff' testdata
     file), self.header and self.record.
     """
     out_fname = test_utils.test_tmpfile('output.gff')
     self.writer = gff_writer.GffWriter.to_file(out_fname,
                                                gff_pb2.GffHeader(),
                                                gff_pb2.GffWriterOptions())
     # Read the golden file inside a context manager so the handle is closed
     # deterministically instead of being left to the garbage collector.
     golden_path = test_utils.genomics_core_testdata('test_features.gff')
     with open(golden_path) as golden_file:
         self.expected_gff_content = golden_file.readlines()
     self.header = gff_pb2.GffHeader(
         sequence_regions=[ranges.make_range('ctg123', 0, 1497228)])
     self.record = gff_pb2.GffRecord(
         range=ranges.make_range('ctg123', 1000, 1100))
コード例 #21
0
 def test_bed_parser(self):
   """Checks 'bed' parsing yields one range per input line, in input order."""
   intervals = [
       ('chr20', 61724611, 61725646),
       ('chr20', 61304163, 61305182),
       ('chr20', 61286467, 61286789),
   ]
   data = [
       'chr20\t61724611\t61725646',
       'chr20\t61304163\t61305182',
       'chr20\t61286467\t61286789',
   ]
   parsed = list(ranges.parse_lines(data, 'bed'))
   self.assertEqual(parsed,
                    [ranges.make_range(*args) for args in intervals])
コード例 #22
0
 def test_variant_position_and_range(self):
   """Checks variant_position/variant_range/variant_range_tuple agree."""
   # A SNP (1bp reference allele) and a deletion (4bp reference allele),
   # both starting at position 10.
   snp = test_utils.make_variant(chrom='1', alleles=['A', 'C'], start=10)
   deletion = test_utils.make_variant(
       chrom='1', alleles=['AGCT', 'C'], start=10)
   one_base = ranges.make_range('1', 10, 11)
   four_bases = ranges.make_range('1', 10, 14)

   # The position is the single start base regardless of allele length.
   self.assertEqual(one_base, variant_utils.variant_position(snp))
   self.assertEqual(one_base, variant_utils.variant_position(deletion))
   # The range covers the whole reference allele.
   self.assertEqual(one_base, variant_utils.variant_range(snp))
   self.assertEqual(four_bases, variant_utils.variant_range(deletion))
   self.assertEqual(('1', 10, 11), variant_utils.variant_range_tuple(snp))
   self.assertEqual(('1', 10, 14),
                    variant_utils.variant_range_tuple(deletion))
コード例 #23
0
  def test_expand_raises_with_missing_contig_in_map(self):
    """Checks expand() raises KeyError when contig '1' is not in the map."""
    contig_maps = [
        # Empty contig_map should raise.
        {},
        # Missing '1' from the contig map should raise.
        {
            '2': reference_pb2.ContigInfo(name='2', n_bases=50),
        },
    ]
    for contig_map in contig_maps:
      with self.assertRaises(KeyError):
        ranges.expand(
            ranges.make_range('1', 10, 20), 1, contig_map=contig_map)
コード例 #24
0
ファイル: ranges_test.py プロジェクト: palc/deepvariant
 def test_bedpe_parser_skips_cross_chr_events(self):
     """Checks bedpe_parser() drops the record whose ends differ in contig."""
     # pylint: disable=line-too-long
     records = [
         'chr20\t25763416\t25765517\tchr21\t25825181\t25826882\tP2_PM_20_1549\t63266\t+\tTYPE:DELETION',
         'chr20\t25972820\t25972991\tchr20\t26045347\t26045538\tP2_PM_20_696\t72548\t+\tTYPE:DELETION',
         'chr20\t23719873\t23721974\tchr20\t23794822\t23796523\tP2_PM_20_1548\t76450\t+\tTYPE:DELETION',
     ]
     test_bedpe_path = test_utils.test_tmpfile('test_bedpe_parser2.bedpe',
                                               '\n'.join(records))
     parsed = list(ranges.bedpe_parser(test_bedpe_path))
     # The first record (chr20 -> chr21) is expected to be skipped. Each
     # remaining record spans from the first start to the second end.
     expected = [
         ranges.make_range('chr20', 25972820, 26045538),
         ranges.make_range('chr20', 23719873, 23796523),
     ]
     self.assertEqual(parsed, expected)
コード例 #25
0
 def test_variant_position_and_range(self):
   """Checks position and range helpers for a SNP and a 4bp-ref variant."""
   chrom, start = '1', 10
   short_ref = test_utils.make_variant(
       chrom=chrom, alleles=['A', 'C'], start=start)
   long_ref = test_utils.make_variant(
       chrom=chrom, alleles=['AGCT', 'C'], start=start)
   pos = ranges.make_range('1', 10, 11)
   range_ = ranges.make_range('1', 10, 14)

   # (variant, expected position, expected range, expected range tuple).
   # Assertions are grouped by helper, mirroring the helper-by-helper order.
   for variant in (short_ref, long_ref):
     self.assertEqual(pos, variant_utils.variant_position(variant))
   self.assertEqual(pos, variant_utils.variant_range(short_ref))
   self.assertEqual(range_, variant_utils.variant_range(long_ref))
   self.assertEqual(('1', 10, 11),
                    variant_utils.variant_range_tuple(short_ref))
   self.assertEqual(('1', 10, 14),
                    variant_utils.variant_range_tuple(long_ref))
コード例 #26
0
  def test_find_max_overlapping_allows_unordered_search_ranges(self):
    """Checks the best range is found for every ordering of search ranges."""
    query_range = ranges.make_range('1', 4, 12)
    # The range expected to be reported as the maximal overlap with the query.
    max_overlapping_range = ranges.make_range('1', 0, 10)
    search_ranges = [
        max_overlapping_range,
        ranges.make_range('1', 10, 20),
        ranges.make_range('1', 12, 20),
    ]

    for ordering in itertools.permutations(search_ranges):
      expected_index = ordering.index(max_overlapping_range)
      self.assertEqual(expected_index,
                       ranges.find_max_overlapping(query_range, ordering))
コード例 #27
0
 def read_span(self):
   """Returns the range spanning all reads, computing it on first use.

   The span runs from the smallest read start to the largest read end, on the
   reference of the first read, and is cached in self._read_span.

   Returns:
     The cached span. When it has not been computed and there are no reads,
     self._read_span is returned unchanged.
   """
   if self._read_span is not None or not self.reads:
     return self._read_span

   spans = [utils.read_range(read) for read in self.reads]
   span_start = min(span.start for span in spans)
   span_end = max(span.end for span in spans)
   self._read_span = ranges.make_range(spans[0].reference_name, span_start,
                                       span_end)
   return self._read_span
コード例 #28
0
ファイル: aligner_test.py プロジェクト: kong75/deepvariant
  def test_no_bad_soft_clipping(self):
    """Checks a read with one mismatch realigns as all-M (currently skipped)."""
    self.skipTest('Enable when b/63143285 global alignment is fixed')
    common = 'CTA'
    read_seq = common + 'GA'
    ref_seq = 'N' + common + 'CA' + 'N'
    alt_seq = 'A' + ref_seq
    read_length = len(read_seq)

    region = ranges.make_range('ref', 0, len(ref_seq))
    align_reads = self.make_test_aligner(ref_seq, region)

    read = test_utils.make_read(
        read_seq,
        chrom='ref',
        start=0,
        cigar=[(read_length, 'M')],
        quals=[35] * read_length,
        name='read')
    realigned = align_reads.align_reads([ref_seq, alt_seq], [read])[0]

    # (Part of the original comment here was redacted.)
    # We'd expect a 5M alignment for this read:
    # read_seq: -CTAGA-
    # ref_seq : NCTACAN
    # But the current algorithm produces a local alignment of the read against
    # the haplotypes, and the G <=> C mismatch causes the local aligner to
    # simply skip those bases instead of incurring the mismatch penalty for it,
    # resulting in a 3M2S read (GA clipped off) instead of the better 5M result.
    expected_cigar = [_cigar.to_cigar_unit(read_length, 'M')]
    self.assertEqual(expected_cigar, list(realigned.alignment.cigar))
コード例 #29
0
    def test_call_from_allele_counter(self):
        """Checks calls_from_allele_counter() yields DeepVariantCall protos."""
        ref = fasta.IndexedFastaReader(testdata.CHR20_FASTA)
        sam_reader = sam.SamReader(testdata.CHR20_BAM)
        size = 1000
        region_start = 10000000
        region = ranges.make_range('chr20', region_start, region_start + size)
        allele_counter = _allelecounter.AlleleCounter(
            ref.c_reader, region,
            deepvariant_pb2.AlleleCounterOptions(partition_size=size))
        caller_options = deepvariant_pb2.VariantCallerOptions(
            min_count_snps=2,
            min_count_indels=2,
            min_fraction_snps=0.12,
            min_fraction_indels=0.12,
            sample_name='sample_name',
            p_error=0.001,
            max_gq=50,
            gq_resolution=1,
            ploidy=2)
        caller = variant_calling.VariantCaller(caller_options)

        # Feed every read overlapping the region into the allele counter.
        reads = list(sam_reader.query(region))
        self.assertNotEmpty(reads)
        for read in reads:
            allele_counter.add(read)

        # Call candidates over the whole region; there should be at least
        # one.
        candidates = caller.calls_from_allele_counter(allele_counter)
        self.assertNotEmpty(candidates)

        # Each candidate should be a DeepVariantCall.
        for candidate in candidates:
            self.assertIsInstance(candidate, deepvariant_pb2.DeepVariantCall)
コード例 #30
0
 def setUp(self):
     """Loads GffRecord protos from testdata and builds a matching header.

     Sets self.records (all records read from 'test_features.gff.tfrecord')
     and self.header (a GffHeader with a single 'ctg123' sequence region).
     """
     tfrecord_file = test_utils.genomics_core_testdata(
         'test_features.gff.tfrecord')
     record_iter = io_utils.read_tfrecords(tfrecord_file,
                                           proto=gff_pb2.GffRecord)
     self.records = list(record_iter)
     sequence_region = ranges.make_range('ctg123', 0, 1497228)
     self.header = gff_pb2.GffHeader(sequence_regions=[sequence_region])
コード例 #31
0
 def test_overlaps_variant_with_ranges(self):
   """Checks variant_overlaps() calls overlaps() with contig and start."""
   variant = variants_pb2.Variant(reference_name='chr2', start=10, end=11)
   range_set = ranges.RangeSet([ranges.make_range('chr1', 0, 5)])
   # overlaps() is replaced by a mock returning True, so the result below
   # reflects the mock rather than the (non-overlapping) chr1 range.
   patcher = mock.patch.object(range_set, 'overlaps', return_value=True)
   with patcher as mock_overlaps:
     result = range_set.variant_overlaps(variant)
     self.assertEqual(result, True)
     mock_overlaps.assert_called_once_with('chr2', 10)
コード例 #32
0
ファイル: fasta_test.py プロジェクト: zyxue/deepvariant
 def test_good_query(self):
     """Checks the in-memory reader matches the FASTA reader on sub-ranges."""
     # Every [begin, stop) with begin <= stop < n_bases, on every contig.
     for contig in self.fasta_reader.header.contigs:
         n_bases = contig.n_bases
         for begin in range(n_bases):
             for stop in range(begin, n_bases):
                 region = ranges.make_range(contig.name, begin, stop)
                 from_memory = self.in_mem.query(region)
                 from_fasta = self.fasta_reader.query(region)
                 self.assertEqual(from_memory, from_fasta)
コード例 #33
0
ファイル: fasta_test.py プロジェクト: kong75/deepvariant
 def test_good_query(self):
   """Checks the in-memory reader matches the FASTA reader on sub-ranges."""
   # Every [begin, stop) with begin <= stop < n_bases, on every contig.
   for contig in self.fasta_reader.header.contigs:
     n_bases = contig.n_bases
     for begin in range(n_bases):
       for stop in range(begin, n_bases):
         region = ranges.make_range(contig.name, begin, stop)
         from_memory = self.in_mem.query(region)
         from_fasta = self.fasta_reader.query(region)
         self.assertEqual(from_memory, from_fasta)
コード例 #34
0
  def test_label_variants(self,
                          candidate,
                          expected_confident,
                          expected_truth,
                          expected_label=None,
                          variant_alt_alleles_indices=None):
    """Checks _match() and label_variants() for a single candidate.

    Args:
      candidate: the variant to label.
      expected_confident: expected confidence flag for the candidate.
      expected_truth: expected truth variant from _match(), or None.
      expected_label: expected label; when None and expected_truth is given,
        it is looked up in classes_dict from the class string stored in
        expected_truth's info field.
      variant_alt_alleles_indices: alt allele indices passed to
        label_for_alt_alleles(); defaults to [0].
    """
    if variant_alt_alleles_indices is None:
      variant_alt_alleles_indices = [0]
    confident_regions = ranges.RangeSet(
        [ranges.make_range(self.snp_class1.reference_name, 10, 100)])
    labeler = self._make_labeler(self.variants, confident_regions)

    # Call _match so we can compare our expected truth with the actual one.
    is_confident, truth_variant = labeler._match(candidate)
    self.assertEqual(expected_truth, truth_variant)
    self.assertEqual(is_confident, expected_confident)

    # Now call label_variants to exercise the higher-level API.
    label_cls = customized_classes_labeler.CustomizedClassesVariantLabel
    classes_dict = label_cls.classes_dict
    if expected_label is None and expected_truth is not None:
      class_field = expected_truth.info[label_cls.info_field_name]
      expected_label = classes_dict[class_field.values[0].string_value]

    labels = list(labeler.label_variants([candidate]))
    self.assertEqual(len(labels), 1)
    label = labels[0]
    self.assertEqual(candidate, label.variant)
    self.assertEqual(expected_confident, label.is_confident)
    self.assertEqual(
        expected_label,
        label.label_for_alt_alleles(variant_alt_alleles_indices))
コード例 #35
0
 def make_labeler_ref(self, candidate_variants, true_variants, bufsize=20):
   """Builds a ReferenceRegion covering all candidate and truth variants.

   The queried region starts one base before the smallest variant start and
   ends `bufsize` bases after the largest variant end, on the contig of the
   first variant.

   Args:
     candidate_variants: list of candidate variants.
     true_variants: list of truth variants.
     bufsize: number of bases added after the largest variant end.

   Returns:
     A ReferenceRegion holding the queried bases, starting at the region start.
   """
   variants = candidate_variants + true_variants
   region = ranges.make_range(variants[0].reference_name,
                              min(v.start for v in variants) - 1,
                              max(v.end for v in variants) + bufsize)
   return ReferenceRegion(self._ref_reader.query(region), start=region.start)
コード例 #36
0
  def test_get_reference_bases_good_region(self):
    """Checks the region passed to the ref reader and the returned bases."""
    self.dv_call.variant.start = 10
    expected_region = ranges.make_range(self.variant.reference_name, 8, 13)

    bases = self.pic.get_reference_bases(self.variant)

    self.assertEqual('ACGT', bases)
    # Both the validity check and the query must be called exactly once with
    # the expected region.
    reader = self.mock_ref_reader
    reader.is_valid.assert_called_once_with(expected_region)
    reader.query.assert_called_once_with(expected_region)
コード例 #37
0
ファイル: fasta_test.py プロジェクト: kong75/deepvariant
  def setUpClass(cls):
    """Creates a FASTA reader and an in-memory reader with the same bases.

    cls.in_mem is built from the full sequence of every contig in the header
    of cls.fasta_reader.
    """
    cls.fasta_reader = fasta.RefFastaReader(
        test_utils.genomics_core_testdata('test.fasta'))

    chunks = []
    for contig in cls.fasta_reader.header.contigs:
      whole_contig = ranges.make_range(contig.name, 0, contig.n_bases)
      bases = cls.fasta_reader.query(whole_contig)
      # Each chunk is (contig name, start offset, bases).
      chunks.append((contig.name, 0, bases))
    cls.in_mem = fasta.InMemoryRefReader(chunks)
コード例 #38
0
ファイル: variant_utils.py プロジェクト: kong75/deepvariant
def variant_range(variant):
  """Builds a Range spanning the whole of variant.

  Args:
    variant: third_party.nucleus.protos.Variant.

  Returns:
    A new Range whose reference_name, start, and end are copied from variant.
  """
  contig = variant.reference_name
  begin, stop = variant.start, variant.end
  return ranges.make_range(contig, begin, stop)
コード例 #39
0
ファイル: fasta_test.py プロジェクト: kong75/deepvariant
  def test_non_zero_start_query(self):
    """Checks all of the ways we can construct an InMemoryRefReader."""
    bases = 'ACGTAACCGGTT'
    n_bases = len(bases)
    for start in range(n_bases):
      # Only the tail of the sequence, from `start` on, is given to the reader.
      reader = fasta.InMemoryRefReader([('1', start, bases[start:])])
      contig = reader.header.contigs[0]
      self.assertEqual(contig.name, '1')
      self.assertEqual(contig.n_bases, n_bases)

      # Queries starting at the reader's start position should return the
      # matching slice of the full sequence.
      for end in range(start, n_bases):
        region = ranges.make_range('1', start, end)
        self.assertEqual(reader.query(region), bases[start:end])
コード例 #40
0
ファイル: utils.py プロジェクト: kong75/deepvariant
def read_range(read):
  """Builds the Range covered by the alignment of a read.

  Args:
    read: the read whose aligned range is wanted.

  Returns:
    A third_party.nucleus.protos.Range that starts at the read's alignment
    position and extends by the alignment length of its cigar.
  """
  position = read.alignment.position
  start = position.position
  aligned_length = cigar.alignment_length(read.alignment.cigar)
  return ranges.make_range(position.reference_name, start,
                           start + aligned_length)
コード例 #41
0
ファイル: utils_test.py プロジェクト: kong75/deepvariant
 def test_read_range(self):
   """Tests reads have their ranges calculated correctly."""
   start = 10000001
   # (cigar, expected reference span). Per the expectations, the 1bp insertion
   # adds nothing to the span while the 16bp deletion adds 16.
   cases = (
       ('2M1I3M', 5),
       ('2M16D3M', 5 + 16),
   )
   for cigar_str, span in cases:
     read = test_utils.make_read(
         'AAACAG',
         chrom='chrX',
         start=start,
         cigar=cigar_str,
         quals=range(10, 16),
         name='read1')
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(
         ranges.make_range('chrX', start, start + span),
         utils.read_range(read))
コード例 #42
0
ファイル: variant_utils.py プロジェクト: kong75/deepvariant
def variant_position(variant):
  """Builds a one-base Range at the start position of variant.

  Args:
    variant: third_party.nucleus.protos.Variant.

  Returns:
    A new Range on variant's reference_name that begins at variant.start and
    ends at variant.start + 1, i.e. the single basepair where the variant
    starts (hence the name position).
  """
  first_base = variant.start
  return ranges.make_range(variant.reference_name, first_base, first_base + 1)
コード例 #43
0
 def test_wrap(self):
   """Checks a wrapped AlleleCounter reports one count per region base."""
   ref = fasta.RefFastaReader(testdata.CHR20_FASTA)
   sam_reader = sam.SamReader(testdata.CHR20_BAM)
   size = 100
   region_start = 10000000
   region = ranges.make_range('chr20', region_start, region_start + size)
   options = deepvariant_pb2.AlleleCounterOptions(partition_size=size)
   allele_counter = _allelecounter.AlleleCounter(ref.get_c_reader(), region,
                                                 options)

   # The region must contain reads for the check below to be meaningful.
   reads = list(sam_reader.query(region))
   self.assertGreater(len(reads), 0)
   for read in reads:
     allele_counter.add(read)

   self.assertEqual(len(allele_counter.counts()), size)
コード例 #44
0
ファイル: pileup_image.py プロジェクト: kong75/deepvariant
  def get_reads(self, variant):
    """Gets the reads used to construct the pileup image around variant.

    The SAM reader is queried over the variant's span widened on both sides
    by the read_overlap_buffer_bp option.

    Args:
      variant: A third_party.nucleus.protos.Variant proto
        describing the variant we are creating the pileup image of.

    Returns:
      A list of third_party.nucleus.protos.Read protos.
    """
    buffer_bp = self._options.read_overlap_buffer_bp
    region = ranges.make_range(variant.reference_name,
                               variant.start - buffer_bp,
                               variant.end + buffer_bp)
    return list(self._sam_reader.query(region))
コード例 #45
0
  def test_wrap(self, fasta_filename):
    """Checks the wrapped GenomeReferenceFai against known testdata values.

    Args:
      fasta_filename: FASTA file name resolved through
        test_utils.genomics_core_testdata(); its index is expected at the
        same name plus '.fai'.
    """
    chr_names = ['chrM', 'chr1', 'chr2']
    chr_lengths = [100, 76, 121]
    fasta = test_utils.genomics_core_testdata(fasta_filename)
    fai = test_utils.genomics_core_testdata(fasta_filename + '.fai')
    with reference_fai.GenomeReferenceFai.from_file(fasta, fai) as ref:
      # Whole-reference properties.
      self.assertEqual(ref.n_contigs, 3)
      self.assertIn(fasta, ref.fasta_path)
      self.assertIn('GenomeReference backed by htslib FAI index', str(ref))
      self.assertEqual(ref.contig_names, chr_names)
      self.assertEqual(ref.n_bp, sum(chr_lengths))

      # Base lookup and interval validation.
      in_bounds = ranges.make_range('chrM', 1, 10)
      self.assertEqual(ref.bases(in_bounds), 'ATCACAGGT')
      self.assertTrue(ref.is_valid_interval(in_bounds))
      out_of_bounds = ranges.make_range('chrM', 1, 100000)
      self.assertFalse(ref.is_valid_interval(out_of_bounds))

      # Per-contig metadata.
      self.assertEqual(len(ref.contigs), 3)
      self.assertEqual([c.name for c in ref.contigs], chr_names)
      self.assertEqual([c.n_bases for c in ref.contigs], chr_lengths)
      for contig in ref.contigs:
        name = contig.name
        self.assertEqual(ref.contig(name), contig)
        self.assertTrue(ref.has_contig(name))
        self.assertFalse(ref.has_contig(name + '.unknown'))
コード例 #46
0
  def test_make_labeler_ref(self, candidates, truths, expected_start,
                            expected_end, bufsize):
    """Checks the region make_labeler_ref() queries and what it returns.

    Args:
      candidates: candidate variants passed to make_labeler_ref().
      truths: truth variants passed to make_labeler_ref().
      expected_start: expected start of the queried region.
      expected_end: expected end of the queried region.
      bufsize: bufsize argument passed to make_labeler_ref().
    """
    expected_bases = 'A' * (expected_end - expected_start)

    labeler = _make_labeler()
    # Whatever region is queried, the reader hands back expected_bases.
    labeler._ref_reader.query.return_value = expected_bases

    labeler_ref = labeler.make_labeler_ref(candidates, truths, bufsize=bufsize)

    expected_region = ranges.make_range('20', expected_start, expected_end)
    labeler._ref_reader.query.assert_called_once_with(expected_region)
    self.assertEqual(labeler_ref.start, expected_start)
    self.assertEqual(labeler_ref.end, expected_end)
    actual_bases = labeler_ref.bases(expected_start, expected_end)
    self.assertEqual(actual_bases, expected_bases)
コード例 #47
0
ファイル: aligner_test.py プロジェクト: kong75/deepvariant
  def test_align_reads_simple(self, read_seq, expected_align_pos,
                              expected_cigar, comment):
    """Test Aligner.align_reads(). Simple tests.

    Targets consist of
      - original reference sequence.
      - a sequence with 'AA' insertion at position 14 and
      -                 'T' deletion at position 19.

    The read is aligned twice: once with an all-match cigar and once with
    hard clips around it, which are expected to be kept in the result.

    Args:
      read_seq: str, read sequence.
      expected_align_pos: int, expected aligned position
      expected_cigar: [(int, str)], expected cigar information.
      comment: str, test comment.
    """
    ref_seq = 'AAAAAAAAAAAAATGCATGGGGGATTTTTTTTTTT'
    region = ranges.make_range('ref', 10, 10 + len(ref_seq))
    align_reads = self.make_test_aligner(ref_seq, region)
    # NOTE: part of the original comment here was redacted. What remains says
    # that, for local alignment, there must be enough exact matches between
    # the reference and target for end-to-end alignment.
    targets = [ref_seq, 'AAAAAAAAAAAAATAAGCAGGGGGATTTTTTTTTTT']
    read_length = len(read_seq)

    def realign(cigar):
      """Aligns a read with the given input cigar; returns its alignment."""
      read = test_utils.make_read(
          read_seq,
          chrom='ref',
          start=0,
          cigar=cigar,
          quals=[64] * read_length,
          name='read')
      return align_reads.align_reads(targets, [read])[0].alignment

    alignment = realign([(read_length, 'M')])
    self.assertEqual(expected_align_pos, alignment.position.position, comment)
    self.assertEqual(
        _cigar.to_cigar_units(expected_cigar), list(alignment.cigar), comment)

    clipped_alignment = realign([(2, 'H'), (read_length, 'M'), (1, 'H')])
    expected_cigar_w_hard_clip = [(2, 'H')] + expected_cigar + [(1, 'H')]
    self.assertEqual(
        _cigar.to_cigar_units(expected_cigar_w_hard_clip),
        list(clipped_alignment.cigar), comment)
コード例 #48
0
def _make_labeler(truth_variants=None, confident_regions=None, **kwargs):
  """Builds a HaplotypeLabeler over in-memory truth variants for tests.

  Args:
    truth_variants: optional list of truth variants served through an
      InMemoryVcfReader. None or empty means no truth variants.
    confident_regions: optional confident regions for the labeler. When None,
      a single range [0, 1000000000) is used on the contig of the first truth
      variant, or on a placeholder contig named 'dummy' if there are no truth
      variants.
    **kwargs: extra keyword arguments forwarded to HaplotypeLabeler.

  Returns:
    A haplotype_labeler.HaplotypeLabeler whose reference reader is a
    MagicMock.
  """
  if confident_regions is None:
    # No regions supplied: fall back to one giant span on a suitable contig.
    if truth_variants:
      contig = truth_variants[0].reference_name
    else:
      contig = 'dummy'
    giant_span = ranges.make_range(contig, 0, 1000000000)
    confident_regions = ranges.RangeSet([giant_span])

  truth_reader = vcf.InMemoryVcfReader(truth_variants or [])
  return haplotype_labeler.HaplotypeLabeler(
      truth_vcf_reader=truth_reader,
      ref_reader=mock.MagicMock(),
      confident_regions=confident_regions,
      **kwargs)
コード例 #49
0
ファイル: pileup_image.py プロジェクト: kong75/deepvariant
  def get_reference_bases(self, variant):
    """Fetches the reference bases backing the pileup image for variant.

    The queried window begins self.half_width bases before variant.start and
    is self._options.width bases long.

    Args:
      variant: A third_party.nucleus.protos.Variant proto
        describing the variant we are creating the pileup image of.

    Returns:
      A string of reference bases, or None when the reference reader reports
      that the window is not a valid interval.
    """
    window_start = variant.start - self.half_width
    window_end = window_start + self._options.width
    region = ranges.make_range(variant.reference_name, window_start,
                               window_end)
    if not self._ref_reader.is_valid(region):
      return None
    return self._ref_reader.query(region)
コード例 #50
0
  def test_match_selects_variant_by_start(self):
    """Checks that _match() pairs a candidate with the truth at its start.

    The truth variant sharing the candidate's start must be returned even
    though its alleles differ from the candidate's and another overlapping
    truth variant has exactly the candidate's alleles.
    """
    truth_specs = (
        (20, ['CC', 'A'], [1, 1]),
        (21, ['AAA', 'A'], [0, 1]),
        (22, ['AA', 'A'], [1, 1]),
    )
    overlapping = [
        test_utils.make_variant(start=start, alleles=alleles, gt=gt)
        for start, alleles, gt in truth_specs
    ]
    candidate = test_utils.make_variant(start=21, alleles=['CC', 'A'])

    confident = ranges.RangeSet(
        [ranges.make_range(overlapping[0].reference_name, 0, 100)])
    labeler = self._make_labeler(overlapping, confident)

    is_confident, truth_variant = labeler._match(candidate)
    self.assertEqual(is_confident, True)
    # overlapping[1] is the truth variant starting at 21, like the candidate.
    self.assertEqual(truth_variant, overlapping[1])
コード例 #51
0
ファイル: aligner_test.py プロジェクト: kong75/deepvariant
 def test_sanity_check_readalignment(self, ref_name, ref_start, ref_end,
                                     read_chrom, read_start, read_len,
                                     read_cigar, exception_msg):
   """Checks Aligner.sanity_check_readalignment() on an all-'A' read.

   Args:
     ref_name: contig name of the aligner's reference region.
     ref_start: start of the reference region.
     ref_end: end of the reference region.
     read_chrom: contig name assigned to the read.
     read_start: start position assigned to the read.
     read_len: number of bases in the read.
     read_cigar: cigar assigned to the read.
     exception_msg: regex the ValueError must match, or a falsy value when
       the check is expected to pass without raising.
   """
   ref_seq = 'A' * (ref_end - ref_start)
   aligner_under_test = self.make_test_aligner(
       ref_seq, ranges.make_range(ref_name, ref_start, ref_end))
   read = test_utils.make_read(
       'A' * read_len,
       chrom=read_chrom,
       start=read_start,
       cigar=read_cigar,
       quals=[64] * read_len,
       name='read')

   if not exception_msg:
     # No error expected: the call simply has to return.
     aligner_under_test.sanity_check_readalignment(read)
     return

   with self.assertRaisesRegexp(ValueError, exception_msg):
     aligner_under_test.sanity_check_readalignment(read)
コード例 #52
0
  def test_label_variants(self,
                          candidate,
                          expected_confident,
                          expected_truth,
                          expected_genotype=None):
    """Checks _match() and label_variants() for a single candidate.

    Args:
      candidate: the candidate variant to label.
      expected_confident: expected confidence flag for the candidate.
      expected_truth: the truth variant _match() should return, or None.
      expected_genotype: expected genotype of the label. When None and
        expected_truth is given, the genotype of the truth variant's first
        call is used instead.
    """
    confident = ranges.RangeSet(
        [ranges.make_range(self.snp.reference_name, 10, 100)])
    labeler = self._make_labeler(self.variants, confident)

    # Low-level check: the matched truth variant and confidence flag.
    is_confident, truth_variant = labeler._match(candidate)
    self.assertEqual(expected_truth, truth_variant)
    self.assertEqual(is_confident, expected_confident)

    # High-level check: label_variants() yields exactly one matching label.
    if expected_genotype is None and expected_truth is not None:
      expected_genotype = tuple(expected_truth.calls[0].genotype)
    labels = list(labeler.label_variants([candidate]))
    self.assertEqual(len(labels), 1)
    label = labels[0]
    self.assertEqual(candidate, label.variant)
    self.assertEqual(expected_confident, label.is_confident)
    self.assertEqual(expected_genotype, label.genotype)
コード例 #53
0
def _create_record_from_template(template, start, end, fasta_reader):
  """Returns a copy of the template variant with the new start and end.

  Updates to the start position cause a different reference base to be set.

  Args:
    template: third_party.nucleus.protos.Variant. The template variant whose
      non-location and reference base information to use.
    start: int. The desired new start location.
    end: int. The desired new end location.
    fasta_reader: GenomeReferenceFai object. The reader used to determine the
      correct start base to use for the updated variant.

  Returns:
    An updated third_party.nucleus.protos.Variant with the proper start, end,
    and reference base set and all other fields inherited from the template.
  """
  retval = copy.deepcopy(template)
  retval.start = start
  retval.end = end
  if start != template.start:
    retval.reference_bases = fasta_reader.query(
        ranges.make_range(retval.reference_name, start, start + 1))
  return retval
コード例 #54
0
  def test_call_from_allele_counter(self):
    """Runs VariantCaller end-to-end over a 1000bp chr20 test region.

    All reads overlapping the region are fed to an AlleleCounter, candidates
    are called from it, and each one is checked to be a DeepVariantCall.
    """
    size = 1000
    counter_options = deepvariant_pb2.AlleleCounterOptions(partition_size=size)
    caller_options = deepvariant_pb2.VariantCallerOptions(
        min_count_snps=2,
        min_count_indels=2,
        min_fraction_snps=0.12,
        min_fraction_indels=0.12,
        sample_name='sample_name',
        p_error=0.001,
        max_gq=50,
        gq_resolution=1,
        ploidy=2)

    ref = fasta.RefFastaReader(testdata.CHR20_FASTA)
    sam_reader = sam.SamReader(testdata.CHR20_BAM)
    region = ranges.make_range('chr20', 10000000, 10000000 + size)
    allele_counter = _allelecounter.AlleleCounter(ref.get_c_reader(), region,
                                                  counter_options)
    caller = variant_calling.VariantCaller(caller_options)

    # Feed every read overlapping the region into the allele counter.
    reads = list(sam_reader.query(region))
    self.assertNotEmpty(reads)
    for read in reads:
      allele_counter.add(read)

    # Call candidates across the whole region; there must be at least one.
    candidates = caller.calls_from_allele_counter(allele_counter)
    self.assertNotEmpty(candidates)

    # Every candidate has to be a DeepVariantCall proto.
    for candidate in candidates:
      self.assertIsInstance(candidate, deepvariant_pb2.DeepVariantCall)
コード例 #55
0
ファイル: fasta_test.py プロジェクト: kong75/deepvariant
 def test_make_ref_reader_default(self, fasta_filename):
   """Checks a RefFastaReader built with default arguments answers a query."""
   fasta_path = test_utils.genomics_core_testdata(fasta_filename)
   with fasta.RefFastaReader(fasta_path) as reader:
     region = ranges.make_range('chrM', 1, 6)
     bases = reader.query(region)
     self.assertEqual(bases, 'ATCAC')
コード例 #56
0
 def bases(self, start, end):
   """Returns self.query() over [start, end) on the dummy chromosome."""
   region = ranges.make_range(self._DUMMY_CHROM_NAME, start, end)
   return self.query(region)
コード例 #57
0
ファイル: fasta_test.py プロジェクト: kong75/deepvariant
 def test_bad_query_with_start(self, start, end):
   """Checks that an invalid (start, end) query raises ValueError.

   The in-memory reader holds a single contig '1' whose bases 'ACGT' are
   registered with an offset of 10.
   """
   contigs = [('1', 10, 'ACGT')]
   reader = fasta.InMemoryRefReader(contigs)
   with self.assertRaises(ValueError):
     # The range is built inside the context on purpose, so a ValueError from
     # either make_range or query satisfies the check, as before.
     bad_region = ranges.make_range('1', start, end)
     reader.query(bad_region)
コード例 #58
0
ファイル: fasta_test.py プロジェクト: kong75/deepvariant
 def test_make_ref_reader_cache_specified(self, fasta_filename):
   """Checks a RefFastaReader built with cache_size=10 answers a query."""
   fasta_path = test_utils.genomics_core_testdata(fasta_filename)
   with fasta.RefFastaReader(fasta_path, cache_size=10) as reader:
     region = ranges.make_range('chrM', 1, 5)
     bases = reader.query(region)
     self.assertEqual(bases, 'ATCA')
コード例 #59
0
ファイル: aligner_test.py プロジェクト: kong75/deepvariant
 def make_test_aligner(self, ref_seq=None, region=None):
   """Builds an aligner.Aligner with fixed scoring options for tests.

   Args:
     ref_seq: reference sequence string. A falsy value selects 'AAAAAAA'.
     region: reference region. A falsy value selects a range on contig 'ref'
       starting at 10 and spanning len(ref_seq) bases.

   Returns:
     An aligner.Aligner built from the options, region and reference sequence.
   """
   config = realigner_pb2.RealignerOptions.AlignerOptions(
       match=1, mismatch=1, gap_open=2, gap_extend=1, k=3, error_rate=.02)
   if not ref_seq:
     ref_seq = 'AAAAAAA'
   if not region:
     region = ranges.make_range('ref', 10, 10 + len(ref_seq))
   return aligner.Aligner(config, region, ref_seq)